-
Notifications
You must be signed in to change notification settings - Fork 0
/
RF_AttributeSelection_T1.r
99 lines (77 loc) · 2.43 KB
/
RF_AttributeSelection_T1.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
### MEAN DECREASE IN GINI ###
#data import
mdata <- read.csv("/nfs/cms/mtech18/salam.saudagar/Desktop/ML/data.csv")
str(mdata)
mdata <- mdata[,-c(1,33)]
#data partition
shuff=mdata[sample(1:nrow(mdata)),]
train=shuff[(1:456),]
test=shuff[c(457:570),]
# random forest
library(randomForest)
library(caret)
set.seed(1234)
rf <- randomForest(diagnosis~. , data = train,ntree=400,importance = TRUE,proximity = TRUE)
pd <- predict(rf,test)
cm=confusionMatrix(pd,test$diagnosis)
#sorting attributes according their importance
im=importance(rf)
sr=sort(im[,3],decreasing = T)
nn=seq(2,30,by=2)
#finding important attributes and eliminating less important attributes
acc1=NULL
for(n in nn)
{
var.predict<-paste(names(sr)[1:n],collapse="+")
rnft.formula <- as.formula(paste(names(mdata)[1], var.predict, sep = " ~ "))
rnft <- randomForest(rnft.formula ,data = train, ntree=400,importance = TRUE, proximity = TRUE)
pred2 <- predict(rnft,test)
cm2 <- confusionMatrix(pred2,test$diagnosis)
acc=as.numeric(cm2$overall['Accuracy'])
acc1=c(acc1,acc)
}
mtr=seq(2,30,by=2)
mx.ac=which.max(acc1)
bst.mtr=mtr[mx.ac] # best number of attributes
bst=seq(2,bst.mtr,by=2)
# Applying CV fold on best attributes
CV<-cut(1:nrow(mdata),breaks=5, labels=FALSE)
ntre=seq(50,400,by=50)
c4=NULL
c5=NULL
ntr=NULL
for(i in bst)
{
c3=NULL
for(j in ntre)
{
c1=NULL
for( m in 1:5)
{
test1<- shuff[which(m==CV,arr.ind=T),]
train1<- shuff[-(which(m==CV,arr.ind=T)),]
var.predict1<-paste(names(sr)[1:length(bst)],collapse="+")
rnft.formula1 <- as.formula(paste(names(mydata)[1], var.predict1, sep = " ~ "))
rf1 <- randomForest(rnft.formula1 ,data = train1,ntree=j,mtry=i, importance = TRUE, proximity = TRUE)
pred1 <- predict(rf1,test1)
cm1 <- confusionMatrix(pred1,test1$diagnosis)
over1=as.numeric(cm1$overall['Accuracy'])
c1=c(c1,over1)
}
c2=mean(c1)
c3=c(c3,c2)
}
c4=c(c4,max(c3))
ntr=c(ntr,ntre[which.max(c3)])
}
zz=data.frame(bst,c4,ntr)
zx=which.max(zz[,2])
MTRY=zz[zx,1]
ACCURACY=zz[zx,2]
NTREE=zz[zx,3]
print("Best accuracy and its respective Mtry and Ntree are:")
print(data.frame(MTRY,ACCURACY,NTREE))
print("No of important attributes are :")
print(bst.mtr)
print("Important attributes are :")
print(data.frame(names(sr)[1:bst.mtr]))