-
Notifications
You must be signed in to change notification settings - Fork 2
/
Lit model.R
109 lines (85 loc) · 3.36 KB
/
Lit model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Training data ----
# Read the raw CSV into `mydata`; every later step works from this data frame.
data_path <- "C:/Users/vipul/Documents/Documents/Masters/Course documents/Fall 2019 sem/Data Mining/Assignments/HW3/jantojun.csv"
mydata <- read.csv(data_path)

# View() opens a data viewer window, which only makes sense when a person is
# driving the session; skip it when the script is run non-interactively.
if (interactive()) {
  View(mydata)
}
str(mydata)

# Total.loss has to be numeric for the histograms and log transforms later on.
# If read.csv() delivered it as a factor, as.numeric() on its own would return
# the internal level codes rather than the recorded values, so convert through
# character in that case. Numeric and character input behave as before.
total_loss <- mydata$Total.loss
if (is.factor(total_loss)) {
  total_loss <- as.character(total_loss)
}
mydata$Total.loss <- as.numeric(total_loss)

# Treat these columns as categorical.
for (factor_col in c("District", "Alarm.Date", "Zip", "Property.Use", "Fire")) {
  mydata[[factor_col]] <- as.factor(mydata[[factor_col]])
}

# Show the structure again to confirm the conversions.
str(mydata)
# Fire: 1 = high risk, 0 = low risk
# Reproducible 60/40 split: draw 60% of the row names for training and keep
# every remaining row for validation.
set.seed(123)
all.rows <- row.names(mydata)
train.index <- sample(all.rows, 0.6 * nrow(mydata))
valid.index <- setdiff(all.rows, train.index)

train <- mydata[train.index, ]
validate <- mydata[valid.index, ]
summary(train)
# Plots to visualize the distribution of the total losses.
# Columns are referenced through `train` explicitly: nothing has been
# attach()ed at this point in the script, so bare names such as Total.loss
# would not be found in a fresh session.
hist(train$Total.loss)
hist(log(train$Total.loss))

# Panel of plots relating each predictor to the log of total losses
# (run all four plots together so they share the 2 x 2 layout).
# TODO: include percentage of false to true alarms.
# TODO: redo these with ggplot.
par(mfrow = c(2, 2))
log.loss <- log(train$Total.loss)
loss.label <- "logarithm of Total losses"

plot(train$Property.Use, log.loss,
     main = "Total Losses vs Property use",
     xlab = "Property use", ylab = loss.label)
plot(train$District, log.loss,
     main = "Total Losses vs District",
     xlab = "District", ylab = loss.label)
plot(train$Zip, log.loss,
     main = "Total Losses vs Zip code",
     xlab = "Zip code", ylab = loss.label)
# Incident.Type is not converted to a factor when the data is loaded, so coerce
# it here; that way this panel is drawn the same way as the three factor
# predictors above regardless of how read.csv() typed the column.
plot(as.factor(train$Incident.Type), log.loss,
     main = "Total Losses vs Incident type",
     xlab = "Incident Type", ylab = loss.label)
# Pivot table of incident counts: one row per Zip, one column per Fire level.
# Built from the full data set (`mydata`), not just the training split.
par(mfrow = c(1, 1))
library(pivottabler)

pvt <- PivotTable$new()
pvt$addData(dataFrame = mydata)
pvt$addColumnDataGroups(variableName = "Fire")
pvt$addRowDataGroups(variableName = "Zip")
# Each cell is the number of rows in that Zip / Fire combination.
pvt$defineCalculation(
  calculationName = "Fire",
  summariseExpression = "n()"
)
pvt$renderPivot()
# Recode the predictor columns as plain numbers in both splits so they can be
# fed to cor() and the plots that follow. For a factor column as.numeric()
# yields the level codes.
# NOTE(review): Incident.Type is not made a factor when the data is loaded; if
# it arrives as character this conversion produces NA — confirm its type.
code.columns <- c("Incident.Type", "Alarm.Date", "District", "Zip",
                  "Property.Use")
for (code.column in code.columns) {
  train[[code.column]] <- as.numeric(train[[code.column]])
  validate[[code.column]] <- as.numeric(validate[[code.column]])
}
# Correlations between the first six columns of the training set.
library(corrplot)
par(mfrow = c(1, 1))
predictor.cols <- seq_len(6)
correlations <- cor(train[, predictor.cols])
corrplot(correlations, method = "circle")

# Scatterplot matrix of every training column, coloured by Fire.
par(mfrow = c(1, 6))
pairs(train, col = train$Fire)

# Per-predictor density plots split by the outcome.
# Column 7 is used as the outcome — presumably Fire; confirm against the CSV
# column order.
library(caret)
y <- train[, 7]
x <- train[, predictor.cols]
free.axis <- list(relation = "free")
scales <- list(x = free.axis, y = free.axis)
featurePlot(x = x, y = y, plot = "density", scales = scales)
## Logistic regression ----
# Binomial GLM for Fire using property use, incident type, district and zip
# code as predictors, fitted on the training split.
logmodel <- glm(
  Fire ~ Property.Use + Incident.Type + District + Zip,
  data = train,
  family = binomial
)
summary(logmodel)

# Diagnostic plots for the fitted model, laid out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(logmodel)

# Evaluate performance on the training data: fitted probabilities, a peek at
# the first five, and their average.
glm.probs <- predict(logmodel, type = "response")
glm.probs[1:5]
mean(glm.probs[1:5])

# Label an observation "True" when its fitted probability exceeds the cutoff.
risk.cutoff <- 0.64
glm.pred <- ifelse(glm.probs > risk.cutoff, "True", "False")

# Confusion table of predicted label against the observed Fire value. The
# column is taken from `train` directly instead of attach()ing the data frame,
# which would put every column on the search path and risk masking later names.
table(glm.pred, Fire = train$Fire)
# Performance on the validation split: ROC curve and AUC ----
library(ROCR)
library(rpart)
par(mfrow = c(1, 1))

# Score the validation rows. predict() on a glm returns the linear predictor
# by default; that is kept here because the ROC curve and AUC depend only on
# how the scores rank the observations. (The former `OOB = TRUE` argument was
# dropped: predict() for a glm has no such parameter and ignored it.)
valid.scores <- predict(logmodel, newdata = validate)
pred <- prediction(as.numeric(valid.scores), as.numeric(validate$Fire))

# True positive rate against false positive rate, i.e. the ROC curve, with the
# line coloured by score cutoff.
perf <- performance(pred, "tpr", "fpr")
plot(perf, main = "ROC curve", colorize = TRUE)

# AUC: the value is stored in the y.values slot of the performance object.
auc.perf <- performance(pred, measure = "auc")
auc <- auc.perf@y.values[[1]]
auc