-
Notifications
You must be signed in to change notification settings - Fork 0
/
Naive Bayes Classifier.py
137 lines (108 loc) · 3.58 KB
/
Naive Bayes Classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# coding: utf-8
# In[2]:
# Enable inline matplotlib figures when running inside IPython/Jupyter.
# `get_ipython` is only defined in an IPython session, so the call is guarded
# to keep this exported script runnable with a plain Python interpreter too.
try:
    # `run_line_magic` is the supported replacement for the deprecated `magic`.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# Load the census-income data, encode it numerically and split it into
# training and test sets (X_train, X_test, y_train, y_test).
names = ("age, workclass, fnlwgt, education, education-num, "
         "marital-status, occupation, relationship, race, sex, "
         "capital-gain, capital-loss, hours-per-week, "
         "native-country, income").split(', ')
data = pd.read_csv('d:/data/adult.data', names=names)
# Drop rows that pandas parsed as missing (NaN).
# NOTE(review): this dataset appears to mark missing values with ' ?', which
# dropna() does not treat as missing - confirm whether those rows should go.
data = data.dropna()
target_names = data['income'].unique()
target = data['income']
features_data = data.drop('income', axis=1)
# Columns whose dtype is integer or floating point are the numeric features.
numeric_features = [c for c in features_data
                    if features_data[c].dtype.kind in ('i', 'f')]
numeric_data = features_data[numeric_features]
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
categorical_data = features_data.drop(numeric_features, axis=1)
# pd.factorize maps each category of a column to an integer code.
categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0])
features = pd.concat([numeric_data, categorical_data_encoded], axis=1)
X = features.values.astype(np.float32)  # feature matrix as float32
# Income ' >50K' is labelled 1, everything else 0 (the leading space in the
# literal matches how the value is compared against the raw column).
y = (target.values == ' >50K').astype(np.int32)
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
# test_size is the fraction of samples held out for the TEST set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
# In[3]:
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score
# Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# Count correct and incorrect predictions on the test set with one vectorised
# comparison instead of a hand-maintained index counter.
right = int(np.count_nonzero(y_pred == y_test))
wrong = int(len(y_pred) - right)
print(right + wrong)            # number of test samples
print(right/(right + wrong))    # accuracy
print(wrong/(right + wrong))    # error rate
# In[8]:
# print(y_pred)
# Notebook display cell: evaluates the shape of the test-label array. As a
# bare expression it prints nothing when the file is run as a plain script.
y_test.shape
# In[4]:
# Multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB
# The name `gnb` is reused from the Gaussian cell; here it holds a
# MultinomialNB model.
gnb = MultinomialNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# Count correct and incorrect predictions on the test set with one vectorised
# comparison instead of a hand-maintained index counter.
right = int(np.count_nonzero(y_pred == y_test))
wrong = int(len(y_pred) - right)
print(right + wrong)            # number of test samples
print(right/(right + wrong))    # accuracy
print(wrong/(right + wrong))    # error rate
# In[13]:
# Bernoulli naive Bayes
from sklearn.naive_bayes import BernoulliNB
# The name `gnb` is reused from the Gaussian cell; here it holds a
# BernoulliNB model.
gnb = BernoulliNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# Count correct and incorrect predictions on the test set with one vectorised
# comparison instead of a hand-maintained index counter.
right = int(np.count_nonzero(y_pred == y_test))
wrong = int(len(y_pred) - right)
print(right + wrong)            # number of test samples
print(right/(right + wrong))    # accuracy
print(wrong/(right + wrong))    # error rate
# In[14]:
# Notebook display cell: evaluates the shape of the training feature matrix.
# As a bare expression it prints nothing when run as a plain script.
X_train.shape
# In[5]:
# Complement naive Bayes (ComplementNB requires scikit-learn >= 0.20)
from sklearn.naive_bayes import ComplementNB
# The name `gnb` is reused from the Gaussian cell; here it holds a
# ComplementNB model.
gnb = ComplementNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# Count correct and incorrect predictions on the test set with one vectorised
# comparison instead of a hand-maintained index counter.
right = int(np.count_nonzero(y_pred == y_test))
wrong = int(len(y_pred) - right)
print(right + wrong)            # number of test samples
print(right/(right + wrong))    # accuracy
print(wrong/(right + wrong))    # error rate