-
Notifications
You must be signed in to change notification settings - Fork 0
/
classify.py
59 lines (41 loc) · 1.79 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import joblib
import cPickle as pickle
import pandas as pd
def tweets(clf):
    """Classify every preprocessed tweet and write per-candidate totals to CSV.

    Loads the pickled ``(clinton, trump)`` pair from ``tweets_preprocessed``.
    Each record in either sequence is unpacked as a two-item pair whose second
    item is a sequence of string tokens; the tokens are joined with single
    spaces and handed to ``clf.predict``. For each candidate the sum of the
    predictions (column ``Positive``) and the number of predictions (column
    ``Total``) are written to ``twitter_results.csv``.

    Args:
        clf: Object exposing ``predict(list_of_str)``. The returned value must
            support ``.sum()`` and ``len()``.
            NOTE(review): presumably labels are 0/1 with 1 meaning positive,
            so that the sum is a count -- confirm against the trained model.
    """
    # Context manager guarantees the handle is closed even if unpickling fails.
    with open('tweets_preprocessed', 'rb') as handle:
        clinton, trump = pickle.load(handle)

    rows = []
    # Same processing for both candidates; order matches the index labels below.
    for records in (clinton, trump):
        predictions = clf.predict([' '.join(doc) for _, doc in records])
        rows.append([predictions.sum(), len(predictions)])

    df = pd.DataFrame(rows,
                      index=['Clinton', 'Trump'],
                      columns=['Positive', 'Total'])
    df.to_csv('twitter_results.csv')
def time_series(clf):
    """Build a smoothed daily prediction series per candidate and pickle it.

    Loads the pickled ``(clinton, trump)`` pair from ``tweets_preprocessed``.
    Each record is unpacked as ``(date, tokens)``; the tokens are joined with
    single spaces and passed to ``clf.predict``. For each candidate the
    predictions are indexed by date, averaged per calendar day, and then
    smoothed with a 14-day rolling mean (the first 13 days are therefore NaN).
    The two series are combined column-wise into a DataFrame with columns
    ``Clinton`` and ``Trump`` and saved to ``time_series.p``.

    Args:
        clf: Object exposing ``predict(list_of_str)`` returning one numeric
            prediction per input string.
            NOTE(review): the dates must yield a DatetimeIndex for the daily
            grouping to work -- assumes they are datetime-like; confirm
            against the preprocessing step.
    """
    # Context manager guarantees the handle is closed even if unpickling fails.
    with open('tweets_preprocessed', 'rb') as handle:
        clinton, trump = pickle.load(handle)

    smoothed = []
    # Same processing for both candidates; order matches the column labels below.
    for records in (clinton, trump):
        dates = [date for date, _ in records]
        predictions = clf.predict([' '.join(doc) for _, doc in records])
        series = pd.Series(predictions, index=dates)
        # pd.TimeGrouper and pd.rolling_mean were removed from pandas;
        # pd.Grouper and Series.rolling are the equivalent supported APIs.
        daily = series.groupby(pd.Grouper(freq='D')).mean()
        smoothed.append(daily.rolling(14).mean())

    df = pd.concat(smoothed, axis=1)
    df.columns = ['Clinton', 'Trump']
    df.to_pickle('time_series.p')
def debates(clf):
    """Classify each debate transcript and write per-transcript totals to CSV.

    Loads a sequence of transcripts from the pickle ``debates_preprocessed``
    and pairs them, in order, with the six labels in ``names``. Each
    transcript is a sequence of documents (token sequences); documents equal
    to ``[]`` are dropped, the rest are joined with single spaces and passed
    to ``clf.predict``. For every transcript the sum of the predictions, the
    number of predictions, and their ratio are written to
    ``debate_results.csv`` under the columns ``positive``, ``total`` and
    ``percent``.

    Args:
        clf: Object exposing ``predict(list_of_str)``. The returned value must
            support ``.sum()`` and ``len()``.
            NOTE(review): presumably the pickle holds exactly six transcripts
            in the order of ``names`` -- confirm against the preprocessing
            step; fewer than six makes the DataFrame construction fail.
    """
    names = ('clinton1', 'clinton2', 'clinton3', 'trump1', 'trump2', 'trump3')

    # Context manager guarantees the handle is closed even if unpickling fails.
    with open('debates_preprocessed', 'rb') as handle:
        transcripts = pickle.load(handle)

    rows = []
    # zip() stops at the shorter input, so transcripts beyond the named six
    # are ignored; the label itself is only needed for the index below.
    for _, docs in zip(names, transcripts):
        non_empty = [doc for doc in docs if doc != []]
        predictions = clf.predict([' '.join(doc) for doc in non_empty])
        positive = predictions.sum()
        total = len(predictions)
        # float() keeps this a true division under Python 2 as well.
        rows.append((positive, total, float(positive) / total))

    df = pd.DataFrame(rows, index=names,
                      columns=('positive', 'total', 'percent'))
    df.to_csv('debate_results.csv')
if __name__ == '__main__':
    # Script entry point: load the persisted classifier once and run the
    # currently selected analysis.
    model = joblib.load('clf/clf.p')
    time_series(model)
    # The other analyses are disabled; uncomment to run them.
    # tweets(model)
    # debates(model)