-
Notifications
You must be signed in to change notification settings - Fork 0
/
naive_bayes.py
101 lines (86 loc) · 4.14 KB
/
naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import language
import math
import numpy as np
import string
class NaiveBayes:
    """Character-level Naive Bayes language classifier for tweets.

    Training fills two flat tables:

    * ``training_table_classes`` -- one prior P(language) per language, in
      the iteration order of ``language.Language``.
    * ``training_table_chars`` -- smoothed conditionals P(letter | language),
      stored language-major: the entry for language index ``c`` and letter
      index ``k`` lives at ``c * char_size + k``.

    ``test`` then writes one log10 score per (tweet, language) pair into
    ``scores``, tweet-major, six values per tweet.
    """

    # Immutable default so get_score() works even before test() has run.
    score = 0

    def __init__(self, v, d, tweets, corpus):
        """Create the classifier and train it immediately.

        The tables are created per instance; keeping them as class-level
        lists would make every instance share (and keep appending to) the
        same model.
        """
        self.training_table_chars = []
        self.training_table_classes = []
        self.scores = []
        self.score = 0
        self.train(v, d, tweets, corpus)

    @staticmethod
    def _log10(probability):
        """Return log10(probability), or -inf for a non-positive probability."""
        if probability > 0:
            return math.log10(probability)
        return -math.inf

    def class_probability(self, i, tweets, characters):
        """Return the prior P(i) = count(tweets labelled i) / count(all tweets).

        Args:
            i: language member; its ``.value`` is compared with each tweet's
                ``get_language()``.
            tweets: labelled tweets.
            characters: per-tweet character sequences, parallel to ``tweets``;
                only as many tweets as there are entries here are counted.

        Returns:
            The prior as a float, or 0.0 when ``tweets`` is empty.
        """
        total_docs = len(tweets)
        if total_docs == 0:
            # No documents at all: avoid dividing by zero.
            return 0.0
        wanted = i.value
        docs_in_class = sum(
            1
            for tweet, _ in zip(tweets, characters)
            if tweet.get_language() == wanted
        )
        return docs_in_class / total_docs

    def cond_probability(self, i, j, tweets, letters, characters, lang, d):
        """Return the smoothed conditional P(letters[j] | i).

        Computed as ``(count(letter, i) + d) / (chars_in_i + d * len(letters))``
        over the tweets labelled with language ``i``.

        Args:
            i: language member; its ``.value`` is compared with each tweet's
                ``get_language()``.
            j: index of the letter in ``letters``.
            tweets: labelled tweets.
            letters: the vocabulary.
            characters: per-tweet character sequences, parallel to ``tweets``.
            lang: unused; kept for interface compatibility.
            d: additive smoothing value (converted with ``float``).

        Returns:
            The probability as a float, or 0.0 when the denominator is zero
            (no characters seen for the language and no smoothing).
        """
        wanted = i.value
        letter = letters[j]
        smoothing = float(d)
        letter_count = 0
        total_chars = 0
        for tweet, chars in zip(tweets, characters):
            if tweet.get_language() == wanted:
                letter_count += chars.count(letter)
                total_chars += len(chars)
        denominator = total_chars + smoothing * len(letters)
        if denominator == 0:
            return 0.0
        return (letter_count + smoothing) / denominator

    def train(self, v, d, tweets, corpus):
        """Build the prior and conditional tables from ``tweets``/``corpus``.

        Any previously trained tables on this instance are discarded, so
        calling ``train`` again replaces the model instead of appending to it.

        Args:
            v: vocabulary type, used only in the progress message.
            d: additive smoothing value passed to ``cond_probability``.
            tweets: labelled training tweets.
            corpus: object providing ``determite_vocabulary()`` (a mapping
                with ``"char_size"`` and ``"letters"``) and ``get_characters()``.
        """
        lang = language.Language
        vocabulary = corpus.determite_vocabulary()
        char_size = vocabulary.get("char_size")
        letters = vocabulary.get("letters")
        characters = corpus.get_characters()
        print("Training the model with vocabulary type V = " + str(v) + " and " + str(len(tweets)) + " tweets...")
        self.training_table_classes = []
        self.training_table_chars = []
        for i in lang:  # For all classes i
            self.training_table_classes.append(self.class_probability(i, tweets, characters))
            for j in range(char_size):  # For all characters in vocabulary j
                self.training_table_chars.append(
                    self.cond_probability(i, j, tweets, letters, characters, lang, d))

    def test(self, v, tweets, corpus):
        """Score every tweet of ``corpus`` against every language.

        For tweet ``t`` and language index ``c`` the score is
        ``log10(prior[c]) + sum(log10(P(letter | c)))`` over the vocabulary
        letters that occur in the tweet. Scores are stored tweet-major in
        ``self.scores`` (replacing the scores of any earlier call), and the
        last computed score is kept in ``self.score``.

        Args:
            v: unused; kept for interface compatibility.
            tweets: unused. The gold labels must not influence the scores, so
                they are deliberately not consulted here.
            corpus: object providing ``determite_vocabulary()`` and
                ``get_characters()`` for the tweets to classify.
        """
        vocabulary = corpus.determite_vocabulary()
        char_size = vocabulary.get("char_size")
        letters = vocabulary.get("letters")
        characters = corpus.get_characters()
        print("Testing the model...")
        # NOTE(review): assumes language.Language iterates in this same order,
        # since train() fills the tables in enum order -- confirm.
        languages = ['eu', 'ca', 'gl', 'es', 'en', 'pt']
        score = 0
        self.scores = []
        for chars in characters:
            sentence = ''.join(chars)  # Putting back into strings
            # NOTE(review): each vocabulary letter contributes once if present,
            # regardless of how often it occurs -- confirm this is intended.
            present = [k for k in range(char_size) if letters[k] in sentence]
            for j in range(len(languages)):
                score = self._log10(self.training_table_classes[j])
                offset = j * char_size  # start of language j's slice of the table
                for k in present:
                    score = score + self._log10(self.training_table_chars[offset + k])
                self.scores.append(score)
        self.score = score

    def printScores(self, characters):
        """Print the per-language scores and the best language for each tweet.

        Args:
            characters: per-tweet character sequences; only its length is
                used, to reshape ``self.scores`` into one row per tweet.
        """
        languages = ['Basque', 'Catalan', 'Galican', 'Spanish', 'English', 'Portuguese']  # for better printing
        table = np.reshape(self.scores, (len(characters), len(languages)))
        for i, row in enumerate(table):
            top_index = int(np.argmax(row))  # computed once per tweet
            print("\n")
            print('The scores for tweet #' + str(i + 1) + ' are...')
            for name, value in zip(languages, row):
                print(name + ': ' + str(value) + ", ")
            print("The most likely language for this tweet is " + languages[top_index] + " with score: " + str(
                row[top_index]) + ".")

    def get_score(self):
        """Return the last score computed by ``test`` (0 before any test)."""
        return self.score

    def get_scores(self):
        """Return the list of scores produced by the most recent ``test``."""
        return self.scores

    def init_dict(self, score_tweets):
        """Group a flat score list into one dict per tweet.

        Each dict comes from ``language.to_dict(0)`` and has its keys filled,
        in iteration order, with consecutive values from ``score_tweets``.
        Groups start every 6 entries.

        Args:
            score_tweets: flat, tweet-major list of scores.

        Returns:
            A list with one filled dict per group of 6 scores.
        """
        score_array = []
        for start in range(0, len(score_tweets), 6):
            score_dict = language.to_dict(0)
            for offset, key in enumerate(score_dict):
                score_dict[key] = score_tweets[start + offset]
            score_array.append(score_dict)
        return score_array