-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokseg.py
78 lines (55 loc) · 2.24 KB
/
tokseg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# English word and sentence tokenizer
import syntok.segmenter as segmenter
from syntok.tokenizer import Tokenizer
# Indic Languages
from indicnlp.tokenize import sentence_tokenize
from indicnlp.tokenize import indic_tokenize
# Japanese tokenizer
from fugashi import Tagger
# Chinese tokenizer
from chinese import ChineseAnalyzer
class Split:
    """Language-aware word tokenization and sentence segmentation.

    Dispatch is keyed on an ISO 639-1 language code:
      * ``ja``                    -> fugashi (MeCab wrapper), wakati mode
      * ``zh``                    -> ``chinese.ChineseAnalyzer``
      * ``es``/``en``/``ru``/``id`` -> syntok tokenizer / segmenter
      * anything else             -> Indic NLP Library (assumes *lang* is an
                                     Indic-language code supported by
                                     indicnlp — TODO confirm with callers)
    """

    # Languages routed to syntok for both tokenization and segmentation.
    # (The original handled 'id' in tokenize() but not in segment();
    # unified here for consistency.)
    _SYNTOK_LANGS = ('es', 'en', 'ru', 'id')

    def tokenize(self, text, lang):
        """Return *text* split into word tokens for language *lang*.

        :param text: input string to tokenize.
        :param lang: ISO 639-1 language code selecting the backend.
        :returns: list of token strings.
        """
        if lang == 'ja':
            # '-Owakati' makes MeCab emit plain space-separated word nodes.
            tagger = Tagger('-Owakati')
            return [str(word) for word in tagger(text)]
        elif lang == 'zh':
            analyzer = ChineseAnalyzer()
            return analyzer.parse(text).tokens()
        elif lang in self._SYNTOK_LANGS:
            return [token.value for token in Tokenizer().tokenize(text)]
        else:
            # Fallback: Indic languages via the Indic NLP Library.
            return indic_tokenize.trivial_tokenize(text, lang)

    @staticmethod
    def _split_on_ideographic_stop(words):
        """Join *words* into sentences, splitting on the CJK full stop '。'.

        Shared by the Japanese and Chinese branches of :meth:`segment`
        (they were duplicated loops in the original).

        Bug fix: the original discarded any trailing text after the last
        '。' (and returned nothing for text containing no '。'); the
        trailing partial sentence is now kept.
        """
        sentences = []
        current = ''
        for word in words:
            token = str(word)
            if token == '。':
                sentences.append(current)
                current = ''
            else:
                current += token
        if current:  # keep a trailing sentence that has no final '。'
            sentences.append(current)
        return sentences

    def segment(self, text, lang):
        """Return *text* split into sentences for language *lang*.

        :param text: input string to segment.
        :param lang: ISO 639-1 language code selecting the backend.
        :returns: list of sentence strings.
        """
        if lang == 'ja':
            tagger = Tagger('-Owakati')
            return self._split_on_ideographic_stop(tagger(text))
        elif lang == 'zh':
            result = ChineseAnalyzer().parse(text)
            return self._split_on_ideographic_stop(result.tokens())
        elif lang in self._SYNTOK_LANGS:
            # [:-1] drops each sentence's final token — the terminal
            # punctuation emitted by syntok — so sentences come back
            # unpunctuated, matching the ja/zh branches above.
            return [' '.join([token.value for token in sentence][:-1])
                    for paragraph in segmenter.process(text)
                    for sentence in paragraph]
        else:
            return sentence_tokenize.sentence_split(text, lang)