forked from tshrinivasan/tamil-wikipedia-word-list
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_wordlist.py
52 lines (41 loc) · 1.43 KB
/
create_wordlist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re
# Source file scanned for words (presumably an Odia Wikipedia XML dump,
# going by the file name -- confirm against how the script is run).
inputfile = 'orwiki.xml'

# Open both files as UTF-8 explicitly: the text is Odia, and relying on the
# platform default encoding fails or corrupts data on non-UTF-8 locales.
# NOTE: `input` shadows the builtin of the same name; it is kept because the
# rest of the script reads from and closes the file through this name.
input = open(inputfile, 'r', encoding='utf-8')
# Raw (non-deduplicated) word list, one word per line.
out = open('odia-words.txt', 'w', encoding='utf-8')
def is_odia(word):
    """Return True if *word* starts with a character from the Odia block.

    Only the first character is inspected; the remaining characters are
    not checked.

    Args:
        word: Text to classify. May be empty.

    Returns:
        True when the first code point lies in 2816..2943 inclusive
        (U+0B00..U+0B7F), otherwise False. An empty string yields False.
    """
    if not word:
        # ord() requires exactly one character, so an empty word can't match.
        return False
    return 2816 <= ord(word[0]) <= 2943
# Patterns and tables are built once here instead of once per word.
# Optional whitespace followed by a run of ASCII letters ending at a word
# boundary.
_latin_run = re.compile(r'\s*[A-Za-z]+\b')
# Punctuation characters removed from every token.
_punctuation = re.compile(r'[?|$|।|`|~|@|#|^|&|*|(|)|+|=|{|}|<|,|>|/|\|.|?|!|:|;]')
# Pipe, square brackets and straight/curly quotes, deleted in a single pass.
_quote_chars = str.maketrans('', '', '|"[]\'‘’“”')


def _clean_token(token):
    """Return *token* with Latin letter runs, punctuation and quotes removed.

    A double hyphen is collapsed to a single hyphen and trailing whitespace
    (including the line's newline) is stripped.
    """
    cleaned = _latin_run.sub('', token)
    cleaned = _punctuation.sub('', cleaned)
    cleaned = cleaned.translate(_quote_chars)
    cleaned = cleaned.replace("--", '-')
    return cleaned.rstrip()


try:
    # Iterate the file lazily; readlines() would hold the whole input in
    # memory at once.
    for line in input:
        for word in line.split(" "):
            # Single characters and empty fragments are never words.
            if len(word) <= 1:
                continue
            output = _clean_token(word)
            # Keep only multi-character tokens that start with an Odia letter.
            if len(output) > 1 and is_odia(output):
                print (output)
                out.write(output+'\n')
finally:
    # Close both files even if processing fails part-way through.
    input.close()
    out.close()
# Collapse the raw word list into its distinct entries.
# The file is read inside a `with` block so the handle is closed promptly,
# and as UTF-8 explicitly because the words are Odia text.
with open('odia-words.txt', encoding='utf-8') as raw_words:
    odia_words_set = {line.strip() for line in raw_words}

# Write one unique word per line. The words are sorted so repeated runs
# produce the same file; plain set iteration order is not stable between runs.
with open('unique_odia_words.txt', 'w', encoding='utf-8') as uniq_words:
    uniq_words.writelines(word + "\n" for word in sorted(odia_words_set))