-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_CTB_POS.py
57 lines (56 loc) · 1.75 KB
/
get_CTB_POS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# -*- coding: cp936 -*-
import os,re
from configure import ctb_seg_pos_path
# Scratch files exchanged with the external CTB tools: the sentence to
# analyse is written to in_file and the tool's result is read from out_file.
# Both paths are fixed, so every call reuses the same two files.
in_file=ctb_seg_pos_path+'in.txt'
out_file=ctb_seg_pos_path+'out.txt'
# Characters that replace_words() looks for in the query (ASCII punctuation
# and digits).
original =['.',',','!','?','(',')',':',';','/','%',\
'0','1','2','3','4','5','6','7','8','9']
# Replacement for each entry of `original`, paired position by position.
# NOTE(review): presumably these are the full-width (Chinese) forms of the
# characters above; confirm against the cp936-encoded file, since an editor
# or viewer may normalise them to their ASCII look-alikes.
character=['。',',','!','?','(',')',':',';','/','%',\
'0','1','2','3','4','5','6','7','8','9']
# The literals above are byte strings in the source encoding; decode them so
# the replacements are unicode objects.
character=[cha.decode('gbk') for cha in character]
# Inactive alternative based on maketrans, kept commented out:
# original ='.,!?():;/%0123456789'
# character='。,!?():;/%0123456789'.decode('gbk')
# table = maketrans(original,character)
# Lookup table: character to find -> replacement.
word_dict=dict(zip(original, character))
# One regex alternation over all (escaped) keys of word_dict, so a single
# sub() pass can replace every listed character.
ptranslate = re.compile('|'.join(map(re.escape, word_dict)))
def replace_words(text):
    """Return *text* with every character listed in ``word_dict`` replaced.

    Each match of the module-level ``ptranslate`` pattern is swapped for the
    value stored under the matched text in ``word_dict``; everything else is
    left untouched.
    """
    return ptranslate.sub(lambda hit: word_dict[hit.group(0)], text)
def get_CTB_POS(query):
    """Run the external ``ctbparser_pos`` tool on a sentence.

    The sentence is passed through ``replace_words``, written GBK-encoded to
    ``in_file``, and the tool is started with ``in_file`` and ``out_file`` as
    its two command-line arguments. Whatever is in ``out_file`` afterwards is
    read back and returned.

    query: unicode, already segmented text.
    Returns: unicode, the content of ``out_file`` decoded from GBK.
    """
    query = replace_words(query)
    with open(in_file, 'w') as sentence_file:
        sentence_file.write(query.encode('gbk'))

    cmd = ctb_seg_pos_path + 'ctbparser_pos %s %s' % (in_file, out_file)
    # Closing the pipe right away means nothing the command prints is read;
    # only the file it writes is used.
    # NOTE(review): the value returned by close() (the command's exit status)
    # is discarded, so a failed run is not detected and an out_file left by
    # an earlier call would be returned instead.
    os.popen(cmd).close()

    # Context manager so the handle is released even if read() raises.
    with open(out_file) as result_file:
        raw = result_file.read()
    return raw.decode('gbk')
def get_CTB_SEG_POS(query):
    """Run the external ``ctbparser_seg_pos`` tool on a sentence.

    The sentence is passed through ``replace_words``, written GBK-encoded to
    ``in_file``, and the tool is started with ``in_file`` and ``out_file`` as
    its two command-line arguments. Whatever is in ``out_file`` afterwards is
    read back and returned.

    query: unicode text (the original note describes it as segmented).
    Returns: unicode, the content of ``out_file`` decoded from GBK.
    """
    query = replace_words(query)
    with open(in_file, 'w') as sentence_file:
        sentence_file.write(query.encode('gbk'))

    cmd = ctb_seg_pos_path + 'ctbparser_seg_pos %s %s' % (in_file, out_file)
    # Closing the pipe right away means nothing the command prints is read;
    # only the file it writes is used.
    # NOTE(review): the value returned by close() (the command's exit status)
    # is discarded, so a failed run is not detected and an out_file left by
    # an earlier call would be returned instead.
    os.popen(cmd).close()

    # Context manager so the handle is released even if read() raises.
    with open(out_file) as result_file:
        raw = result_file.read()
    return raw.decode('gbk')
if __name__ == '__main__':
    # Manual smoke run: tag one sample sentence, then report completion.
    # The literal is a byte string in the source encoding, hence the decode.
    sample = '我是北京邮电大学学生.'.decode('gbk')
    get_CTB_POS(sample)
    print('done')