-
Notifications
You must be signed in to change notification settings - Fork 1
/
WordExtraction.py
53 lines (40 loc) · 1.14 KB
/
WordExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import fasttext
import fasttext.util
import numpy as np
import scipy.io as scio
import re
# Compute fastText word vectors for the words of one stimulus text.
#
# A pretrained English model can be fetched with
# fasttext.util.download_model('en', if_exists='ignore'); this script instead
# loads a local copy from a fixed path.

# Stimulus identifier: selects the input file stimuli\<name>.txt.
name = "A2"

# Raw string so that the Windows backslashes are taken literally; the
# previous non-raw literal relied on invalid escape sequences (\P, \D, \E),
# which raise a SyntaxWarning on recent Python versions.
ft_en = fasttext.load_model(r'D:\Project\Data\fasttext\English_300.bin')

# Read the stimulus text and split every line on single spaces. Tokens are
# kept raw here (punctuation and line endings are still attached).
wordlist = []
# The context manager closes the file even if reading or decoding fails.
with open("stimuli\\" + name + ".txt", "r", encoding='gbk') as file:
    for line in file:
        # extend() appends in place instead of rebuilding the list per line.
        wordlist.extend(line.split(' '))
#
# sentence="Once upon a time, five little peas"
#
# wordlist=sentence.split(' ')
# Build WordVec: one row per non-empty cleaned token, in text order.

# Punctuation and line-ending characters stripped from each token before the
# lookup. Raw string so that \. and \? reach the regex engine without relying
# on invalid string escapes; compiled once because it is reused per token.
punctuation_re = re.compile(r',|\.|\?|!|;|“|”|"|…|\r|\n')

vectors = []
for word in wordlist:
    word = punctuation_re.sub('', word)
    if word == '':
        # Token was only punctuation/whitespace: it contributes no row.
        continue
    print(word)
    vectors.append(ft_en.get_word_vector(word))

if vectors:
    # Stack once at the end. This avoids the previous "first iteration"
    # special case, which failed with a shape mismatch whenever the very
    # first token was empty after cleaning, and the per-word re-copy of the
    # whole matrix.
    WordVec = np.stack(vectors, axis=0)
else:
    # No usable tokens: produce an empty (0, dim) matrix instead of an
    # uninitialized placeholder.
    WordVec = np.empty(shape=(0, ft_en.get_dimension()), dtype=np.float32)
print(WordVec.shape)
# Save the word-vector matrix and the token list to a MATLAB .mat file named
# after the stimulus. Backslashes are escaped explicitly (a raw string cannot
# end in a backslash); the previous literal relied on invalid escape
# sequences (\P, \D, \s).
mat_path = 'D:\\Project\\Data\\stimuli_wordvec\\' + name + '.mat'
# NOTE(review): 'wordlist' holds the raw tokens (punctuation attached, empty
# tokens included), whereas WordVec has rows only for tokens that are
# non-empty after cleaning, so the two are not guaranteed to be row-aligned.
# Confirm what the downstream consumer expects before changing it.
scio.savemat(mat_path, {'WordVec': WordVec, 'wordlist': wordlist})