#!/usr/bin/env python3
import glob
import os
import re
import string

import pandas as pd
from sklearn.utils import shuffle
from tqdm import tqdm
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

tqdm.pandas()

sp = SpellCorrector(corpus="twitter")
text_processor = TextPreProcessor(
    # terms that will be normalized (see the docstring of normalise() below)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'time', 'date',
               'number'],
    fix_html=True,  # fix HTML tokens
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="english",
    unpack_hashtags=False,  # do NOT segment hashtags into words
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=False).tokenize,
    # list of dictionaries for replacing tokens extracted from the text
    # with other expressions; here, emoticons are replaced by their meaning
    dicts=[emoticons]
)
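
# A rough sketch of what the configured processor does to a document. The
# output tokens are illustrative only; exact results depend on the ekphrasis
# version and its corpus statistics:
#   text_processor.pre_process_doc("Check out https://example.com, only $5!!")
#   -> ['Check', 'out', '<url>', ',', 'only', '<money>', '!', '!']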

def datareader(data_dir, filetype="txt"):
    '''
    Read the files in data_dir into a single pandas DataFrame, one line per
    row. Files with "neg" in their name are labelled 0, all others 1.
    '''
    frames = []
    if not data_dir.endswith("/"):
        data_dir = data_dir + "/"
    for file in glob.glob(data_dir + "*." + filetype):
        data = []
        sep_f = pd.DataFrame()
        with open(file, "r") as f:
            for line in f:
                data.append(line.strip())
        sep_f["words"] = data
        sep_f["categories"] = 0 if "neg" in os.path.basename(file) else 1
        frames.append(sep_f)
    return pd.concat(frames)
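
# Illustrative usage, assuming a layout like data/sentiment_reviews/ with
# files such as train_pos.txt and train_neg.txt (the file names here are
# hypothetical; one review per line, any filename containing "neg" -> label 0):
#   df = datareader("data/sentiment_reviews/")
#   df.columns  # -> ['words', 'categories']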

def lower(sentence):
    '''
    Lowercase the sentence. (Note: could be folded into the normalisation step.)
    '''
    return sentence.lower()

def remove_dots(sentence):
    '''
    Strip all punctuation, in particular repeated '.' (dot) characters, since
    these would otherwise be treated as separate lines by our HAN model.
    '''
    return sentence.translate(str.maketrans('', '', string.punctuation))
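
# Example behaviour (string.punctuation covers all ASCII punctuation):
#   remove_dots("Great... really great!")  # -> "Great really great"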

def spell_correct(sentence):
    '''
    Correct spelling mistakes
    '''
    return " ".join(sp.correct(word) for word in sentence.split())

def decontract(sentence):
    '''
    Unpack contractions (e.g. can't -> can not)
    '''
    # specific
    sentence = re.sub(r"won\'t", "will not", sentence)
    sentence = re.sub(r"can\'t", "can not", sentence)  # instead of "cannot"
    sentence = re.sub(r"shan\'t", "shall not", sentence)
    sentence = re.sub(r"\b(i|I) ain\'t", "i am not", sentence)
    sentence = re.sub(r"\b(s|S)he ain\'t", "she is not", sentence)
    sentence = re.sub(r"\b(h|H)e ain\'t", "he is not", sentence)
    sentence = re.sub(r"\b(w|W)e ain\'t", "we are not", sentence)
    # general
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)  # also rewrites possessives
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence
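
# Example behaviour:
#   decontract("I won't go, she ain't here")  # -> "I will not go, she is not here"
#   decontract("they're fine")                # -> "they are fine"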

def normalise(sentence):
    '''
    :: Normalise web addresses, e-mail addresses, percentages, currency
       amounts, phone numbers, times, dates, and numbers to
       ['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number']
    :: Fix any HTML tokens left in the data
    :: Segment words (e.g. whatisthis -> what is this)
    :: Correct spelling of elongated words (e.g. looooool -> lol)
    :: Basic tokenisation
    :: Emoticon substitution
    '''
    return " ".join(text_processor.pre_process_doc(sentence))

def main():
    data = datareader("data/sentiment_reviews/")
    data.reset_index(inplace=True, drop=True)
    # Some pre-processing steps
    data["words"] = data["words"].apply(decontract)
    data["words"] = data["words"].progress_apply(normalise)
    data["words"] = data["words"].apply(lower)
    data["words"] = data["words"].apply(remove_dots)
    data["words"] = data["words"].progress_apply(spell_correct)
    for _ in range(3):
        data = shuffle(data)
    data.reset_index(inplace=True, drop=True)
    # Create balanced train (600k), validation (100k) and test (100k) sets:
    # per class, the first 300k rows stay "train", the next 50k become "val"
    # and the remainder "test".
    data["is_which_set"] = "train"
    pos_idx = data.index[data["categories"] == 1]
    neg_idx = data.index[data["categories"] == 0]
    data.loc[pos_idx[300000:350000], "is_which_set"] = "val"
    data.loc[pos_idx[350000:], "is_which_set"] = "test"
    data.loc[neg_idx[300000:350000], "is_which_set"] = "val"
    data.loc[neg_idx[350000:], "is_which_set"] = "test"
    data.to_pickle('df_all.pkl')
    # data = pd.read_pickle('df_all.pkl')
    # pd.set_option('display.max_colwidth', None)  # show full column contents
    # print(data.head())


if __name__ == '__main__':
    main()