-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
37 lines (27 loc) · 1.16 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
from bs4 import BeautifulSoup
import pickle
import os
import glob
def review_to_words(review):
nltk.download("stopwords", quiet=True)
stemmer = PorterStemmer()
text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
words = text.split() # Split string into words
words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
words = [PorterStemmer().stem(w) for w in words] # stem
return words
def convert_and_pad(word_dict, sentence, pad=500):
NOWORD = 0 # We will use 0 to represent the 'no word' category
INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
working_sentence = [NOWORD] * pad
for word_index, word in enumerate(sentence[:pad]):
if word in word_dict:
working_sentence[word_index] = word_dict[word]
else:
working_sentence[word_index] = INFREQ
return working_sentence, min(len(sentence), pad)