predict.py
import argparse
import re
import emoji
import numpy as np
import pandas as pd
import pickle
from gensim.parsing.preprocessing import (
    split_alphanum, stem_text, strip_multiple_whitespaces, strip_punctuation, strip_short)


def tokenize(string):
""" Tokenizes a string.
Adds a space between numbers and letters, removes punctuation, repeated whitespaces, words
shorter than 2 characters, and stop-words. Returns a list of stems and, eventually, emojis.
@param string: String to tokenize.
@return: A list of stems and emojis.
"""
# Based on the Ranks NL (Google) stopwords list, but "how" and "will" are not stripped, and
# words shorter than 2 characters are not checked (since they are stripped):
stop_words = [
"about", "an", "are", "as", "at", "be", "by", "com", "for", "from", "in", "is", "it", "of",
"on", "or", "that", "the", "this", "to", "was", "what", "when", "where", "who", "with",
"the", "www"
]
string = strip_short(
strip_multiple_whitespaces(
strip_punctuation(
split_alphanum(string))),
minsize=2)
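    # Illustrative effect of the chain above (example string invented, not from the original
    # script): "Top10 TRICKS!!" -> "Top 10 TRICKS".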
    # Collect emoji characters (note: get_emoji_unicode_dict() belongs to older versions of the
    # emoji package; newer releases expose emoji.EMOJI_DATA instead):
emojis = [c for c in string if c in emoji.get_emoji_unicode_dict('en')]
# Remove every non-word character and stem each word:
string = stem_text(re.sub(r"[^\w\s,]", "", string))
# List of stems and emojis:
tokens = string.split() + emojis
    for stop_word in stop_words:
        try:
            tokens.remove(stop_word)
        except ValueError:  # stop-word not among the tokens
            pass
return tokens
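# Rough sketch of the end-to-end result (example invented, not from the original script):
# tokenize("10 AMAZING Facts About Cats 😱") returns something like
# ['10', 'amaz', 'fact', 'cat', '😱'] -- stemmed words, stop-words dropped, emojis appended.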


def average_embedding(tokens, word2vec, na_vector=None):
    """ Embeds a title with the average representation of its tokens.
    Returns the mean of the tokens' vector representations. When none of the tokens is in the
    Word2Vec vocabulary, a fallback vector can be provided instead (for example the mean vector
    representation of the train-set titles).
    @param tokens: List of tokens to embed.
    @param word2vec: Word2Vec model.
    @param na_vector: Vector representation to use when no token is in the Word2Vec model.
    @return: A vector representation for the token list.
    """
vectors = list()
for token in tokens:
if token in word2vec.wv:
vectors.append(word2vec.wv[token])
if len(vectors) == 0 and na_vector is not None:
vectors.append(na_vector)
return np.mean(np.array(vectors), axis=0)
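# Illustrative sketch with hypothetical 3-dimensional vectors: if "amaz" -> [0.2, 0.0, 0.4] and
# "cat" -> [0.0, 0.6, 0.2], the title embedding is their element-wise mean [0.1, 0.3, 0.3];
# tokens missing from the Word2Vec vocabulary are simply skipped.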


parser = argparse.ArgumentParser(description="Predict whether a YouTube video is clickbait or not.")
parser.add_argument(
"--title", "-t",
type=str, help="Title.", required=True)
parser.add_argument(
"--views", "-v",
type=int, help="Number of views.", required=False)
parser.add_argument(
"--likes", "-l",
type=int, help="Number of likes.", required=False)
parser.add_argument(
"--dislikes", "-d",
type=int, help="Number of dislikes.", required=False)
parser.add_argument(
"--comments", "-c",
type=int, help="Number of comments.", required=False)
args = parser.parse_args()

# Import the Word2Vec model and the mean vector representation computed on the train set:
word2vec = pickle.load(open("word2vec", "rb"))
mean_title_embedding = pickle.load(open("mean-title-embedding", "rb"))
# Build a single-row DataFrame from the command-line arguments ("features" avoids shadowing the
# built-in input()):
features = {
    "video_title": args.title,
    "video_views": args.views if args.views is not None else np.nan,
    "video_likes": args.likes if args.likes is not None else np.nan,
    "video_dislikes": args.dislikes if args.dislikes is not None else np.nan,
    "video_comments": args.comments if args.comments is not None else np.nan,
}
sample = pd.DataFrame([features])

# Tokenize the title and then compute its embedding:
sample["video_title"] = sample["video_title"].apply(tokenize)
sample["video_title"] = sample["video_title"].apply(
average_embedding, word2vec=word2vec, na_vector=mean_title_embedding)
sample = pd.concat(
[
sample[["video_views", "video_likes", "video_dislikes", "video_comments"]],
sample["video_title"].apply(pd.Series)
], axis=1)
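# The single row now holds the four raw metadata columns followed by one column per embedding
# dimension (e.g. 100 of them for a 100-dimensional Word2Vec model -- the dimensionality is an
# assumption; it depends on how the pickled model was trained).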
# Take the log of the video metadata; missing values are replaced with the mean log values
# computed on the train set:
mean_log_video_views = pickle.load(open("mean-log-video-views", "rb"))
mean_log_video_likes = pickle.load(open("mean-log-video-likes", "rb"))
mean_log_video_dislikes = pickle.load(open("mean-log-video-dislikes", "rb"))
mean_log_video_comments = pickle.load(open("mean-log-video-comments", "rb"))
sample[["video_views", "video_likes", "video_dislikes", "video_comments"]] = \
sample[["video_views", "video_likes", "video_dislikes", "video_comments"]].apply(np.log)
if sample["video_views"].isnull().any():
sample["video_views"].fillna(mean_log_video_views, inplace=True)
if sample["video_likes"].isnull().any():
sample["video_likes"].fillna(mean_log_video_likes, inplace=True)
if sample["video_dislikes"].isnull().any():
sample["video_dislikes"].fillna(mean_log_video_dislikes, inplace=True)
if sample["video_comments"].isnull().any():
sample["video_comments"].fillna(mean_log_video_comments, inplace=True)
# Replace any -inf value (the log of a zero count) with 0:
sample = sample.replace(-np.inf, 0)
# Import the min-max scaler and apply it to the sample:
min_max_scaler = pickle.load(open("min-max-scaler", "rb"))
sample = pd.DataFrame(min_max_scaler.transform(sample), columns=sample.columns)
# Import the SVM model:
svm = pickle.load(open("svm", "rb"))
# Print its prediction:
print("It's probably clickbait" if svm.predict(sample)[0] == 1 else "Doesn't look clickbait to me")