-
Notifications
You must be signed in to change notification settings - Fork 0
/
Text Classification using tensor flow_ project.py
72 lines (68 loc) · 2.61 KB
/
Text Classification using tensor flow_ project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
# Load the wine-reviews dataset, keeping only the columns used below.
# NOTE(review): "/wine-reviews.csv" is an absolute root path — presumably a
# Kaggle/Colab mount; confirm it exists in the target environment.
df = pd.read_csv("/wine-reviews.csv", usecols = ['country', 'description', 'points', 'price', 'variety', 'winery'])
# Drop rows missing either the review text or the score — both are required.
df = df.dropna(subset=["description", "points"])
# Visualize the distribution of review scores.
plt.hist(df.points, bins=20)
plt.title("Points histogram")
plt.ylabel("N")
plt.xlabel("Points")
plt.show()
# Binarize the target: 1 for "high quality" (>= 90 points), 0 otherwise.
df["label"] = (df.points >= 90).astype(int)
# Keep only the model inputs: text and binary label.
df = df[["description","label"]]
# Shuffle, then split 80/10/10 into train/validation/test.
# NOTE(review): df.sample(frac=1) has no random_state, so the split is not
# reproducible across runs — confirm this is intended.
train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))])
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
    """Turn a (description, label) DataFrame into a batched tf.data.Dataset.

    Optionally shuffles the examples (buffer covering the full frame),
    then batches and prefetches for the training loop.
    """
    frame = dataframe.copy()
    targets = frame.pop('label')
    texts = frame["description"]
    dataset = tf.data.Dataset.from_tensor_slices((texts, targets))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
# Build the batched tf.data pipelines for each split.
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)
# Pretrained 50-dim sentence-embedding module from TF Hub; trainable=True so
# its weights fine-tune on this task (requires a network download).
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)
# Smoke-test the layer on the texts of the first batch.
# NOTE(review): list(train_data) materializes every batch just to take the
# first one — next(iter(train_data)) would avoid that; confirm before changing.
hub_layer(list(train_data)[0][0])
# Baseline classifier: pretrained Hub embeddings -> two small dense blocks
# with dropout -> sigmoid probability of "high quality".
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
# Record untrained baseline metrics, train, then score the held-out test set.
model.evaluate(train_data)
model.evaluate(valid_data)
history = model.fit(train_data, epochs=5, validation_data=valid_data)
model.evaluate(test_data)
# Learn a 2000-token vocabulary from the training texts only (labels dropped).
encoder = tf.keras.layers.TextVectorization(max_tokens=2000)
encoder.adapt(train_data.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())
# Peek at the most frequent tokens — a bare expression, so this only displays
# in a notebook; it is a no-op when run as a script.
vocab[:20]
# Second classifier: learned token embeddings (masked padding) fed through an
# LSTM, then a dense block with dropout and a sigmoid output.
model = tf.keras.Sequential()
model.add(encoder)
model.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=32,
    mask_zero=True,
))
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
# Untrained baseline metrics, training run, then final test evaluation.
model.evaluate(train_data)
model.evaluate(valid_data)
history = model.fit(train_data, epochs=5, validation_data=valid_data)
model.evaluate(test_data)