Sentiment Analysis - Deep Learning Approach (LSTM)

Sentiment Analysis - Deep Learning
In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re


from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
Using TensorFlow backend.
In [2]:
# Load the labelled movie reviews; the preview below shows the two columns
# `review` (text) and `sentiment` (label).
df = pd.read_csv(filepath_or_buffer="movie_data.csv")
In [3]:
# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)
Out[3]:
review sentiment
0 I went and saw this movie last night after bei... 1
1 Actor turned director Bill Paxton follows up h... 1
2 As a recreational golfer with some knowledge o... 1
3 I saw this film in a sneak preview, and it is ... 1
4 Bill Paxton has taken the true story of the 19... 1
In [4]:
# (rows, columns) of the loaded frame -- a size check before tokenizing.
df.shape
Out[4]:
(50000, 2)
In [5]:
# Distinct values present in the label column.
df.loc[:, "sentiment"].unique()
Out[5]:
array([1, 0], dtype=int64)
In [6]:
# Normalise case: replace the `review` column with its lower-cased text.
df = df.assign(review=df["review"].str.lower())
In [7]:
# Vocabulary cap handed to the tokenizer as `num_words`.
# (The misspelled name is kept on purpose: the model cell reads `max_fatures`.)
max_fatures = 2000

# Length every sequence is padded/truncated to.
# None keeps the previous behaviour: pad to the longest review, which was
# 1939 tokens in the recorded run, so the LSTM processes that many time steps
# for every sample. Set an integer cap (for example 500) to shorten training;
# the model cell picks the value up through X.shape[1].
max_len = None

tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# NOTE(review): the vocabulary is fitted on every review, including the ones
# that later end up in the test split.
tokenizer.fit_on_texts(df['review'].values)

# Reviews -> lists of integer word ids -> one fixed-width 2-D integer array.
X = tokenizer.texts_to_sequences(df['review'].values)
X = pad_sequences(X, maxlen=max_len)
In [8]:
embed_dim = 128  # size of each learned word vector
lstm_out = 196   # number of units in the LSTM layer

# Embedding -> spatial dropout -> LSTM -> 2-way softmax.
# The input length follows the padded width of X, so this cell must run after
# the tokenizing cell.
model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two output units because the labels are one-hot encoded before training.
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 1939, 128)         256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 1939, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
=================================================================
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None
In [9]:
# One-hot encode the labels: one indicator column per distinct sentiment value.
y = df['sentiment'].pipe(pd.get_dummies).values

# Hold out a third of the reviews for evaluation; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

# Feature and label shapes for each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(33500, 1939) (33500, 2)
(16500, 1939) (16500, 2)
In [10]:
# Mini-batch size; the prediction cell reuses this name.
batch_size = 32

# Train for seven epochs. verbose=2 logs one line per epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=7,
    verbose=2,
)
Epoch 1/7
 - 7771s - loss: 0.4913 - accuracy: 0.7632
Epoch 2/7
 - 6812s - loss: 0.3962 - accuracy: 0.8316
Epoch 3/7
 - 7125s - loss: 0.4015 - accuracy: 0.8235
Epoch 4/7
 - 8689s - loss: 0.3461 - accuracy: 0.8487
Epoch 5/7
 - 6405s - loss: 0.2686 - accuracy: 0.8899
Epoch 6/7
 - 6377s - loss: 0.2399 - accuracy: 0.9034
Epoch 7/7
 - 6396s - loss: 0.2243 - accuracy: 0.9094
Out[10]:
<keras.callbacks.callbacks.History at 0x207708d89e8>
In [11]:
# Softmax scores for the held-out reviews (one row per review, one column per
# class); they are reduced to class indices in the next cell.
y_pred = model.predict(x=X_test, batch_size=batch_size, verbose=2)
In [12]:
# Collapse per-class rows (softmax scores / one-hot targets) to class indices.
#
# The ndim guard makes the cell safe to re-run: once the arrays are already
# 1-D label vectors they are left untouched. Without it a second run would
# take argmax of each scalar, which is always 0, and silently turn every label
# into class 0.
if np.ndim(y_pred) > 1:
    y_pred = np.argmax(y_pred, axis=1)
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis=1)
In [13]:
# Per-class precision / recall / F1 as a nested dict.
report = classification_report(y_test, y_pred, output_dict=True)

# Keep the metrics table under its own name: rebinding `df` here would replace
# the review dataset and break any earlier cell that is run again afterwards.
report_df = pd.DataFrame(report).transpose()

# Emit the table as LaTeX source.
print(report_df.to_latex())
\begin{tabular}{lrrrr}
\toprule
{} &  f1-score &  precision &    recall &  support \\
\midrule
0            &  0.890747 &   0.897757 &  0.883846 &   8196.0 \\
1            &  0.893815 &   0.887083 &  0.900650 &   8304.0 \\
micro avg    &  0.892303 &   0.892303 &  0.892303 &  16500.0 \\
macro avg    &  0.892281 &   0.892420 &  0.892248 &  16500.0 \\
weighted avg &  0.892291 &   0.892385 &  0.892303 &  16500.0 \\
\bottomrule
\end{tabular}

In [15]:
# Overall fraction of held-out reviews classified correctly.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
Accuracy: 0.8923030303030303
In [16]:
from keras.models import load_model
In [17]:
# Persist the trained network to an HDF5 file in the working directory.
model.save(filepath='my_model.h5')