sensai/examples/toxic-chat-classification-using-lstm.ipynb

In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from glob import glob
from os import path
In [ ]:
DATASET_DIR = os.environ.get("DATASET_DIR", "../input/sensai")
print("DATASET_DIR", DATASET_DIR)

# Load non-flagged and flagged chat files into DataFrames
df_nonflag = pd.concat(
    [pd.read_parquet(x) for x in glob(path.join(DATASET_DIR, 'chats_nonflag_*.parquet'))],
    ignore_index=True)
df_flag = pd.concat(
    [pd.read_parquet(x) for x in glob(path.join(DATASET_DIR, 'chats_flagged_*.parquet'))],
    ignore_index=True)

# DataFrame.append was removed in pandas 2.0; concatenate instead
df = pd.concat([df_nonflag, df_flag], ignore_index=True)
In [ ]:
df['label'].value_counts().plot(kind='barh')

Remove Duplicate Data

In [ ]:
## Remove duplicates
print("Total rows:", len(df))
print("Total rows after removing duplicates:", len(df.drop_duplicates()))
In [ ]:
df = df.drop_duplicates()
df['label'].value_counts().plot(kind='barh')
In [ ]:
df_labeled = df.copy()
# Encode labels: flagged = 1, nonflagged = 0
df_labeled["label"] = np.where(df_labeled["label"].str.contains("nonflagged"), 0, 1)
df_labeled = df_labeled.sample(frac=1).reset_index(drop=True)  # shuffle rows
df_labeled.head(5)
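
A quick sanity check, not in the original notebook: counting the encoded labels confirms the 0/1 mapping and shows the class imbalance before undersampling.

In [ ]:
# Optional check (added): encoded label counts before balancing
df_labeled["label"].value_counts()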

Hyperparameters

In [ ]:
# Hyperparameters
vocab_size = 10000
embedding_dim = 16
max_length = 15
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Download English stopwords used for text cleaning
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
In [ ]:
import re

# Use a set for faster stopword membership checks
stop_set = set(stop)

Words = []
Labels = []
for item in df_labeled.body:
    # item = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", item)  # optional cleanup
    item = " ".join([word for word in item.split() if word not in stop_set])
    Words.append(item)

for item in df_labeled.label:
    Labels.append(item)
In [ ]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly undersampling the majority class
rus = RandomUnderSampler()
Words = np.array(Words)
Words_reshape = Words.reshape(-1, 1)
X_rus, y_rus = rus.fit_resample(Words_reshape, Labels)

from sklearn.model_selection import train_test_split
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    X_rus, y_rus, test_size=0.15, random_state=42)

# fit_resample returns a 2-D array of shape (n, 1); flatten back to flat lists of strings
training_sentences = [val for sublist in training_sentences.tolist() for val in sublist]
testing_sentences = [val for sublist in testing_sentences.tolist() for val in sublist]
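
As an added sketch, not part of the original notebook, it is worth confirming that RandomUnderSampler actually balanced the classes and checking the split sizes:

In [ ]:
# Optional check (added): class balance after undersampling and split sizes
from collections import Counter
print("Class counts after undersampling:", Counter(y_rus))
print("Training examples:", len(training_sentences))
print("Testing examples:", len(testing_sentences))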
In [ ]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
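
Another optional sanity check, not in the original notebook: decoding one padded sequence back into words confirms that tokenization, OOV handling, and padding behave as expected.

In [ ]:
# Optional check (added): map token ids back to words; id 0 is padding
index_word = {v: k for k, v in word_index.items()}
sample = training_padded[0]
print("ids:   ", sample)
print("tokens:", [index_word.get(i, "<PAD>") for i in sample])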

Create Model

In [ ]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
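
The next cell trains for a fixed 100 epochs. As an optional addition, not in the original notebook, an EarlyStopping callback could stop training once validation loss stops improving and restore the best weights:

In [ ]:
# Optional (added): early stopping on validation loss
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True)
# Pass callbacks=[early_stop] to model.fit() below to enable it.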
In [ ]:
num_epochs = 100
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)
history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels), verbose=1, batch_size=200)
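
To inspect the result, the training history can be plotted and the model tried on a new message. This cell is an added sketch, not part of the original notebook, and the sample string is made up.

In [ ]:
# Optional (added): training curves and a sample prediction
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='val accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()

# Score an arbitrary (made-up) chat message; an output near 1 means "flagged"
sample = ["you are the worst streamer ever"]
seq = pad_sequences(tokenizer.texts_to_sequences(sample),
                    maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(seq))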