In [1]:
import os
from glob import glob
from os import path

import numpy as np
import pandas as pd
import tensorflow as tf
In [ ]:
DATASET_DIR = os.environ.get("DATASET_DIR", "../input/sensai")
print("DATASET_DIR", DATASET_DIR)

# Load every non-flagged and flagged chat parquet file into two DataFrames
df_nonflag = pd.concat(
    [pd.read_parquet(x) for x in glob(path.join(DATASET_DIR, 'chats_nonflag_*.parquet'))],
    ignore_index=True)
df_flag = pd.concat(
    [pd.read_parquet(x) for x in glob(path.join(DATASET_DIR, 'chats_flagged_*.parquet'))],
    ignore_index=True)

# DataFrame.append was removed in pandas 2.0; combine the two frames with pd.concat instead
df = pd.concat([df_nonflag, df_flag], ignore_index=True)
In [ ]:
df['label'].value_counts().plot(kind='barh')
Remove Duplicate Data
In [ ]:
# Remove duplicate rows
print("Total rows:", len(df))
print("Total rows after removing duplicates:", len(df.drop_duplicates()))
In [ ]:
df = df.drop_duplicates()
df['label'].value_counts().plot(kind='barh')
In [ ]:
# Map string labels to integers: flagged = 1, non-flagged = 0
df_labeled = df.copy()
df_labeled["label"] = np.where(df_labeled["label"].str.contains("nonflagged"), 0, 1)
# Shuffle the rows
df_labeled = df_labeled.sample(frac=1).reset_index(drop=True)
df_labeled.head(5)
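A quick sanity check, added here for illustration rather than taken from the original notebook: the mapping should leave only the integer labels 0 and 1.
In [ ]:
# Added check: confirm the label mapping produced only 0 (non-flagged) and 1 (flagged)
df_labeled['label'].value_counts()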
Hyperparameters
In [ ]:
# Hyperparameters
vocab_size = 10000
embedding_dim = 16
max_length = 15
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Stopword removal setup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))  # a set makes the membership test below fast
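To make the stopword filter concrete, the next cell is an added, self-contained example on a made-up chat line (not a message from the dataset); it only assumes the NLTK stopword list downloaded above.
In [ ]:
# Illustrative example only: show what the stopword filter keeps from a made-up message
from nltk.corpus import stopwords
stop_demo = set(stopwords.words('english'))
sample = "you are not going to believe this stream"
print(" ".join(w for w in sample.split() if w not in stop_demo))
# expected output with the standard NLTK list: "going believe stream"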
In [ ]:
import re

Words = []
Labels = []
# Strip English stopwords from each chat message body
for item in df_labeled.body:
    # item = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", item)
    item = " ".join([word for word in item.split() if word not in stop])
    Words.append(item)
for item in df_labeled.label:
    Labels.append(item)
In [ ]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# RandomUnderSampler expects a 2-D feature array, so reshape the texts to (n, 1)
rus = RandomUnderSampler()
Words = np.array(Words)
Words_reshape = Words.reshape(-1, 1)
X_rus, y_rus = rus.fit_resample(Words_reshape, Labels)

training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    X_rus, y_rus, test_size=0.15, random_state=42)

# Flatten the (n, 1) arrays back into plain lists of strings
training_sentences = [val for sublist in training_sentences.tolist() for val in sublist]
testing_sentences = [val for sublist in testing_sentences.tolist() for val in sublist]
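The following cell is an optional check, not part of the original pipeline: RandomUnderSampler should leave both classes with equal counts, and the split sizes can be confirmed before tokenizing.
In [ ]:
# Optional check (added for illustration): class balance after undersampling and split sizes
from collections import Counter
print("Class counts after undersampling:", Counter(y_rus))
print("Train/test sizes:", len(training_sentences), len(testing_sentences))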
In [ ]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
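The cell below is an added example, not part of the original preprocessing, showing what the fitted tokenizer does with an arbitrary made-up message: words missing from the training vocabulary fall back to the <OOV> token, and short messages are zero-padded to max_length.
In [ ]:
# Added example: encode a made-up message with the tokenizer fitted above
demo_seq = tokenizer.texts_to_sequences(["totally unseen superchat message"])
demo_padded = pad_sequences(demo_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(demo_padded)  # out-of-vocabulary words map to the <OOV> index (1 by default)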
Create Model
In [ ]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
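For reference, the Embedding layer alone contributes vocab_size × embedding_dim = 10,000 × 16 = 160,000 trainable parameters, which the first row of model.summary() above should confirm.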
In [ ]:
num_epochs = 100
# Keras expects NumPy arrays rather than Python lists
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)
history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels),
                    verbose=1, batch_size=200)
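As a hedged sketch of how the fitted model might be used after training (the helper name and sample message below are my own, not from the notebook), the next cell scores a single raw chat message with the same preprocessing applied to the training data.
In [ ]:
# Hypothetical follow-up (not in the original): score one chat message with the tokenizer
# and model trained above; stopwords are removed to mirror the training preprocessing.
def predict_flag_probability(text):
    text = " ".join(w for w in text.split() if w not in stop)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    return float(model.predict(padded, verbose=0)[0][0])

print(predict_flag_probability("example chat message"))  # closer to 1.0 means more likely flagged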
In [ ]: