sensai/examples/toxic-chat-classification-using-lstm.ipynb

2 lines
11 KiB
Plaintext

{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:35.613795Z","iopub.status.busy":"2022-03-21T11:19:35.613465Z","iopub.status.idle":"2022-03-21T11:19:40.867477Z","shell.execute_reply":"2022-03-21T11:19:40.866437Z","shell.execute_reply.started":"2022-03-21T11:19:35.613732Z"},"trusted":true},"outputs":[{"ename":"ModuleNotFoundError","evalue":"No module named 'tensorflow'","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[1;32m/home/uetchy/repos/src/github.com/holodata/sensai-dataset/examples/toxic-chat-classification-using-lstm.ipynb Cell 1'\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Btakos/home/uetchy/repos/src/github.com/holodata/sensai-dataset/examples/toxic-chat-classification-using-lstm.ipynb#ch0000001vscode-remote?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2Btakos/home/uetchy/repos/src/github.com/holodata/sensai-dataset/examples/toxic-chat-classification-using-lstm.ipynb#ch0000001vscode-remote?line=1'>2</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtensorflow\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mtf\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Btakos/home/uetchy/repos/src/github.com/holodata/sensai-dataset/examples/toxic-chat-classification-using-lstm.ipynb#ch0000001vscode-remote?line=2'>3</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mnp\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Btakos/home/uetchy/repos/src/github.com/holodata/sensai-dataset/examples/toxic-chat-classification-using-lstm.ipynb#ch0000001vscode-remote?line=3'>4</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mglob\u001b[39;00m \u001b[39mimport\u001b[39;00m glob\n","\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tensorflow'"]}],"source":["import pandas as pd\n","import tensorflow as tf\n","import numpy as np\n","from glob import glob\n","from os import path"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:40.86965Z","iopub.status.busy":"2022-03-21T11:19:40.869365Z","iopub.status.idle":"2022-03-21T11:19:47.938615Z","shell.execute_reply":"2022-03-21T11:19:47.937623Z","shell.execute_reply.started":"2022-03-21T11:19:40.869597Z"},"trusted":true},"outputs":[],"source":["DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n","print(\"DATASET_DIR\", DATASET_DIR)\n","\n","df_nonflag = []\n","df_flag =[]\n","df_nonflag = pd.concat(\n"," [pd.read_parquet(x) for x in glob(path.join(DATASET_DIR, 'chats_nonflag_*.parquet'))],\n"," ignore_index=True)\n","df_flag = pd.concat(\n"," [pd.read_parquet(x) for x in glob(path.join(DATASET_DIR, 'chats_flagged_*.parquet'))],\n"," ignore_index=True)\n","\n","df = df_nonflag\n","df = df.append(df_flag)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:47.941236Z","iopub.status.busy":"2022-03-21T11:19:47.940035Z","iopub.status.idle":"2022-03-21T11:19:49.364806Z","shell.execute_reply":"2022-03-21T11:19:49.363826Z","shell.execute_reply.started":"2022-03-21T11:19:47.941189Z"},"trusted":true},"outputs":[],"source":["df['label'].value_counts().plot(kind='barh')"]},{"cell_type":"markdown","metadata":{},"source":["## Remove Duplicate Data"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:49.368573Z","iopub.status.busy":"2022-03-21T11:19:49.368259Z","iopub.status.idle":"2022-03-21T11:19:55.435414Z","shell.execute_reply":"2022-03-21T11:19:55.433824Z","shell.execute_reply.started":"2022-03-21T11:19:49.368526Z"},"trusted":true},"outputs":[],"source":["## Remove duplicate\n","print(\"Total data : \", len(df))\n","print(\"Total data after duplicates removes : \", len(df.drop_duplicates()))"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:55.437634Z","iopub.status.busy":"2022-03-21T11:19:55.437029Z","iopub.status.idle":"2022-03-21T11:20:02.277705Z","shell.execute_reply":"2022-03-21T11:20:02.276676Z","shell.execute_reply.started":"2022-03-21T11:19:55.437588Z"},"trusted":true},"outputs":[],"source":["df = df.drop_duplicates()\n","df['label'].value_counts().plot(kind='barh')"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:20:02.280104Z","iopub.status.busy":"2022-03-21T11:20:02.279547Z","iopub.status.idle":"2022-03-21T11:20:05.956737Z","shell.execute_reply":"2022-03-21T11:20:05.955697Z","shell.execute_reply.started":"2022-03-21T11:20:02.280058Z"},"trusted":true},"outputs":[],"source":["df_labeled = df\n","df_labeled[\"label\"] = np.where(df[\"label\"].str.contains(\"nonflagged\"), 0, 1)\n","df_labeled = df_labeled.sample(frac=1).reset_index(drop=True)\n","df_labeled.head(5)\n","#Flag = 1\n","#Nonflag = 0"]},{"cell_type":"markdown","metadata":{},"source":["## Hyperparameter"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:20:05.958838Z","iopub.status.busy":"2022-03-21T11:20:05.958517Z","iopub.status.idle":"2022-03-21T11:20:07.307416Z","shell.execute_reply":"2022-03-21T11:20:07.305733Z","shell.execute_reply.started":"2022-03-21T11:20:05.958796Z"},"trusted":true},"outputs":[],"source":["#Hyperparam\n","vocab_size = 10000\n","embedding_dim = 16\n","max_length = 15\n","trunc_type='post'\n","padding_type='post'\n","oov_tok = \"<OOV>\"\n","\n","#Removing stopwords\n","import nltk.corpus\n","nltk.download('stopwords')\n","from nltk.corpus import stopwords\n","stop = stopwords.words('english')"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:20:07.310522Z","iopub.status.busy":"2022-03-21T11:20:07.309937Z","iopub.status.idle":"2022-03-21T11:20:30.417533Z","shell.execute_reply":"2022-03-21T11:20:30.416592Z","shell.execute_reply.started":"2022-03-21T11:20:07.310478Z"},"trusted":true},"outputs":[],"source":["import re\n","\n","Words = []\n","Labels = []\n","for item in df_labeled.body :\n"," #item = re.sub(r\"(@\\[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)|^rt|http.+?\", \"\", item)\n"," item = \" \".join([word for word in item.split() if word not in (stop)])\n"," Words.append(item)\n","\n","for item in df_labeled.label :\n"," Labels.append(item)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:21:22.953452Z","iopub.status.busy":"2022-03-21T11:21:22.952793Z","iopub.status.idle":"2022-03-21T11:21:46.120344Z","shell.execute_reply":"2022-03-21T11:21:46.119296Z","shell.execute_reply.started":"2022-03-21T11:21:22.953417Z"},"trusted":true},"outputs":[],"source":["from imblearn.under_sampling import RandomUnderSampler\n","rus = RandomUnderSampler()\n","Words = np.array(Words)\n","Words_reshape = Words.reshape(-1, 1)\n","X_rus, y_rus= rus.fit_resample(Words_reshape, Labels)\n","\n","from sklearn.model_selection import train_test_split\n","training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(X_rus, y_rus, test_size=0.15, random_state=42)\n","\n","training_sentences = training_sentences.tolist()\n","testing_sentences = testing_sentences.tolist()\n","\n","training_sentences = [val for sublist in training_sentences for val in sublist]\n","testing_sentences = [val for sublist in testing_sentences for val in sublist]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:21:46.122546Z","iopub.status.busy":"2022-03-21T11:21:46.122254Z","iopub.status.idle":"2022-03-21T11:23:16.233315Z","shell.execute_reply":"2022-03-21T11:23:16.232054Z","shell.execute_reply.started":"2022-03-21T11:21:46.122503Z"},"trusted":true},"outputs":[],"source":["from tensorflow.keras.preprocessing.text import Tokenizer\n","from tensorflow.keras.preprocessing.sequence import pad_sequences\n","\n","tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)\n","tokenizer.fit_on_texts(training_sentences)\n","\n","word_index = tokenizer.word_index\n","\n","training_sequences = tokenizer.texts_to_sequences(training_sentences)\n","training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)\n","\n","testing_sequences = tokenizer.texts_to_sequences(testing_sentences)\n","testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)"]},{"cell_type":"markdown","metadata":{},"source":["## Create Model"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:25:09.078404Z","iopub.status.busy":"2022-03-21T11:25:09.077853Z","iopub.status.idle":"2022-03-21T11:25:12.755265Z","shell.execute_reply":"2022-03-21T11:25:12.754303Z","shell.execute_reply.started":"2022-03-21T11:25:09.078355Z"},"trusted":true},"outputs":[],"source":["model = tf.keras.Sequential([\n"," tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),\n"," tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),\n"," tf.keras.layers.Dense(128, activation='relu'),\n"," tf.keras.layers.Dense(256, activation='relu'),\n"," tf.keras.layers.Dense(512, activation='relu'),\n"," tf.keras.layers.Dense(512, activation='relu'),\n"," tf.keras.layers.Dense(256, activation='relu'),\n"," tf.keras.layers.Dense(128, activation='relu'),\n"," tf.keras.layers.Dense(64, activation='relu'),\n"," tf.keras.layers.Dense(32, activation='relu'),\n"," tf.keras.layers.Dense(1, activation='sigmoid')\n","])\n","model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])\n","model.summary()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:25:22.33961Z","iopub.status.busy":"2022-03-21T11:25:22.339264Z"},"trusted":true},"outputs":[],"source":["num_epochs = 100\n","training_padded = np.array(training_padded)\n","training_labels = np.array(training_labels)\n","testing_padded = np.array(testing_padded)\n","testing_labels = np.array(testing_labels)\n","history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1,batch_size=200)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.6"}},"nbformat":4,"nbformat_minor":4}