sensai/examples/toxic-chat-classification-using-lstm.ipynb

2 lines
11 KiB
Plaintext
Raw Normal View History

2022-06-03 16:36:32 +09:00
{"cells":[
{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:35.613795Z","iopub.status.busy":"2022-03-21T11:19:35.613465Z","iopub.status.idle":"2022-03-21T11:19:40.867477Z","shell.execute_reply":"2022-03-21T11:19:40.866437Z","shell.execute_reply.started":"2022-03-21T11:19:35.613732Z"},"trusted":true},"outputs":[],"source":[
"# Standard library\n",
"import os\n",
"from glob import glob\n",
"from os import path\n",
"\n",
"# Third-party\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tensorflow as tf"
]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:40.86965Z","iopub.status.busy":"2022-03-21T11:19:40.869365Z","iopub.status.idle":"2022-03-21T11:19:47.938615Z","shell.execute_reply":"2022-03-21T11:19:47.937623Z","shell.execute_reply.started":"2022-03-21T11:19:40.869597Z"},"trusted":true},"outputs":[],"source":[
"# Dataset location; override with the DATASET_DIR environment variable.\n",
"DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n",
"print(\"DATASET_DIR\", DATASET_DIR)\n",
"\n",
"\n",
"def load_chats(pattern):\n",
"    \"\"\"Read every parquet file in DATASET_DIR matching `pattern` into one DataFrame.\n",
"\n",
"    The result has a fresh 0..n-1 index (ignore_index=True).\n",
"    Raises FileNotFoundError when nothing matches, instead of the opaque\n",
"    'No objects to concatenate' ValueError pd.concat raises for an empty list.\n",
"    \"\"\"\n",
"    # glob() returns matches in arbitrary order; sort so row order is reproducible.\n",
"    shards = sorted(glob(path.join(DATASET_DIR, pattern)))\n",
"    if not shards:\n",
"        raise FileNotFoundError(f'no files matching {pattern!r} in {DATASET_DIR!r}')\n",
"    return pd.concat([pd.read_parquet(shard) for shard in shards], ignore_index=True)\n",
"\n",
"\n",
"df_nonflag = load_chats('chats_nonflag_*.parquet')\n",
"df_flag = load_chats('chats_flagged_*.parquet')\n",
"\n",
"# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.\n",
"# pd.concat is the replacement; like append, it keeps each frame's index labels.\n",
"df = pd.concat([df_nonflag, df_flag])"
]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:47.941236Z","iopub.status.busy":"2022-03-21T11:19:47.940035Z","iopub.status.idle":"2022-03-21T11:19:49.364806Z","shell.execute_reply":"2022-03-21T11:19:49.363826Z","shell.execute_reply.started":"2022-03-21T11:19:47.941189Z"},"trusted":true},"outputs":[],"source":[
"# Number of rows per value of the 'label' column.\n",
"df['label'].value_counts().plot(kind='barh', title='Rows per label');"
]},
{"cell_type":"markdown","metadata":{},"source":["## Remove Duplicate 
Data"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-03-21T11:19:49.368573Z","iopub.status.busy":"2022-03-21T11:19:49.368259Z","iopub.status.idle":"2022-03-21T11:19:55.435414Z","shell.execute_reply":"2022-03-21T11:19:55.433824Z","shell.execute_reply.started":"2022-03-21T11