sensai/examples/tangram.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DATASET_DIR /home/uetchy/repos/src/github.com/holodata/sensai-huggingface\n"
]
}
],
"source": [
"import os\n",
"from os.path import join\n",
"from glob import glob\n",
"import pandas as pd\n",
"from sentence_transformers import SentenceTransformer\n",
"import tangram\n",
"\n",
"DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n",
"print(\"DATASET_DIR\", DATASET_DIR)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# load tokenizer\n",
"st = SentenceTransformer('all-MiniLM-L6-v2')"
]
},
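{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (sketch): all-MiniLM-L6-v2 produces 384-dimensional sentence\n",
"# embeddings, which is where the 384 feature columns generated below come from.\n",
"st.get_sentence_embedding_dimension()"
]
},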
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading data...\n"
]
}
],
"source": [
"print('Loading data...')\n",
"df = pd.concat(\n",
" [pd.read_parquet(x) for x in glob(join(DATASET_DIR, '*.parquet'))],\n",
" ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sampling data...\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 9121058 entries, 0 to 12942624\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 authorName object\n",
" 1 body object\n",
" 2 label object\n",
"dtypes: object(3)\n",
"memory usage: 278.4+ MB\n",
"sample_size 3647717\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 7295434 entries, 0 to 7295433\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 authorName object\n",
" 1 body object\n",
" 2 label object\n",
"dtypes: object(3)\n",
"memory usage: 167.0+ MB\n",
"label\n",
"safe 3647717\n",
"toxic 3647717\n",
"dtype: int64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>authorName</th>\n",
" <th>body</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2646589</th>\n",
" <td>それもえ</td>\n",
" <td>ナイスー</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6722778</th>\n",
" <td>LassieOW</td>\n",
" <td>K U S A K E N ? ? ? K U S A K E N ? ? ? K U S...</td>\n",
" <td>toxic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4490817</th>\n",
" <td>ヤルダバオトここに</td>\n",
" <td>猫左下</td>\n",
" <td>toxic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>522319</th>\n",
" <td>IkeShamus</td>\n",
" <td>5</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6359809</th>\n",
" <td>susi Susanti</td>\n",
" <td>keigo tsundere</td>\n",
" <td>toxic</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" authorName body \\\n",
"2646589 それもえ ナイスー \n",
"6722778 LassieOW K U S A K E N ? ? ? K U S A K E N ? ? ? K U S... \n",
"4490817 ヤルダバオトここに 猫左下 \n",
"522319 IkeShamus 5 \n",
"6359809 susi Susanti keigo tsundere \n",
"\n",
" label \n",
"2646589 safe \n",
"6722778 toxic \n",
"4490817 toxic \n",
"522319 safe \n",
"6359809 toxic "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Sampling data...')\n",
"\n",
"# Chats with body size larger than 1\n",
"# df = df[df['body'].str.len() > 1]\n",
"\n",
"# Merge labels\n",
"df['label'] = df['label'].apply(lambda x: {\n",
" 'deleted': 'toxic',\n",
" 'hidden': 'toxic',\n",
" 'nonflagged': 'safe'\n",
"}[x])\n",
"\n",
"# Drop duplicates\n",
"df = df.drop_duplicates()\n",
"df.info()\n",
"\n",
"# Balance item count for each category\n",
"g = df.groupby('label')\n",
"sample_size = min(g.size().min(), 4_000_000)\n",
"print('sample_size', sample_size)\n",
"df = g.apply(lambda x: x.sample(sample_size)).reset_index(drop=True)\n",
"\n",
"df.info()\n",
"print(df.groupby('label').size())\n",
"df.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Formatting data...\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 7295434 entries, 0 to 7295433\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 body object\n",
" 1 label object\n",
"dtypes: object(2)\n",
"memory usage: 111.3+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6998597</th>\n",
" <td>allormist&lt;SEP&gt;demon slayer copied this idea</td>\n",
" <td>toxic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2536692</th>\n",
" <td>Rei&lt;SEP&gt;ああああああああ</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3713136</th>\n",
" <td>Alex&lt;SEP&gt;Look at the first part of the Quest. ...</td>\n",
" <td>toxic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5614434</th>\n",
" <td>Nov4Zyz&lt;SEP&gt;かわいい</td>\n",
" <td>toxic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>517705</th>\n",
" <td>Aida&lt;SEP&gt;lmaoooo</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body label\n",
"6998597 allormist<SEP>demon slayer copied this idea toxic\n",
"2536692 Rei<SEP>ああああああああ safe\n",
"3713136 Alex<SEP>Look at the first part of the Quest. ... toxic\n",
"5614434 Nov4Zyz<SEP>かわいい toxic\n",
"517705 Aida<SEP>lmaoooo safe"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Formatting data...')\n",
"\n",
"# Add counting features\n",
"# df['blen'] = df['body'].str.len()\n",
"\n",
"# Concat author name and message with a separator\n",
"df['body'] = df.apply(lambda x: (x['authorName'] or '<EMPTY>') + '<SEP>' + x['body'], axis=1)\n",
"df.drop(columns=['authorName'], inplace=True)\n",
"\n",
"df.info()\n",
"df.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Calculating embedding vectors...\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>...</th>\n",
" <th>374</th>\n",
" <th>375</th>\n",
" <th>376</th>\n",
" <th>377</th>\n",
" <th>378</th>\n",
" <th>379</th>\n",
" <th>380</th>\n",
" <th>381</th>\n",
" <th>382</th>\n",
" <th>383</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>safe</td>\n",
" <td>0.009612</td>\n",
" <td>0.052424</td>\n",
" <td>0.083845</td>\n",
" <td>-0.031595</td>\n",
" <td>0.028893</td>\n",
" <td>0.059027</td>\n",
" <td>0.045335</td>\n",
" <td>-0.014343</td>\n",
" <td>0.037504</td>\n",
" <td>...</td>\n",
" <td>-0.088224</td>\n",
" <td>0.037281</td>\n",
" <td>0.020476</td>\n",
" <td>0.067415</td>\n",
" <td>0.000324</td>\n",
" <td>-0.049588</td>\n",
" <td>0.006397</td>\n",
" <td>0.139169</td>\n",
" <td>0.049218</td>\n",
" <td>-0.025690</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>safe</td>\n",
" <td>0.003328</td>\n",
" <td>0.114213</td>\n",
" <td>0.032118</td>\n",
" <td>-0.055552</td>\n",
" <td>-0.008358</td>\n",
" <td>0.024322</td>\n",
" <td>0.080483</td>\n",
" <td>0.007664</td>\n",
" <td>0.062874</td>\n",
" <td>...</td>\n",
" <td>-0.000377</td>\n",
" <td>0.046901</td>\n",
" <td>0.018417</td>\n",
" <td>-0.003170</td>\n",
" <td>-0.027469</td>\n",
" <td>0.005639</td>\n",
" <td>0.133500</td>\n",
" <td>0.032221</td>\n",
" <td>0.030415</td>\n",
" <td>-0.019664</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>safe</td>\n",
" <td>0.031809</td>\n",
" <td>-0.024030</td>\n",
" <td>0.026037</td>\n",
" <td>-0.039061</td>\n",
" <td>-0.002786</td>\n",
" <td>0.052565</td>\n",
" <td>0.126942</td>\n",
" <td>-0.014587</td>\n",
" <td>0.047931</td>\n",
" <td>...</td>\n",
" <td>-0.014579</td>\n",
" <td>0.009263</td>\n",
" <td>0.010371</td>\n",
" <td>0.045266</td>\n",
" <td>-0.076779</td>\n",
" <td>-0.023678</td>\n",
" <td>0.021364</td>\n",
" <td>0.036397</td>\n",
" <td>-0.001076</td>\n",
" <td>-0.060540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>safe</td>\n",
" <td>-0.061874</td>\n",
" <td>0.046060</td>\n",
" <td>0.021520</td>\n",
" <td>0.021778</td>\n",
" <td>0.009127</td>\n",
" <td>0.021292</td>\n",
" <td>0.184221</td>\n",
" <td>0.004618</td>\n",
" <td>-0.069447</td>\n",
" <td>...</td>\n",
" <td>0.000147</td>\n",
" <td>0.002003</td>\n",
" <td>0.089495</td>\n",
" <td>0.040357</td>\n",
" <td>0.007321</td>\n",
" <td>0.016096</td>\n",
" <td>0.012340</td>\n",
" <td>0.062434</td>\n",
" <td>0.005810</td>\n",
" <td>0.003983</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>safe</td>\n",
" <td>-0.053528</td>\n",
" <td>-0.045918</td>\n",
" <td>0.076154</td>\n",
" <td>-0.003261</td>\n",
" <td>0.005603</td>\n",
" <td>-0.058807</td>\n",
" <td>0.071698</td>\n",
" <td>0.005038</td>\n",
" <td>0.010522</td>\n",
" <td>...</td>\n",
" <td>0.077442</td>\n",
" <td>-0.080605</td>\n",
" <td>-0.038363</td>\n",
" <td>0.013772</td>\n",
" <td>-0.049518</td>\n",
" <td>-0.004730</td>\n",
" <td>0.142463</td>\n",
" <td>-0.017870</td>\n",
" <td>0.013901</td>\n",
" <td>-0.037920</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 385 columns</p>\n",
"</div>"
],
"text/plain": [
" label 0 1 2 3 4 5 6 \\\n",
"0 safe 0.009612 0.052424 0.083845 -0.031595 0.028893 0.059027 0.045335 \n",
"1 safe 0.003328 0.114213 0.032118 -0.055552 -0.008358 0.024322 0.080483 \n",
"2 safe 0.031809 -0.024030 0.026037 -0.039061 -0.002786 0.052565 0.126942 \n",
"3 safe -0.061874 0.046060 0.021520 0.021778 0.009127 0.021292 0.184221 \n",
"4 safe -0.053528 -0.045918 0.076154 -0.003261 0.005603 -0.058807 0.071698 \n",
"\n",
" 7 8 ... 374 375 376 377 378 \\\n",
"0 -0.014343 0.037504 ... -0.088224 0.037281 0.020476 0.067415 0.000324 \n",
"1 0.007664 0.062874 ... -0.000377 0.046901 0.018417 -0.003170 -0.027469 \n",
"2 -0.014587 0.047931 ... -0.014579 0.009263 0.010371 0.045266 -0.076779 \n",
"3 0.004618 -0.069447 ... 0.000147 0.002003 0.089495 0.040357 0.007321 \n",
"4 0.005038 0.010522 ... 0.077442 -0.080605 -0.038363 0.013772 -0.049518 \n",
"\n",
" 379 380 381 382 383 \n",
"0 -0.049588 0.006397 0.139169 0.049218 -0.025690 \n",
"1 0.005639 0.133500 0.032221 0.030415 -0.019664 \n",
"2 -0.023678 0.021364 0.036397 -0.001076 -0.060540 \n",
"3 0.016096 0.012340 0.062434 0.005810 0.003983 \n",
"4 -0.004730 0.142463 -0.017870 0.013901 -0.037920 \n",
"\n",
"[5 rows x 385 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Calculating embedding vectors...')\n",
"\n",
"emb = pd.DataFrame(st.encode(df['body'].to_list(), convert_to_numpy=True))\n",
"df = pd.concat([df, emb], axis=1).drop(columns=['body'])\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 7295434 entries, 0 to 7295433\n",
"Columns: 385 entries, label to 383\n",
"dtypes: float32(384), object(1)\n",
"memory usage: 10.5+ GB\n"
]
}
],
"source": [
"df.info()\n",
"df.to_csv('./data/input_7m_balanced_deduped_amc_MiniLM.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!tangram train -f ./data/input_7m_balanced_deduped_amc_MiniLM.csv -o ./models/sensai_7m_balanced_deduped_amc_MiniLM.tangram -t label"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"model = tangram.Model.from_path('./models/sensai_7m_balanced_deduped_amc_MiniLM.tangram')\n",
"\n",
"def remap_label(pred) -> str:\n",
" label = '👍'\n",
" if pred.class_name == 'toxic':\n",
" if pred.probability <= 0.6:\n",
" label = '⚖️'\n",
" else:\n",
" label = '🚽'\n",
" return label\n",
"\n",
"\n",
"def predict_batch(texts):\n",
" preds = model.predict([{str(k): v for k,v in enumerate(x)} for x in st.encode(texts)])\n",
" return [(remap_label(pred), pred.probability) for pred in preds]\n",
"\n",
"def predict(text: str):\n",
" return predict_batch([text])[0]"
]
},
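{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal usage sketch for the single-message helper defined above; it returns an\n",
"# (emoji label, probability) tuple. The example text here is made up.\n",
"predict('you all did great today')"
]
},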
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"👍 : lol\n",
"🚽 : 草\n",
"👍 : かわいい!!!\n",
"👍 : INAAAAAAAAAA\n",
"🚽 : [ENG] Heh ? don't I have way less recovery items ? don't they come back on each continue ????\n",
"👍 : What's with this sassy lost child?\n",
"👍 : Your village looks so cute and pretty\n",
"⚖️ : imagine the dead team member watching him dance for no reason\n",
"👍 : A型だけど、気にならないどころかむしろ安心感ある\n",
"🚽 : Tenchou pls even if you don't think your singing is as good, you are still one of the most electric and infectious souls I've ever seen. Ganbatte bossu!!!\n",
"🚽 : Nice butt\n",
"🚽 : My dog died let's goooooooooooo\n",
"👍 : when will you graduate?\n",
"⚖️ : lol you guys are pathetic\n",
"🚽 : 死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね\n",
"🚽 : 卒業卒業卒業卒業\n"
]
}
],
"source": [
"chats = [\n",
" \"lol\",\n",
" \"草\",\n",
" \"かわいい!!!\", # cute !!!\n",
" \"INAAAAAAAAAA\",\n",
" \"[ENG] Heh ? don't I have way less recovery items ? don't they come back on each continue ????\",\n",
" \"What's with this sassy lost child?\",\n",
" \"Your village looks so cute and pretty\",\n",
" \"imagine the dead team member watching him dance for no reason\",\n",
" \"A型だけど、気にならないどころかむしろ安心感ある\",\n",
" \"Tenchou pls even if you don't think your singing is as good, you are still one of the most electric and infectious souls I've ever seen. Ganbatte bossu!!!\",\n",
" \"Nice butt\",\n",
" \"My dog died let's goooooooooooo\",\n",
" \"when will you graduate?\",\n",
" \"lol you guys are pathetic\",\n",
" \"死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね死ね\", # die die die die die die\n",
" \"卒業卒業卒業卒業\", # graduate graduate graduate graduate\n",
"]\n",
"\n",
"for text, pred in zip(chats, predict_batch(chats)):\n",
" print(pred[0], ':', text)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# of deleted chats 6471313\n",
"# of safe chats 6471313\n"
]
}
],
"source": [
"deleted_by_mods = df[df['label'] != 'nonflagged']\n",
"safe = df[df['label'] == 'nonflagged']\n",
"print('# of deleted chats', len(deleted_by_mods))\n",
"print('# of safe chats', len(safe))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"N = 50000\n",
"\n",
"def adjust_label(pred):\n",
" if pred.class_name == 'safe':\n",
" if pred.probability <= 0.5:\n",
" return 'toxic'\n",
" else:\n",
" return 'safe'\n",
" return 'toxic'\n",
"\n",
"def predict_labels(chats):\n",
" return [(x.class_name, x.probability) for x in model.predict([{str(k): v for k,v in enumerate(x)} for x in st.encode(chats.apply(lambda x: (x['authorName'] or '<EMPTY>') + '<SEP>' + x['body'], axis=1).to_list())])]\n",
"\n",
"deleted_labels = predict_labels(deleted_by_mods.sample(N))\n",
"safe_labels = predict_labels(safe.sample(N))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"N 100000\n",
"FPR 0.14654\n",
"FNR 0.181\n",
"Sensitivity 0.819\n",
"Precision 0.8482300060070013\n",
"Accuracy 0.83623\n",
"F(2) score 0.8246837201996158\n"
]
}
],
"source": [
"def f_beta(prec, rec, beta):\n",
" return (1+beta**2) * ( (prec * rec) / ( beta**2 * prec + rec ) )\n",
"\n",
"fn = len([x for x in deleted_labels if x[0] == 'safe'])\n",
"fp = len([x for x in safe_labels if x[0] != 'safe'])\n",
"tp = N - fn\n",
"tn = N - fp\n",
"pp = tp + fp\n",
"fnr = fn / N\n",
"fpr = fp / N\n",
"sensitivity = tp / N\n",
"precision = tp / pp\n",
"accuracy = (tp + tn) / (N*2)\n",
"fb = f_beta(precision, sensitivity, 2)\n",
"print('N', len(deleted_labels)+len(safe_labels))\n",
"print('FPR', fpr)\n",
"print('FNR', fnr)\n",
"print('Sensitivity', sensitivity)\n",
"print('Precision', precision)\n",
"print('Accuracy', accuracy)\n",
"print('F(2) score', fb)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('safe', 0.5598429441452026)]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_labels(pd.DataFrame.from_dict([{'authorName': 'Skamor', 'body': '味ってわかる?\"あじ\"って読むんやで'}]))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>authorName</th>\n",
" <th>body</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Hi</td>\n",
" <td>Hi</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" authorName body\n",
"0 Hi Hi"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame.from_dict([{'authorName': 'Hi', 'body': 'Hi'}])"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"def convertRawMessageToString(rawMessage):\n",
" def handler(run):\n",
" msgType = list(run.keys())[0]\n",
" payload = run[msgType]\n",
" if msgType == 'text':\n",
" return payload\n",
" elif msgType == 'emoji':\n",
" if 'isCustomEmoji' in payload:\n",
" return \"\\uFFFD\"\n",
" else:\n",
" return payload['emojiId']\n",
" else:\n",
" raise 'Invalid type: ' + msgType + ', ' + payload\n",
" return \"\".join([handler(run) for run in rawMessage])\n",
"\n",
"with open('addChatItemAction.jsonl') as f:\n",
" lines = list(map(json.loads, f.readlines()))\n",
"chats = pd.DataFrame.from_dict(lines)\n",
"chats = chats[['authorName', 'message']]\n",
"chats['message'] = chats['message'].apply(convertRawMessageToString)\n",
"chats = chats.rename(columns={'message': 'body'})\n",
"chats['label'] = pd.Series(map(lambda x: 'hold' if x[1] < 0.85 else {'safe': 'safe', 'toxic': 'annoying'}[x[0]], predict_labels(chats)))"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>authorName</th>\n",
" <th>body</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3690</th>\n",
" <td>Miguel Islas</td>\n",
" <td>lmao</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>451</th>\n",
" <td>mamin</td>\n",
" <td>ㅋㅋㅋㅋㅋㅋ</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7977</th>\n",
" <td>Pierah</td>\n",
" <td>🧡💛💙💜❤</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1993</th>\n",
" <td>Kuga Raian</td>\n",
" <td>LMAOOOO😳😳😳</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5752</th>\n",
" <td>BOY tenten</td>\n",
" <td>こんにちはー</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9304</th>\n",
" <td>linkis20</td>\n",
" <td>💙💙💙💙💙</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10540</th>\n",
" <td>MegaSlayer92</td>\n",
" <td>💙💙💙💙💙💙</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5562</th>\n",
" <td>Brandermau</td>\n",
" <td>You're welcome</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8890</th>\n",
" <td>零羅</td>\n",
" <td>かわいい</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8155</th>\n",
" <td>Kronii's Apostle [8th of the Twelve]</td>\n",
" <td>ayy pog moment</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7414</th>\n",
" <td>linkis20</td>\n",
" <td>💙❤🧡💜💛</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7976</th>\n",
" <td>LongtimeNoc-KFP-Nephamily</td>\n",
" <td>Is happy to make one of your dreams finally co...</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10969</th>\n",
" <td>Bishop Johnson</td>\n",
" <td>💙💙💙💙</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7504</th>\n",
" <td>Arthur</td>\n",
" <td>❤🧡💛💙💜</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9343</th>\n",
" <td>Nova Star</td>\n",
" <td>You did great</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>544</th>\n",
" <td>BreakingPhobia103</td>\n",
" <td>FEET�</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11250</th>\n",
" <td>Oliver Chen</td>\n",
" <td>LOOOOOOOOOL</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2791</th>\n",
" <td>Kronii's Apostle [8th of the Twelve]</td>\n",
" <td>is mumei really that beeg?</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9991</th>\n",
" <td>linkis20</td>\n",
" <td>💙💙💙💙💙</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7843</th>\n",
" <td>»pingpong</td>\n",
" <td>💙💛💜❤🧡</td>\n",
" <td>safe</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" authorName \\\n",
"3690 Miguel Islas \n",
"451 mamin \n",
"7977 Pierah \n",
"1993 Kuga Raian \n",
"5752 BOY tenten \n",
"9304 linkis20 \n",
"10540 MegaSlayer92 \n",
"5562 Brandermau \n",
"8890 零羅 \n",
"8155 Kronii's Apostle [8th of the Twelve] \n",
"7414 linkis20 \n",
"7976 LongtimeNoc-KFP-Nephamily \n",
"10969 Bishop Johnson \n",
"7504 Arthur \n",
"9343 Nova Star \n",
"544 BreakingPhobia103 \n",
"11250 Oliver Chen \n",
"2791 Kronii's Apostle [8th of the Twelve] \n",
"9991 linkis20 \n",
"7843 »pingpong \n",
"\n",
" body label \n",
"3690 lmao safe \n",
"451 ㅋㅋㅋㅋㅋㅋ safe \n",
"7977 🧡💛💙💜❤ safe \n",
"1993 LMAOOOO😳😳😳 safe \n",
"5752 こんにちはー safe \n",
"9304 💙💙💙💙💙 safe \n",
"10540 💙💙💙💙💙💙 safe \n",
"5562 You're welcome safe \n",
"8890 かわいい safe \n",
"8155 ayy pog moment safe \n",
"7414 💙❤🧡💜💛 safe \n",
"7976 Is happy to make one of your dreams finally co... safe \n",
"10969 💙💙💙💙 safe \n",
"7504 ❤🧡💛💙💜 safe \n",
"9343 You did great safe \n",
"544 FEET� safe \n",
"11250 LOOOOOOOOOL safe \n",
"2791 is mumei really that beeg? safe \n",
"9991 💙💙💙💙💙 safe \n",
"7843 💙💛💜❤🧡 safe "
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chats[(chats['label'] == 'safe') & (chats['body'].str.len() > 3)].sample(20)"
]
}
],
"metadata": {
"interpreter": {
"hash": "24403c88b9bd347a0b41a9fc3e3175e2948f6e2f45c79c9df260f2a479a817d3"
},
"kernelspec": {
"display_name": "Python 3.8.6 64-bit ('.venv': poetry)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}