refactor: add more examples

2026-03-02 08:54:01 +09:00 · 2022-06-03 16:36:32 +09:00
parent 6cd6370a71
commit 4dd186486c
15 changed files with 14961 additions and 1218 deletions
--- a/examples/addChatItemAction.jsonl
+++ b/examples/addChatItemAction.jsonl
--- a/examples/pandas.ipynb
+++ b/examples/pandas.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DATASET_DIR /home/uetchy/repos/src/github.com/holodata/sensai-huggingface\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Standard library first, then third-party.\n",
+    "import os\n",
+    "from glob import glob\n",
+    "from os.path import join\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Directory holding the dataset's parquet files; override via the environment.\n",
+    "DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n",
+    "print(\"DATASET_DIR\", DATASET_DIR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 10677038 entries, 0 to 10677037\n",
+      "Data columns (total 2 columns):\n",
+      " #   Column  Dtype \n",
+      "---  ------  ----- \n",
+      " 0   body    object\n",
+      " 1   label   object\n",
+      "dtypes: object(2)\n",
+      "memory usage: 162.9+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sort the shard paths: glob() returns them in arbitrary filesystem order, which\n",
+    "# would make the concatenated row index differ from one machine to the next.\n",
+    "parquet_paths = sorted(glob(join(DATASET_DIR, '*.parquet')))\n",
+    "assert parquet_paths, f'no .parquet files found in {DATASET_DIR!r}'\n",
+    "\n",
+    "df = pd.concat(\n",
+    "    [pd.read_parquet(path) for path in parquet_paths],\n",
+    "    ignore_index=True)\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>body</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>6229407</th>\n",
+       "      <td>Blessed stream</td>\n",
+       "      <td>hidden</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7406071</th>\n",
+       "      <td>RIP</td>\n",
+       "      <td>nonflagged</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>920434</th>\n",
+       "      <td>cute</td>\n",
+       "      <td>nonflagged</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6146625</th>\n",
+       "      <td>GACHA lets gooo</td>\n",
+       "      <td>hidden</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8259711</th>\n",
+       "      <td>草</td>\n",
+       "      <td>hidden</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    body       label\n",
+       "6229407   Blessed stream      hidden\n",
+       "7406071              RIP  nonflagged\n",
+       "920434              cute  nonflagged\n",
+       "6146625  GACHA lets gooo      hidden\n",
+       "8259711                草      hidden"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.sample(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "24403c88b9bd347a0b41a9fc3e3175e2948f6e2f45c79c9df260f2a479a817d3"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.6 64-bit ('.venv': poetry)",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.6"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/examples/tangram.ipynb
+++ b/examples/tangram.ipynb
--- a/examples/toxic-chat-classification-using-lstm.ipynb
+++ b/examples/toxic-chat-classification-using-lstm.ipynb
--- a/examples/transformers.ipynb
+++ b/examples/transformers.ipynb
@@ -0,0 +1,132 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from datasets import ClassLabel, Features, Value, load_dataset\n",
+    "from transformers import (\n",
+    "    AutoModelForSequenceClassification,\n",
+    "    AutoTokenizer,\n",
+    "    Trainer,\n",
+    "    TrainingArguments,\n",
+    ")\n",
+    "\n",
+    "# Dataset loading reference:\n",
+    "# https://huggingface.co/docs/datasets/loading_datasets.html\n",
+    "\n",
+    "DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n",
+    "print(\"DATASET_DIR\", DATASET_DIR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Explicit schema: a string column plus a two-class label column.\n",
+    "sensai_features = Features({\n",
+    "    \"body\": Value(\"string\"),\n",
+    "    \"toxic\": ClassLabel(num_classes=2, names=['0', '1']),\n",
+    "})\n",
+    "\n",
+    "# Only the train split is used in the rest of the notebook.\n",
+    "dataset = load_dataset(\"holodata/sensai\", features=sensai_features)['train']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One checkpoint name for both objects so the tokenizer always matches the model.\n",
+    "MODEL_NAME = \"bert-base-cased\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SEED = 42\n",
+    "N_SAMPLES = 50_000\n",
+    "\n",
+    "# Seed the shuffle so the same subset is drawn on every run.\n",
+    "samples = dataset.shuffle(seed=SEED).select(range(N_SAMPLES))\n",
+    "\n",
+    "\n",
+    "def tokenize_function(examples):\n",
+    "    \"\"\"Tokenize the \"body\" column of a batch, with max-length padding and truncation.\"\"\"\n",
+    "    return tokenizer(examples[\"body\"], padding=\"max_length\", truncation=True)\n",
+    "\n",
+    "\n",
+    "# rename_column returns a new dataset, so the result must be assigned. The in-place\n",
+    "# rename_column_ used previously is deprecated and no longer exists in datasets 2.x.\n",
+    "tokenized_datasets = samples.map(tokenize_function, batched=True).rename_column(\"toxic\", \"label\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hold out 20% of the tokenized samples for evaluation.\n",
+    "splitset = tokenized_datasets.train_test_split(test_size=0.2)\n",
+    "splitset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Only the output directory is set explicitly; all other arguments keep their defaults.\n",
+    "training_args = TrainingArguments(output_dir=\"test_trainer\")\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=splitset['train'],\n",
+    "    eval_dataset=splitset['test'],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers.trainer_utils import get_last_checkpoint\n",
+    "\n",
+    "# resume_from_checkpoint=True raises ValueError when output_dir holds no checkpoint,\n",
+    "# i.e. on the very first run. Resume only if a previous run left one behind.\n",
+    "output_dir = training_args.output_dir\n",
+    "last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None\n",
+    "trainer.train(resume_from_checkpoint=last_checkpoint)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "c8daecebaf2d81430b8373e4b4af380b12df116248cd1bbadd3fc947f45a1f88"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.6 64-bit",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.6"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}