mirror of
https://github.com/holodata/sensai-dataset.git
synced 2025-08-20 11:18:12 +09:00
refactor: add more examples
This commit is contained in:
11341
examples/addChatItemAction.jsonl
Normal file
11341
examples/addChatItemAction.jsonl
Normal file
File diff suppressed because one or more lines are too long
164
examples/pandas.ipynb
Normal file
164
examples/pandas.ipynb
Normal file
@@ -0,0 +1,164 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"DATASET_DIR /home/uetchy/repos/src/github.com/holodata/sensai-huggingface\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from os.path import join\n",
|
||||
"import os\n",
|
||||
"from glob import glob\n",
|
||||
"\n",
|
||||
"DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n",
|
||||
"print(\"DATASET_DIR\", DATASET_DIR)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 10677038 entries, 0 to 10677037\n",
|
||||
"Data columns (total 2 columns):\n",
|
||||
" # Column Dtype \n",
|
||||
"--- ------ ----- \n",
|
||||
" 0 body object\n",
|
||||
" 1 label object\n",
|
||||
"dtypes: object(2)\n",
|
||||
"memory usage: 162.9+ MB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = pd.concat(\n",
|
||||
" [pd.read_parquet(x) for x in glob(join(DATASET_DIR, '*.parquet'))],\n",
|
||||
" ignore_index=True)\n",
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>body</th>\n",
|
||||
" <th>label</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>6229407</th>\n",
|
||||
" <td>Blessed stream</td>\n",
|
||||
" <td>hidden</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7406071</th>\n",
|
||||
" <td>RIP</td>\n",
|
||||
" <td>nonflagged</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>920434</th>\n",
|
||||
" <td>cute</td>\n",
|
||||
" <td>nonflagged</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6146625</th>\n",
|
||||
" <td>GACHA lets gooo</td>\n",
|
||||
" <td>hidden</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8259711</th>\n",
|
||||
" <td>草</td>\n",
|
||||
" <td>hidden</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" body label\n",
|
||||
"6229407 Blessed stream hidden\n",
|
||||
"7406071 RIP nonflagged\n",
|
||||
"920434 cute nonflagged\n",
|
||||
"6146625 GACHA lets gooo hidden\n",
|
||||
"8259711 草 hidden"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.sample(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "24403c88b9bd347a0b41a9fc3e3175e2948f6e2f45c79c9df260f2a479a817d3"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8.6 64-bit ('.venv': poetry)",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.6"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
1069
examples/tangram.ipynb
Normal file
1069
examples/tangram.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
1
examples/toxic-chat-classification-using-lstm.ipynb
Normal file
1
examples/toxic-chat-classification-using-lstm.ipynb
Normal file
File diff suppressed because one or more lines are too long
132
examples/transformers.ipynb
Normal file
132
examples/transformers.ipynb
Normal file
@@ -0,0 +1,132 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import load_dataset, Features\n",
|
||||
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
|
||||
"import os\n",
|
||||
"from datasets import ClassLabel, Value\n",
|
||||
"\n",
|
||||
"# https://huggingface.co/docs/datasets/loading_datasets.html\n",
|
||||
"\n",
|
||||
"DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n",
|
||||
"print(\"DATASET_DIR\", DATASET_DIR)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = load_dataset(\"holodata/sensai\", features=Features(\n",
|
||||
" {\n",
|
||||
" \"body\": Value(\"string\"),\n",
|
||||
" \"toxic\": ClassLabel(num_classes=2, names=['0', '1'])\n",
|
||||
" }\n",
|
||||
" ))\n",
|
||||
"dataset = dataset['train']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset.features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels=2)\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"samples = dataset.shuffle().select(range(50000))\n",
|
||||
"\n",
|
||||
"def tokenize_function(examples):\n",
|
||||
" return tokenizer(examples[\"body\"], padding=\"max_length\", truncation=True)\n",
|
||||
"\n",
|
||||
"tokenized_datasets = samples.map(tokenize_function, batched=True)\n",
|
||||
"tokenized_datasets.rename_column_(\"toxic\", \"label\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"splitset = tokenized_datasets.train_test_split(0.2)\n",
|
||||
"splitset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_args = TrainingArguments(\"test_trainer\")\n",
|
||||
"trainer = Trainer(\n",
|
||||
" model=model, args=training_args, train_dataset=splitset['train'], eval_dataset=splitset['test']\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"trainer.train(resume_from_checkpoint=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "c8daecebaf2d81430b8373e4b4af380b12df116248cd1bbadd3fc947f45a1f88"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8.6 64-bit",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.6"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Reference in New Issue
Block a user