mirror of
https://github.com/holodata/sensai-dataset.git
synced 2025-03-15 20:10:32 +09:00
354 lines
12 KiB
Plaintext
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "from datasets import load_dataset, Features, ClassLabel, Value\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
    "import os\n",
    "from os.path import join\n",
    "import pandas as pd\n",
    "\n",
    "# https://huggingface.co/docs/datasets/loading_datasets.html\n",
    "\n",
    "# Local dataset path from the environment (not used below; load_dataset pulls from the Hub)\n",
    "DATASET_DIR = os.environ['DATASET_DIR']"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "Fine-tune `bert-base-cased` as a binary toxicity classifier on chat messages from the `holodata/sensai` dataset."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "dataset = load_dataset(\"holodata/sensai\", features=Features(\n",
    "    {\n",
    "        \"body\": Value(\"string\"),\n",
    "        \"toxic\": ClassLabel(num_classes=2, names=['0', '1'])\n",
    "    }\n",
    "))\n",
    "dataset = dataset['train']"
   ],
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "704789c10f1e44ddbb83262b8a826eec"
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Using custom data configuration sensai-4d9ed81389161083\n",
      "Reusing dataset parquet (/home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "8acdde0c4caa4d4698f97ef29993195b"
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {}
    }
   ],
   "metadata": {}
  },
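  {
   "cell_type": "markdown",
   "source": [
    "Before fine-tuning it is worth checking the class balance. The next cell is a minimal illustrative sketch (not part of the original run); it assumes the integer ids in `toxic` follow the declared names, with `1` meaning toxic."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch: inspect the label distribution (assumes 1 = toxic, 0 = non-toxic)\n",
    "label_counts = pd.Series(dataset['toxic']).value_counts()\n",
    "print(label_counts)"
   ],
   "outputs": [],
   "metadata": {}
  },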
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "dataset.features"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'body': Value(dtype='string', id=None),\n",
       " 'toxic': ClassLabel(num_classes=2, names=['0', '1'], names_file=None, id=None)}"
      ]
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "metadata": {}
  },
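  {
   "cell_type": "markdown",
   "source": [
    "`ClassLabel` stores labels as integer ids; `datasets` provides `int2str` and `str2int` to map between ids and the declared names. A small sketch:"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch: convert between class ids and their declared string names\n",
    "toxic_feature = dataset.features['toxic']\n",
    "print(toxic_feature.int2str(1))    # -> '1'\n",
    "print(toxic_feature.str2int('0'))  # -> 0"
   ],
   "outputs": [],
   "metadata": {}
  },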
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels=2)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "metadata": {}
  },
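  {
   "cell_type": "markdown",
   "source": [
    "The warnings above are expected: BERT's pre-training heads are discarded and a fresh classification head (`classifier.weight`, `classifier.bias`) is randomly initialized, so the model must be fine-tuned before its predictions mean anything. A minimal smoke test of the untrained head:"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch: one forward pass through the untrained head; expect logits of shape [1, 2]\n",
    "import torch\n",
    "\n",
    "inputs = tokenizer(\"hello world\", return_tensors=\"pt\")\n",
    "with torch.no_grad():\n",
    "    logits = model(**inputs).logits\n",
    "print(logits.shape)"
   ],
   "outputs": [],
   "metadata": {}
  },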
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "samples = dataset.shuffle().select(range(50000))\n",
    "\n",
    "def tokenize_function(examples):\n",
    "    return tokenizer(examples[\"body\"], padding=\"max_length\", truncation=True)\n",
    "\n",
    "tokenized_datasets = samples.map(tokenize_function, batched=True)\n",
    "tokenized_datasets = tokenized_datasets.rename_column(\"toxic\", \"label\")"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Loading cached shuffled indices for dataset at /home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a/cache-24e3dd769ef2f1b7.arrow\n",
      "Loading cached processed dataset at /home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a/cache-8395f066c72e57d7.arrow\n"
     ]
    }
   ],
   "metadata": {}
  },
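  {
   "cell_type": "markdown",
   "source": [
    "`padding=\"max_length\"` pads every message to the model maximum (512 tokens for BERT), which is wasteful for short chat messages. A minimal sketch of the alternative, dynamic per-batch padding with `DataCollatorWithPadding`:"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch: dynamic per-batch padding instead of padding every example to 512 tokens\n",
    "from transformers import DataCollatorWithPadding\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "def tokenize_unpadded(examples):\n",
    "    # Truncate only; the collator pads each batch to its longest member\n",
    "    return tokenizer(examples[\"body\"], truncation=True)\n",
    "\n",
    "# To use it, map tokenize_unpadded instead of tokenize_function and pass\n",
    "# data_collator=data_collator to Trainer(...)"
   ],
   "outputs": [],
   "metadata": {}
  },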
  {
   "cell_type": "code",
   "execution_count": 6,
   "source": [
    "splitset = tokenized_datasets.train_test_split(0.2)\n",
    "splitset"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['attention_mask', 'body', 'input_ids', 'token_type_ids', 'label'],\n",
       "        num_rows: 40000\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['attention_mask', 'body', 'input_ids', 'token_type_ids', 'label'],\n",
       "        num_rows: 10000\n",
       "    })\n",
       "})"
      ]
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "metadata": {}
  },
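  {
   "cell_type": "markdown",
   "source": [
    "The `Trainer` below reports only training loss. A minimal sketch of an accuracy metric that could be passed to it via `compute_metrics=` (assumes `numpy`, which `transformers` already depends on):"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch: accuracy metric; pass compute_metrics=compute_metrics to Trainer(...)\n",
    "import numpy as np\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    logits, labels = eval_pred\n",
    "    preds = np.argmax(logits, axis=-1)\n",
    "    return {\"accuracy\": float((preds == labels).mean())}"
   ],
   "outputs": [],
   "metadata": {}
  },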
  {
   "cell_type": "code",
   "execution_count": 7,
   "source": [
    "training_args = TrainingArguments(\"test_trainer\")\n",
    "trainer = Trainer(\n",
    "    model=model, args=training_args, train_dataset=splitset['train'], eval_dataset=splitset['test']\n",
    ")"
   ],
   "outputs": [],
   "metadata": {}
  },
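  {
   "cell_type": "markdown",
   "source": [
    "`TrainingArguments(\"test_trainer\")` relies on the defaults (3 epochs, per-device batch size 8, no in-training evaluation), which match the values in the training log below. A sketch of spelling those out explicitly and adding periodic evaluation; the values here are illustrative, not tuned:"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch: explicit hyperparameters with periodic evaluation\n",
    "# (illustrative values; the actual run used the defaults)\n",
    "explicit_args = TrainingArguments(\n",
    "    output_dir=\"test_trainer\",\n",
    "    num_train_epochs=3,\n",
    "    per_device_train_batch_size=8,\n",
    "    evaluation_strategy=\"steps\",  # run eval every eval_steps\n",
    "    eval_steps=500,\n",
    "    save_steps=500,\n",
    "    logging_steps=500,\n",
    ")"
   ],
   "outputs": [],
   "metadata": {}
  },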
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "trainer.train(resume_from_checkpoint=True)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Loading model from test_trainer/checkpoint-12500).\n",
      "The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: body.\n",
      "***** Running training *****\n",
      "  Num examples = 40000\n",
      "  Num Epochs = 3\n",
      "  Instantaneous batch size per device = 8\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 8\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 15000\n",
      "  Continuing training from checkpoint, will skip to saved global_step\n",
      "  Continuing training from epoch 2\n",
      "  Continuing training from global step 12500\n",
      "  Will skip the first 2 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "d749dea90f914485916ce82c65e73fe6"
      },
      "text/plain": [
       "  0%|          | 0/2500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Didn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='15000' max='15000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [15000/15000 31:45, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>13000</td>\n",
       "      <td>0.687500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13500</td>\n",
       "      <td>0.686300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14000</td>\n",
       "      <td>0.637900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14500</td>\n",
       "      <td>0.643200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15000</td>\n",
       "      <td>0.627700</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Saving model checkpoint to test_trainer/checkpoint-13000\n",
      "Configuration saved in test_trainer/checkpoint-13000/config.json\n",
      "Model weights saved in test_trainer/checkpoint-13000/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-13500\n",
      "Configuration saved in test_trainer/checkpoint-13500/config.json\n",
      "Model weights saved in test_trainer/checkpoint-13500/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-14000\n",
      "Configuration saved in test_trainer/checkpoint-14000/config.json\n",
      "Model weights saved in test_trainer/checkpoint-14000/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-14500\n",
      "Configuration saved in test_trainer/checkpoint-14500/config.json\n",
      "Model weights saved in test_trainer/checkpoint-14500/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-15000\n",
      "Configuration saved in test_trainer/checkpoint-15000/config.json\n",
      "Model weights saved in test_trainer/checkpoint-15000/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "TrainOutput(global_step=15000, training_loss=0.10941998901367188, metrics={'train_runtime': 1918.0916, 'train_samples_per_second': 62.562, 'train_steps_per_second': 7.82, 'total_flos': 3.24994775580672e+16, 'train_loss': 0.10941998901367188, 'epoch': 3.0})"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "metadata": {}
  },
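  {
   "cell_type": "markdown",
   "source": [
    "The run ends without scoring the held-out split or trying the classifier on new text. A minimal sketch of both, assuming class id `1` means toxic:"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch: evaluate on the held-out split, then classify a new message\n",
    "import torch\n",
    "\n",
    "metrics = trainer.evaluate()\n",
    "print(metrics)\n",
    "\n",
    "text = \"some new chat message\"\n",
    "inputs = tokenizer(text, truncation=True, return_tensors=\"pt\").to(model.device)\n",
    "with torch.no_grad():\n",
    "    pred_id = model(**inputs).logits.argmax(dim=-1).item()\n",
    "print(\"toxic\" if pred_id == 1 else \"non-toxic\")"
   ],
   "outputs": [],
   "metadata": {}
  },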
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.8.6",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.6 64-bit"
  },
  "interpreter": {
   "hash": "c8daecebaf2d81430b8373e4b4af380b12df116248cd1bbadd3fc947f45a1f88"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}