mirror of https://github.com/holodata/sensai-dataset.git
synced 2025-10-30 15:40:43 +09:00
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "from datasets import load_dataset, Features, ClassLabel, Value\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "# https://huggingface.co/docs/datasets/loading_datasets.html\n",
    "\n",
    "# Not used below: load_dataset pulls \"holodata/sensai\" from the Hugging Face Hub.\n",
    "DATASET_DIR = os.environ['DATASET_DIR']"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "Fine-tune `bert-base-cased` as a binary classifier for toxic live-chat messages, using the `holodata/sensai` dataset."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "dataset = load_dataset(\"holodata/sensai\", features=Features(\n",
    "                {\n",
    "                    \"body\": Value(\"string\"),\n",
    "                    \"toxic\": ClassLabel(num_classes=2, names=['0', '1'])\n",
    "                }\n",
    "            ))\n",
    "dataset = dataset['train']"
   ],
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "704789c10f1e44ddbb83262b8a826eec"
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Using custom data configuration sensai-4d9ed81389161083\n",
      "Reusing dataset parquet (/home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "8acdde0c4caa4d4698f97ef29993195b"
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {}
    }
   ],
   "metadata": {}
  },
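  {
   "cell_type": "markdown",
   "source": [
    "Optional sanity check, not part of the original run: `toxic` is a two-class `ClassLabel`, so counting label values over a slice gives a rough sense of the label balance. The 100,000-row slice size is an arbitrary choice."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Optional check (not in the original notebook): label balance on a slice.\n",
    "# 100_000 rows is an arbitrary sample; the full dataset is much larger.\n",
    "pd.Series(dataset[:100_000]['toxic']).value_counts()"
   ],
   "outputs": [],
   "metadata": {}
  },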
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "dataset.features"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'body': Value(dtype='string', id=None),\n",
       " 'toxic': ClassLabel(num_classes=2, names=['0', '1'], names_file=None, id=None)}"
      ]
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels=2)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "metadata": {}
  },
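  {
   "cell_type": "markdown",
   "source": [
    "Illustration, not part of the original run: tokenize a single made-up message the same way `tokenize_function` in the next cell will, to see what the model is actually fed."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Example only (made-up message): one tokenized input, padded to the\n",
    "# model's maximum length, exactly as produced for every chat message below.\n",
    "enc = tokenizer(\"you are the worst streamer ever\", padding=\"max_length\", truncation=True)\n",
    "print(len(enc['input_ids']))  # 512 for bert-base-cased\n",
    "print(tokenizer.convert_ids_to_tokens(enc['input_ids'])[:10])"
   ],
   "outputs": [],
   "metadata": {}
  },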
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "samples = dataset.shuffle().select(range(50000))\n",
    "\n",
    "def tokenize_function(examples):\n",
    "    return tokenizer(examples[\"body\"], padding=\"max_length\", truncation=True)\n",
    "\n",
    "tokenized_datasets = samples.map(tokenize_function, batched=True)\n",
    "tokenized_datasets = tokenized_datasets.rename_column(\"toxic\", \"label\")"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Loading cached shuffled indices for dataset at /home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a/cache-24e3dd769ef2f1b7.arrow\n",
      "Loading cached processed dataset at /home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a/cache-8395f066c72e57d7.arrow\n"
     ]
    }
   ],
   "metadata": {}
  },
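  {
   "cell_type": "markdown",
   "source": [
    "`Trainer` later warns that the raw `body` column has no matching argument in `BertForSequenceClassification.forward` and ignores it (see the training log below). Optionally it could be dropped explicitly; a sketch, left commented out because the original run kept the column:"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Optional (not done in the original run): drop the raw text column so only\n",
    "# the tokenized fields and the label are handed to Trainer.\n",
    "# tokenized_datasets = tokenized_datasets.remove_columns(['body'])"
   ],
   "outputs": [],
   "metadata": {}
  },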
  {
   "cell_type": "code",
   "execution_count": 6,
   "source": [
    "splitset = tokenized_datasets.train_test_split(0.2)\n",
    "splitset"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['attention_mask', 'body', 'input_ids', 'token_type_ids', 'label'],\n",
       "        num_rows: 40000\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['attention_mask', 'body', 'input_ids', 'token_type_ids', 'label'],\n",
       "        num_rows: 10000\n",
       "    })\n",
       "})"
      ]
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "source": [
    "training_args = TrainingArguments(\"test_trainer\")\n",
    "trainer = Trainer(\n",
    "    model=model, args=training_args, train_dataset=splitset['train'], eval_dataset=splitset['test']\n",
    ")"
   ],
   "outputs": [],
   "metadata": {}
  },
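  {
   "cell_type": "markdown",
   "source": [
    "The run above uses the default `TrainingArguments` (3 epochs, per-device batch size 8, a checkpoint every 500 steps under `test_trainer/`, as the training log below confirms) and no evaluation metric. If accuracy were wanted, a minimal sketch is a `compute_metrics` callback; this was not part of the original run."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch (not used in the original run): an accuracy callback for Trainer.\n",
    "import numpy as np\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    logits, labels = eval_pred\n",
    "    preds = np.argmax(logits, axis=-1)\n",
    "    return {\"accuracy\": (preds == labels).mean()}\n",
    "\n",
    "# Pass compute_metrics=compute_metrics to Trainer(...) above, then call\n",
    "# trainer.evaluate() after training to score the test split."
   ],
   "outputs": [],
   "metadata": {}
  },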
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "trainer.train(resume_from_checkpoint=True)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Loading model from test_trainer/checkpoint-12500.\n",
      "The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: body.\n",
      "***** Running training *****\n",
      "  Num examples = 40000\n",
      "  Num Epochs = 3\n",
      "  Instantaneous batch size per device = 8\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 8\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 15000\n",
      "  Continuing training from checkpoint, will skip to saved global_step\n",
      "  Continuing training from epoch 2\n",
      "  Continuing training from global step 12500\n",
      "  Will skip the first 2 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "d749dea90f914485916ce82c65e73fe6"
      },
      "text/plain": [
       "  0%|          | 0/2500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Didn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='15000' max='15000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [15000/15000 31:45, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>13000</td>\n",
       "      <td>0.687500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13500</td>\n",
       "      <td>0.686300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14000</td>\n",
       "      <td>0.637900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14500</td>\n",
       "      <td>0.643200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15000</td>\n",
       "      <td>0.627700</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Saving model checkpoint to test_trainer/checkpoint-13000\n",
      "Configuration saved in test_trainer/checkpoint-13000/config.json\n",
      "Model weights saved in test_trainer/checkpoint-13000/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-13500\n",
      "Configuration saved in test_trainer/checkpoint-13500/config.json\n",
      "Model weights saved in test_trainer/checkpoint-13500/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-14000\n",
      "Configuration saved in test_trainer/checkpoint-14000/config.json\n",
      "Model weights saved in test_trainer/checkpoint-14000/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-14500\n",
      "Configuration saved in test_trainer/checkpoint-14500/config.json\n",
      "Model weights saved in test_trainer/checkpoint-14500/pytorch_model.bin\n",
      "Saving model checkpoint to test_trainer/checkpoint-15000\n",
      "Configuration saved in test_trainer/checkpoint-15000/config.json\n",
      "Model weights saved in test_trainer/checkpoint-15000/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "TrainOutput(global_step=15000, training_loss=0.10941998901367188, metrics={'train_runtime': 1918.0916, 'train_samples_per_second': 62.562, 'train_steps_per_second': 7.82, 'total_flos': 3.24994775580672e+16, 'train_loss': 0.10941998901367188, 'epoch': 3.0})"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "metadata": {}
  },
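  {
   "cell_type": "markdown",
   "source": [
    "The notebook stops after training. A minimal inference sketch, not part of the original run (the message is made up): score a single chat message with the fine-tuned model. Index 1 of the softmax corresponds to the `'1'` (toxic) class defined above."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sketch (not in the original run): classify one made-up message.\n",
    "import torch\n",
    "\n",
    "inputs = tokenizer(\"you are the worst streamer ever\", truncation=True, return_tensors=\"pt\")\n",
    "inputs = {k: v.to(model.device) for k, v in inputs.items()}\n",
    "with torch.no_grad():\n",
    "    logits = model(**inputs).logits\n",
    "print(logits.softmax(dim=-1))  # [[P(label 0), P(label 1 = toxic)]]"
   ],
   "outputs": [],
   "metadata": {}
  },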
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.8.6",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.6 64-bit"
  },
  "interpreter": {
   "hash": "c8daecebaf2d81430b8373e4b4af380b12df116248cd1bbadd3fc947f45a1f88"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}