{ "cells": [ { "cell_type": "code", "execution_count": 1, "source": [ "from datasets import load_dataset, Features\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n", "import os\n", "from os.path import join\n", "import pandas as pd\n", "from datasets import ClassLabel, Value\n", "\n", "# https://huggingface.co/docs/datasets/loading_datasets.html\n", "\n", "DATASET_DIR = os.environ['DATASET_DIR']" ], "outputs": [], "metadata": {} }, { "cell_type": "markdown", "source": [], "metadata": {} }, { "cell_type": "code", "execution_count": 2, "source": [ "dataset = load_dataset(\"holodata/sensai\", features=Features(\n", " {\n", " \"body\": Value(\"string\"),\n", " \"toxic\": ClassLabel(num_classes=2, names=['0', '1'])\n", " }\n", " ))\n", "dataset = dataset['train']" ], "outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "704789c10f1e44ddbb83262b8a826eec" }, "text/plain": [ " 0%| | 0/1 [00:00, ?it/s]" ] }, "metadata": {} }, { "output_type": "stream", "name": "stderr", "text": [ "Using custom data configuration sensai-4d9ed81389161083\n", "Reusing dataset parquet (/home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a)\n" ] }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "8acdde0c4caa4d4698f97ef29993195b" }, "text/plain": [ " 0%| | 0/1 [00:00, ?it/s]" ] }, "metadata": {} } ], "metadata": {} }, { "cell_type": "code", "execution_count": 3, "source": [ "dataset.features" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'body': Value(dtype='string', id=None),\n", " 'toxic': ClassLabel(num_classes=2, names=['0', '1'], names_file=None, id=None)}" ] }, "metadata": {}, "execution_count": 3 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 4, "source": [ "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels=2)\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")" ], "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 5, "source": [ "samples = dataset.shuffle().select(range(50000))\n", "\n", "def tokenize_function(examples):\n", " return tokenizer(examples[\"body\"], padding=\"max_length\", truncation=True)\n", "\n", "tokenized_datasets = samples.map(tokenize_function, batched=True)\n", "tokenized_datasets.rename_column_(\"toxic\", \"label\")" ], "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Loading cached shuffled indices for dataset at /home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a/cache-24e3dd769ef2f1b7.arrow\n", "Loading cached processed dataset at /home/uetchy/.cache/huggingface/datasets/parquet/sensai-4d9ed81389161083/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a/cache-8395f066c72e57d7.arrow\n", "/tmp/ipykernel_4082765/2982913603.py:7: FutureWarning: rename_column_ is deprecated and will be removed in the next major version of datasets. Use Dataset.rename_column instead.\n", " tokenized_datasets.rename_column_(\"toxic\", \"label\")\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 6, "source": [ "splitset = tokenized_datasets.train_test_split(0.2)\n", "splitset" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['attention_mask', 'body', 'input_ids', 'token_type_ids', 'label'],\n", " num_rows: 40000\n", " })\n", " test: Dataset({\n", " features: ['attention_mask', 'body', 'input_ids', 'token_type_ids', 'label'],\n", " num_rows: 10000\n", " })\n", "})" ] }, "metadata": {}, "execution_count": 6 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 7, "source": [ "training_args = TrainingArguments(\"test_trainer\")\n", "trainer = Trainer(\n", " model=model, args=training_args, train_dataset=splitset['train'], eval_dataset=splitset['test']\n", ")" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 9, "source": [ "trainer.train(resume_from_checkpoint=True)" ], "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Loading model from test_trainer/checkpoint-12500).\n", "The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: body.\n", "***** Running training *****\n", " Num examples = 40000\n", " Num Epochs = 3\n", " Instantaneous batch size per device = 8\n", " Total train batch size (w. 
parallel, distributed & accumulation) = 8\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 15000\n", " Continuing training from checkpoint, will skip to saved global_step\n", " Continuing training from epoch 2\n", " Continuing training from global step 12500\n", " Will skip the first 2 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.\n" ] }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "d749dea90f914485916ce82c65e73fe6" }, "text/plain": [ " 0%| | 0/2500 [00:00, ?it/s]" ] }, "metadata": {} }, { "output_type": "stream", "name": "stderr", "text": [ "Didn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.\n" ] }, { "output_type": "display_data", "data": { "text/html": [ "\n", "
<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: left;\">\n",
"      <th>Step</th>\n",
"      <th>Training Loss</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr><td>13000</td><td>0.687500</td></tr>\n",
"    <tr><td>13500</td><td>0.686300</td></tr>\n",
"    <tr><td>14000</td><td>0.637900</td></tr>\n",
"    <tr><td>14500</td><td>0.643200</td></tr>\n",
"    <tr><td>15000</td><td>0.627700</td></tr>\n",
"  </tbody>\n",
"</table>
"
],
"text/plain": [
"