Mirror of https://github.com/holodata/sensai-dataset.git (synced 2025-03-15 20:10:32 +09:00)
In [1]:
from datasets import load_dataset, Features, ClassLabel, Value
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import os

# https://huggingface.co/docs/datasets/loading_datasets.html
DATASET_DIR = os.environ['DATASET_DIR']  # local dataset location (not used below; load_dataset pulls from the Hub)
In [2]:
dataset = load_dataset("holodata/sensai", features=Features(
{
"body": Value("string"),
"toxic": ClassLabel(num_classes=2, names=['0', '1'])
}
))
dataset = dataset['train']
In [3]:
dataset.features
Out[3]:
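Expected output (it mirrors the Features passed to load_dataset above), roughly:
{'body': Value(dtype='string', id=None), 'toxic': ClassLabel(num_classes=2, names=['0', '1'], id=None)}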
In [4]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
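Loading bert-base-cased with num_labels=2 attaches a freshly initialized classification head, so transformers warns about newly initialized weights; that is expected before fine-tuning. A quick illustrative check of what the tokenizer emits (the sample string is arbitrary):

# Illustrative only: BERT tokenizers return input_ids, token_type_ids, attention_mask.
enc = tokenizer("sample chat message", padding="max_length", truncation=True)
print(list(enc.keys()))       # ['input_ids', 'token_type_ids', 'attention_mask']
print(len(enc["input_ids"]))  # 512: padding="max_length" pads to the model's max length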
In [5]:
# Work with a random 50k-message subset to keep training time manageable.
samples = dataset.shuffle().select(range(50000))

def tokenize_function(examples):
    return tokenizer(examples["body"], padding="max_length", truncation=True)

tokenized_datasets = samples.map(tokenize_function, batched=True)
# Trainer expects the target column to be named "label"; the in-place
# rename_column_ was removed from datasets, so reassign instead.
tokenized_datasets = tokenized_datasets.rename_column("toxic", "label")
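Note that padding="max_length" pads every message to 512 tokens, which is wasteful for short chat messages. An alternative sketch, not what this notebook does, is per-batch dynamic padding with transformers' DataCollatorWithPadding:

# Sketch of dynamic padding (assumption: not used in the original run).
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize without padding, then pass data_collator=data_collator to the Trainer,
# which pads each batch only to its longest member.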
In [6]:
splitset = tokenized_datasets.train_test_split(test_size=0.2)
splitset
Out[6]:
In [7]:
training_args = TrainingArguments("test_trainer")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splitset['train'],
    eval_dataset=splitset['test'],
)
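As configured, the Trainer reports only the loss. A sketch of wiring in accuracy through the compute_metrics hook (the helper below is illustrative, not part of the original notebook):

import numpy as np

def compute_metrics(eval_pred):
    # The Trainer supplies eval_pred as a (logits, labels) pair.
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == labels).mean())}

# Passing compute_metrics=compute_metrics to Trainer(...) would then log
# accuracy at each evaluation.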
In [9]:
# resume_from_checkpoint=True picks up the latest checkpoint under the
# "test_trainer" output dir; on a first run, call trainer.train() without it.
trainer.train(resume_from_checkpoint=True)
Out[9]:
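After training, the fine-tuned model and tokenizer would usually be persisted; a minimal sketch with a hypothetical output path:

# "sensai-bert-toxic" is a hypothetical path, not one used in the original notebook.
trainer.save_model("sensai-bert-toxic")
tokenizer.save_pretrained("sensai-bert-toxic")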
In [ ]: