Mirror of https://github.com/holodata/sensai-dataset.git (synced 2025-03-15 12:00:32 +09:00)
feat: add base
Commit e241a46bfc
.gitattributes (vendored, new file)
@@ -0,0 +1 @@
notebooks/*.ipynb linguist-vendored
.gitignore (vendored, new file)
@@ -0,0 +1,149 @@
.envrc
.env
.vscode
/tmp

# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# End of https://www.toptal.com/developers/gitignore/api/python
.prettierrc (new file)
@@ -0,0 +1 @@
{}
CONTRIBUTING.md (new file)
@@ -0,0 +1,13 @@
# Contribution Guide

## Generate dataset

```bash
python3 -m sensai_gen.cli -m 'chats_2021-*'
```

## Upload new version of dataset (Maintainers only)

```bash
make upload
```
Makefile (new file)
@@ -0,0 +1,7 @@
all: build upload

build:
	python3 -m sensai_gen.cli

upload:
	kaggle datasets version -d -m "New version" --path $$DATASET_DIR
README.md (new file)
@@ -0,0 +1,111 @@
# ❤️🩹 Sensai: Toxic Chat Dataset

Sensai is a dataset of live chats from across Virtual YouTubers' live streams, ready for training toxic chat classification models.

Download the dataset from [Kaggle Datasets](https://www.kaggle.com/uetchy/sensai) and join the `#livechat-dataset` channel on [holodata Discord](https://holodata.org/discord) for discussions.

## Provenance

- **Source:** YouTube Live Chat events (all streams covered by [Holodex](https://holodex.net), including Hololive, Nijisanji, 774inc, etc.)
- **Temporal Coverage:** From 2021-01-15T05:15:33Z
- **Update Frequency:** At least once per month

## Research Ideas

- Toxic Chat Classification
- Spam Detection
- Sentence Transformer for Live Chats

See [public notebooks](https://www.kaggle.com/uetchy/sensai/code) for ideas.

## Files

| filename                  | summary                                                        | size     |
| ------------------------- | -------------------------------------------------------------- | -------- |
| `chats_flagged_%Y-%m.csv` | Chats flagged as either deleted or banned by mods (3,100,000+) | ~ 400 MB |
| `chats_nonflag_%Y-%m.csv` | Non-flagged chats (3,000,000+)                                 | ~ 300 MB |

To keep the dataset balanced, `chats_nonflag` is randomly down-sampled so that it contains the same number of chats as `chats_flagged`; a sketch of this step follows.
Bans and deletions correspond to the `markChatItemsByAuthorAsDeletedAction` and `markChatItemAsDeletedAction` moderation events, respectively.
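A minimal sketch of that balancing step, mirroring what `sensai_gen/cli.py` in this commit does (`deleted` and `banned` are the boolean flags attached during generation):

```python
import pandas as pd

def balance(chats: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    # Rows removed by moderation (deleted or banned) form the positive class.
    flagged = chats[chats['deleted'] | chats['banned']].copy()
    # Randomly sample exactly as many clean rows, yielding a 1:1 split.
    nonflag = chats[~(chats['deleted'] | chats['banned'])].sample(len(flagged))
    return flagged, nonflag
```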
## Dataset Breakdown

### Chats (`chats_%Y-%m.csv`)

| column          | type   | description                  |
| --------------- | ------ | ---------------------------- |
| timestamp       | string | UTC timestamp                |
| body            | string | chat message                 |
| membership      | string | membership status            |
| id              | string | anonymized chat id           |
| authorChannelId | string | anonymized author channel id |
| videoId         | string | source video id              |
| channelId       | string | source channel id            |

#### Membership status

| value             | duration                  |
| ----------------- | ------------------------- |
| unknown           | Indistinguishable         |
| non-member        | 0                         |
| less than 1 month | < 1 month                 |
| 1 month           | >= 1 month, < 2 months    |
| 2 months          | >= 2 months, < 6 months   |
| 6 months          | >= 6 months, < 12 months  |
| 1 year            | >= 12 months, < 24 months |
| 2 years           | >= 24 months              |
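For modeling, the `membership` column can be encoded as an ordered categorical using the levels above. A minimal sketch, assuming the value strings match the table exactly (`unknown` is left out since its duration is indistinguishable):

```python
import pandas as pd

MEMBERSHIP_LEVELS = [
    'non-member', 'less than 1 month', '1 month',
    '2 months', '6 months', '1 year', '2 years',
]
# An ordered categorical makes comparisons like `membership >= '1 month'` work.
chats['membership'] = pd.Categorical(chats['membership'],
                                     categories=MEMBERSHIP_LEVELS,
                                     ordered=True)
```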
#### Pandas usage

Set `keep_default_na` to `False` and `na_values` to `''` in `read_csv`. Otherwise, chat messages like `NA` would incorrectly be treated as NaN values.

```python
chats = pd.read_csv('../input/vtuber-livechat/chats_2021-03.csv',
                    na_values='',
                    keep_default_na=False,
                    index_col='timestamp',
                    parse_dates=True)
```
### Channels (`channels.csv`)

| column            | type            | description            |
| ----------------- | --------------- | ---------------------- |
| channelId         | string          | channel id             |
| name              | string          | channel name           |
| englishName       | nullable string | channel name (English) |
| affiliation       | string          | channel affiliation    |
| group             | nullable string | group                  |
| subscriptionCount | number          | subscription count     |
| videoCount        | number          | uploads count          |
| photo             | string          | channel icon           |

Inactive channels have `INACTIVE` in the `group` column.

## Consideration

### Anonymization

`id` and `channelId` are anonymized using the SHA-1 hashing algorithm with a pinch of undisclosed salt.
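For illustration only, salted hashing along these lines; the real salt and the exact concatenation scheme are undisclosed, so this is an assumption, not the actual procedure:

```python
import hashlib

def anonymize(value: str, salt: str) -> str:
    # Hypothetical: prepend an undisclosed salt, then take the SHA-1 hex digest.
    return hashlib.sha1((salt + value).encode('utf-8')).hexdigest()
```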
### Handling Custom Emojis

All custom emojis are replaced with the Unicode replacement character `U+FFFD`.
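Chats containing custom emojis can therefore be located by searching for that character. A small sketch, assuming `chats` is loaded as in the Pandas usage example above:

```python
# '\ufffd' is U+FFFD, the stand-in for custom emojis.
has_custom_emoji = chats['body'].str.contains('\ufffd', na=False)
print(f'{has_custom_emoji.mean():.1%} of chats contain custom emojis')
```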
## Citation

```latex
@misc{sensai-dataset,
 author={Yasuaki Uechi},
 title={Sensai: Large Scale Virtual YouTubers Live Chat Dataset},
 year={2021},
 month={8},
 version={31},
 url={https://github.com/holodata/sensai-dataset}
}
```

## License

- Code: [MIT License](https://github.com/holodata/sensai-dataset/blob/master/LICENSE)
- Dataset: [ODC Public Domain Dedication and Licence (PDDL)](https://opendatacommons.org/licenses/pddl/1-0/index.html)
poetry.lock (generated, new file, 2105 lines)
File diff suppressed because it is too large.
pyproject.toml (new file)
@@ -0,0 +1,25 @@
[tool.poetry]
name = "sensai"
version = "1.0.0"
description = "Toxic Live Chat Dataset"
authors = ["Yasuaki Uechi <y@uechi.io>"]

[tool.poetry.dependencies]
python = "^3.9"
kaggle = "^1.5.12"
numpy = "^1.21.0"
pandas = "^1.2.3"
pymongo = "^3.11.3"
python-dateutil = "^2.8.1"
altair = "^4.1.0"
matplotlib = "^3.4.2"
streamlit = "^0.87.0"
plotly = "^5.0.0"

[tool.poetry.dev-dependencies]
ipykernel = "^6.2.0"
yapf = "^0.31.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
sensai/__init__.py (new file, empty)

sensai_gen/__init__.py (new file, empty)
sensai_gen/cli.py (new file)
@@ -0,0 +1,123 @@
import argparse
import gc
import shutil
from glob import iglob
from os.path import basename, join, splitext

import pandas as pd

from sensai_gen.constants import DATASET_DIR, DATASET_SOURCE_DIR


def load_channels(**kwargs):
    # Categorical dtypes keep memory usage down for highly repetitive columns.
    dtype_dict = {
        'channelId': 'category',
        'name': 'category',
        'englishName': 'category',
        'affiliation': 'category',
        'group': 'category',
        'subscriptionCount': 'int32',
        'videoCount': 'int32',
        'photo': 'category'
    }
    channels = pd.read_csv(join(DATASET_SOURCE_DIR, 'channels.csv'),
                           dtype=dtype_dict,
                           **kwargs)
    return channels


def generate_dataset(matcher):
    print('[generate_sensai_dataset]')

    # Deletion events: keep only actual moderator deletions (retracted == 0),
    # then mark them for the later merge.
    del_path = join(DATASET_SOURCE_DIR, 'deletion_events.csv')
    del_events = pd.read_csv(del_path, usecols=['id', 'retracted'])
    del_events = del_events.query('retracted == 0').copy()
    del_events.drop(columns=['retracted'], inplace=True)
    del_events['deleted'] = True

    # Ban events: an (authorChannelId, videoId) pair flags every chat
    # the author posted in that stream.
    ban_path = join(DATASET_SOURCE_DIR, 'ban_events.csv')
    ban_events = pd.read_csv(ban_path, usecols=['authorChannelId', 'videoId'])
    ban_events['banned'] = True

    for f in sorted(iglob(join(DATASET_SOURCE_DIR, matcher))):
        # e.g. 'chats_2021-03.csv' -> '2021-03'
        period_string = splitext(basename(f))[0].split('_')[1]
        print('>>> Period:', period_string)

        columns_to_use = [
            'body',
            'authorChannelId',
            'channelId',
            'membership',
            'id',
            'videoId',
        ]
        columns_to_delete = [
            'id',
            'videoId',
            'deleted',
            'banned',
        ]

        # load chats
        print('>>> Loading chats')
        chat_path = join(DATASET_SOURCE_DIR, 'chats_' + period_string + '.csv')
        chat_dtype = {
            'authorChannelId': 'category',
            'membership': 'category',
            'videoId': 'category',
            'channelId': 'category'
        }
        chats = pd.read_csv(chat_path, dtype=chat_dtype, usecols=columns_to_use)

        # apply deletion events
        print('>>> Merging deletion')
        chats = pd.merge(chats, del_events, on='id', how='left')
        chats['deleted'].fillna(False, inplace=True)

        # apply ban events
        print('>>> Merging bans')
        chats = pd.merge(chats,
                         ban_events,
                         on=['authorChannelId', 'videoId'],
                         how='left')
        chats['banned'].fillna(False, inplace=True)

        flagged = chats[(chats['deleted'] | chats['banned'])].copy()

        # to make a balanced dataset, sample exactly as many non-flagged
        # chats as there are flagged ones
        nb_flagged = flagged.shape[0]
        if nb_flagged == 0:
            continue

        print('>>> Sampling nonflagged chats')
        print('nbFlagged', nb_flagged)
        nonflag = chats[~(chats['deleted'] | chats['banned'])].sample(nb_flagged)

        print('>>> Writing dataset')
        flagged.drop(columns=columns_to_delete, inplace=True)
        flagged.to_csv(join(DATASET_DIR, f'chats_flagged_{period_string}.csv'),
                       index=False)
        nonflag.drop(columns=columns_to_delete, inplace=True)
        nonflag.to_csv(join(DATASET_DIR, f'chats_nonflag_{period_string}.csv'),
                       index=False)

        # free up memory
        del nonflag
        del flagged
        del chats
        gc.collect()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='dataset generator')
    parser.add_argument('-m', '--matcher', type=str, default='chats_*.csv')
    args = parser.parse_args()

    print('target: ' + DATASET_DIR)
    print('source: ' + DATASET_SOURCE_DIR)

    shutil.copy(join(DATASET_SOURCE_DIR, 'channels.csv'), DATASET_DIR)

    generate_dataset(matcher=args.matcher)
sensai_gen/constants.py (new file)
@@ -0,0 +1,7 @@
import os

DATASET_DIR = os.environ['DATASET_DIR']
DATASET_SOURCE_DIR = os.environ['DATASET_SOURCE_DIR']

os.makedirs(DATASET_DIR, exist_ok=True)
os.makedirs(DATASET_SOURCE_DIR, exist_ok=True)