feat: add base

uetchy 2021-09-01 12:11:38 +09:00
commit e241a46bfc
12 changed files with 2542 additions and 0 deletions

1
.gitattributes vendored Normal file

@@ -0,0 +1 @@
notebooks/*.ipynb linguist-vendored

149
.gitignore vendored Normal file

@@ -0,0 +1,149 @@
.envrc
.env
.vscode
/tmp
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# End of https://www.toptal.com/developers/gitignore/api/python

1
.prettierrc Normal file

@@ -0,0 +1 @@
{}

13
CONTRIBUTING.md Normal file

@@ -0,0 +1,13 @@
# Contribution Guide

## Generate dataset

```bash
python3 -m sensai_gen.cli -m 'chats_2021-*'
```

## Upload new version of dataset (Maintainers only)

```bash
make upload
```

7
Makefile Normal file

@@ -0,0 +1,7 @@
all: build upload

build:
	python3 -m sensai_gen.cli

upload:
	kaggle datasets version -d -m "New version" --path $$DATASET_DIR

111
README.md Normal file

@@ -0,0 +1,111 @@
# ❤️‍🩹 Sensai: Toxic Chat Dataset
Sensai is a dataset of live chat messages collected from Virtual YouTubers' live streams, ready for training toxic chat classification models.
Download the dataset from [Kaggle Datasets](https://www.kaggle.com/uetchy/sensai) and join the `#livechat-dataset` channel on the [holodata Discord](https://holodata.org/discord) for discussions.
## Provenance
- **Source:** YouTube Live Chat events (all streams covered by [Holodex](https://holodex.net), including Hololive, Nijisanji, 774inc, etc.)
- **Temporal Coverage:** From 2021-01-15T05:15:33Z
- **Update Frequency:** At least once per month
## Research Ideas
- Toxic Chat Classification
- Spam Detection
- Sentence Transformer for Live Chats
See [public notebooks](https://www.kaggle.com/uetchy/sensai/code) for ideas.
## Files
| filename | summary | size |
| ------------------------- | -------------------------------------------------------------- | -------- |
| `chats_flagged_%Y-%m.csv` | Chats flagged as either deleted or banned by mods (3,100,000+) | ~ 400 MB |
| `chats_nonflag_%Y-%m.csv` | Non-flagged chats (3,000,000+) | ~ 300 MB |
To keep the dataset balanced, `chats_nonflag` is randomly down-sampled so that it contains the same number of chats as `chats_flagged`.
Bans and deletions correspond to the `markChatItemsByAuthorAsDeletedAction` and `markChatItemAsDeletedAction` moderation events, respectively.
## Dataset Breakdown
### Chats (`chats_flagged_%Y-%m.csv`, `chats_nonflag_%Y-%m.csv`)
| column | type | description |
| --------------- | ------ | ---------------------------- |
| timestamp | string | UTC timestamp |
| body | string | chat message |
| membership | string | membership status |
| id | string | anonymized chat id |
| authorChannelId | string | anonymized author channel id |
| videoId | string | source video id |
| channelId | string | source channel id |
#### Membership status
| value | duration |
| ----------------- | ------------------------- |
| unknown | Indistinguishable |
| non-member | 0 |
| less than 1 month | < 1 month |
| 1 month | >= 1 month, < 2 months |
| 2 months | >= 2 months, < 6 months |
| 6 months | >= 6 months, < 12 months |
| 1 year | >= 12 months, < 24 months |
| 2 years | >= 24 months |
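When treating membership as an ordinal feature, it can help to map these values onto an ordered pandas categorical. A minimal sketch, assuming the `membership` column uses exactly the values listed above (`MEMBERSHIP_ORDER` and `to_ordered_membership` are illustrative names, not part of the dataset):

```python
import pandas as pd

# Membership tiers in ascending order of duration; 'unknown' is placed first
# purely by convention since its duration is indistinguishable.
MEMBERSHIP_ORDER = [
    'unknown', 'non-member', 'less than 1 month', '1 month',
    '2 months', '6 months', '1 year', '2 years'
]

def to_ordered_membership(series: pd.Series) -> pd.Series:
    """Convert raw membership strings into an ordered categorical."""
    return series.astype(
        pd.CategoricalDtype(categories=MEMBERSHIP_ORDER, ordered=True))
```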
#### Pandas usage
Set `keep_default_na` to `False` and `na_values` to `''` in `read_csv`. Otherwise, chat messages like `NA` would incorrectly be treated as NaN values.

```python
chats = pd.read_csv('../input/sensai/chats_flagged_2021-03.csv',
                    na_values='',
                    keep_default_na=False,
                    index_col='timestamp',
                    parse_dates=True)
```
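To train a classifier, you typically need one labeled frame combining both files. A minimal sketch, assuming the file names from the table above and the same Kaggle-style input directory; the `toxic` column is an illustrative label added here, not one shipped with the dataset:

```python
import pandas as pd

read_opts = dict(na_values='', keep_default_na=False)

# Paths assume the Kaggle input layout; adjust for a local download.
flagged = pd.read_csv('../input/sensai/chats_flagged_2021-03.csv', **read_opts)
nonflag = pd.read_csv('../input/sensai/chats_nonflag_2021-03.csv', **read_opts)

# Illustrative binary label: 1 = flagged (deleted/banned), 0 = non-flagged.
flagged['toxic'] = 1
nonflag['toxic'] = 0

# Shuffle so flagged and non-flagged rows are interleaved.
train = pd.concat([flagged, nonflag], ignore_index=True)
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
```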
### Channels (`channels.csv`)
| column | type | description |
| ----------------- | --------------- | ---------------------- |
| channelId | string | channel id |
| name | string | channel name |
| englishName | nullable string | channel name (English) |
| affiliation | string | channel affiliation |
| group | nullable string | group |
| subscriptionCount | number | subscription count |
| videoCount | number | uploads count |
| photo | string | channel icon |
Inactive channels are marked with `INACTIVE` in the `group` column.
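For example, a quick way to load `channels.csv` and drop inactive channels (the path is a Kaggle-style placeholder):

```python
import pandas as pd

channels = pd.read_csv('../input/sensai/channels.csv')

# Rows whose `group` is exactly 'INACTIVE' mark channels that are no longer active.
active_channels = channels[channels['group'] != 'INACTIVE']
```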
## Considerations
### Anonymization
`id` and `authorChannelId` are anonymized with the SHA-1 hashing algorithm and a pinch of undisclosed salt.
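The salt is not published, so the hashes cannot be reversed or recomputed. The sketch below only illustrates the general shape of such a scheme; the salt value and the exact concatenation order are assumptions:

```python
import hashlib

SALT = b'not-the-real-salt'  # placeholder; the actual salt is undisclosed

def anonymize(value: str) -> str:
    """One possible salted SHA-1 construction (illustrative only)."""
    return hashlib.sha1(SALT + value.encode('utf-8')).hexdigest()
```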
### Handling Custom Emojis
All custom emojis are replaced with the Unicode replacement character `U+FFFD`.
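If the placeholders get in the way of tokenization, they are easy to count or strip. A minimal sketch, assuming a `chats` frame loaded as in the Pandas example above:

```python
# Count and strip the U+FFFD placeholders left by removed custom emojis.
emoji_counts = chats['body'].str.count('\ufffd')
chats['body_clean'] = chats['body'].str.replace('\ufffd', '', regex=False)
```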
## Citation
```latex
@misc{sensai-dataset,
  author={Yasuaki Uechi},
  title={Sensai: Large Scale Virtual YouTubers Live Chat Dataset},
  year={2021},
  month={8},
  version={31},
  url={https://github.com/holodata/sensai-dataset}
}
```
## License
- Code: [MIT License](https://github.com/holodata/sensai-dataset/blob/master/LICENSE)
- Dataset: [ODC Public Domain Dedication and Licence (PDDL)](https://opendatacommons.org/licenses/pddl/1-0/index.html)

2105
poetry.lock generated Normal file

File diff suppressed because it is too large

25
pyproject.toml Normal file

@@ -0,0 +1,25 @@
[tool.poetry]
name = "sensai"
version = "1.0.0"
description = "Toxic Live Chat Dataset"
authors = ["Yasuaki Uechi <y@uechi.io>"]

[tool.poetry.dependencies]
python = "^3.9"
kaggle = "^1.5.12"
numpy = "^1.21.0"
pandas = "^1.2.3"
pymongo = "^3.11.3"
python-dateutil = "^2.8.1"
altair = "^4.1.0"
matplotlib = "^3.4.2"
streamlit = "^0.87.0"
plotly = "^5.0.0"

[tool.poetry.dev-dependencies]
ipykernel = "^6.2.0"
yapf = "^0.31.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

0
sensai/__init__.py Normal file

0
sensai_gen/__init__.py Normal file

123
sensai_gen/cli.py Normal file

@@ -0,0 +1,123 @@
import gc
from glob import iglob
import argparse
import shutil
from os.path import basename, join, splitext

import numpy as np
import pandas as pd

from sensai_gen.constants import DATASET_DIR, DATASET_SOURCE_DIR


def load_channels(**kwargs):
    dtype_dict = {
        'channelId': 'category',
        'name': 'category',
        'englishName': 'category',
        'affiliation': 'category',
        'group': 'category',
        'subscriptionCount': 'int32',
        'videoCount': 'int32',
        'photo': 'category'
    }
    channels = pd.read_csv(join(DATASET_SOURCE_DIR, 'channels.csv'),
                           dtype=dtype_dict,
                           **kwargs)
    return channels


def generate_dataset(matcher):
    print('[generate_sensai_dataset]')

    # load deletion events (ignore retracted deletions)
    delet_path = join(DATASET_SOURCE_DIR, 'deletion_events.csv')
    del_events = pd.read_csv(delet_path, usecols=['id', 'retracted'])
    del_events = del_events.query('retracted == 0').copy()
    del_events.drop(columns=['retracted'], inplace=True)
    del_events['deleted'] = True

    # load ban events
    ban_path = join(DATASET_SOURCE_DIR, 'ban_events.csv')
    ban_events = pd.read_csv(ban_path, usecols=['authorChannelId', 'videoId'])
    ban_events['banned'] = True

    for f in sorted(iglob(join(DATASET_SOURCE_DIR, matcher))):
        period_string = splitext(basename(f))[0].split('_')[1]
        print('>>> Period:', period_string)

        columns_to_use = [
            'body',
            'authorChannelId',
            'channelId',
            'membership',
            'id',
            'videoId',
        ]
        columns_to_delete = [
            'id',
            'videoId',
            'deleted',
            'banned',
        ]

        # load chats
        print('>>> Loading chats')
        chat_path = join(DATASET_SOURCE_DIR, 'chats_' + period_string + '.csv')
        chat_dtype = {
            'authorChannelId': 'category',
            'membership': 'category',
            'videoId': 'category',
            'channelId': 'category'
        }
        chats = pd.read_csv(chat_path, dtype=chat_dtype, usecols=columns_to_use)

        # apply deletion events
        print('>>> Merging deletion')
        chats = pd.merge(chats, del_events, on='id', how='left')
        chats['deleted'].fillna(False, inplace=True)

        # apply ban events
        print('>>> Merging bans')
        chats = pd.merge(chats,
                         ban_events,
                         on=['authorChannelId', 'videoId'],
                         how='left')
        chats['banned'].fillna(False, inplace=True)

        flagged = chats[(chats['deleted'] | chats['banned'])].copy()

        # sample the same number of non-flagged chats to keep the dataset balanced
        nbFlagged = flagged.shape[0]
        if nbFlagged == 0:
            continue

        print('>>> Sampling nonflagged chats')
        print('nbFlagged', nbFlagged)
        nonflag = chats[~(chats['deleted'] | chats['banned'])].sample(nbFlagged)

        print('>>> Writing dataset')
        flagged.drop(columns=columns_to_delete, inplace=True)
        flagged.to_csv(join(DATASET_DIR, f'chats_flagged_{period_string}.csv'),
                       index=False)
        nonflag.drop(columns=columns_to_delete, inplace=True)
        nonflag.to_csv(join(DATASET_DIR, f'chats_nonflag_{period_string}.csv'),
                       index=False)

        # free up memory
        del nonflag
        del flagged
        del chats
        gc.collect()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='dataset generator')
    parser.add_argument('-m', '--matcher', type=str, default='chats_*.csv')
    args = parser.parse_args()

    print('target: ' + DATASET_DIR)
    print('source: ' + DATASET_SOURCE_DIR)

    shutil.copy(join(DATASET_SOURCE_DIR, 'channels.csv'), DATASET_DIR)
    generate_dataset(matcher=args.matcher)

7
sensai_gen/constants.py Normal file

@@ -0,0 +1,7 @@
import os

DATASET_DIR = os.environ['DATASET_DIR']
DATASET_SOURCE_DIR = os.environ['DATASET_SOURCE_DIR']

os.makedirs(DATASET_DIR, exist_ok=True)
os.makedirs(DATASET_SOURCE_DIR, exist_ok=True)
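Both variables must be set before `sensai_gen.constants` is imported. For a quick local run of the generator, a minimal sketch (the paths are placeholders):

```python
import os

# Placeholder locations; point these at your local source dump and output dir.
os.environ.setdefault('DATASET_SOURCE_DIR', '/data/sensai-source')
os.environ.setdefault('DATASET_DIR', '/data/sensai-out')

from sensai_gen.cli import generate_dataset  # noqa: E402

generate_dataset(matcher='chats_2021-*.csv')
```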