From 6cd6370a71cac55902fa64967ac6c98ec975e737 Mon Sep 17 00:00:00 2001 From: Yasuaki Uechi Date: Mon, 1 Nov 2021 14:21:38 +0900 Subject: [PATCH] fix: add label --- README.md | 12 ++++++------ .../generator/{__init__.py => __main__.py} | 0 sensai_dataset/generator/commands.py | 13 +++++++------ 3 files changed, 13 insertions(+), 12 deletions(-) rename sensai_dataset/generator/{__init__.py => __main__.py} (100%) diff --git a/README.md b/README.md index 61b2436..e827ccc 100644 --- a/README.md +++ b/README.md @@ -34,12 +34,12 @@ Ban and deletion are equivalent to `markChatItemsByAuthorAsDeletedAction` and `m ### Chats (`chats_%Y-%m.csv`) -| column | type | description | -| --------------- | ------ | ---------------------------- | -| body | string | chat message | -| authorChannelId | string | anonymized author channel id | -| channelId | string | source channel id | -| label | enum | toxic,spam,safe | +| column | type | description | +| --------------- | ------ | ----------------------------- | +| body | string | chat message | +| authorChannelId | string | anonymized author channel id | +| channelId | string | source channel id | +| label | string | {deleted, hidden, nonflagged} | ## Usage diff --git a/sensai_dataset/generator/__init__.py b/sensai_dataset/generator/__main__.py similarity index 100% rename from sensai_dataset/generator/__init__.py rename to sensai_dataset/generator/__main__.py diff --git a/sensai_dataset/generator/commands.py b/sensai_dataset/generator/commands.py index b912bbd..de35808 100644 --- a/sensai_dataset/generator/commands.py +++ b/sensai_dataset/generator/commands.py @@ -12,11 +12,11 @@ def generate_dataset(source_dir, target_dir, matcher): del_events = pd.read_csv(delet_path, usecols=['id', 'retracted']) del_events = del_events.query('retracted == 0').copy() del_events.drop(columns=['retracted'], inplace=True) - del_events['label'] = 'toxic' + del_events['label'] = 'deleted' ban_path = join(source_dir, 'ban_events.csv') ban_events = pd.read_csv(ban_path, usecols=['authorChannelId', 'videoId']) - ban_events['label'] = 'spam' + ban_events['label'] = 'hidden' for f in sorted(iglob(join(source_dir, matcher))): period_string = splitext(basename(f))[0].split('_')[1] @@ -48,12 +48,13 @@ def generate_dataset(source_dir, target_dir, matcher): # apply mods print('>>> Merging deletion') - chats = pd.merge(chats, del_events, on='id', how='left') + chats.loc[chats['id'].isin(del_events['id']), 'label'] = 'deleted' - # fill NA label - chats['label'].fillna('safe', inplace=True) + # apply safe + print('>>> Applying safe') + chats['label'].fillna('nonflagged', inplace=True) - isFlagged = chats['label'] != 'safe' + isFlagged = chats['label'] != 'nonflagged' flagged = chats[isFlagged].copy() # to make balanced dataset