fix: add label

This commit is contained in:
uetchy 2021-11-01 14:21:38 +09:00
parent dfe4980698
commit 6cd6370a71
3 changed files with 13 additions and 12 deletions

View File

@@ -34,12 +34,12 @@ Ban and deletion are equivalent to `markChatItemsByAuthorAsDeletedAction` and `m
### Chats (`chats_%Y-%m.csv`) ### Chats (`chats_%Y-%m.csv`)
| column | type | description | | column | type | description |
| --------------- | ------ | ---------------------------- | | --------------- | ------ | ----------------------------- |
| body | string | chat message | | body | string | chat message |
| authorChannelId | string | anonymized author channel id | | authorChannelId | string | anonymized author channel id |
| channelId | string | source channel id | | channelId | string | source channel id |
| label | enum | toxic,spam,safe | | label | string | {deleted, hidden, nonflagged} |
## Usage ## Usage

View File

@@ -12,11 +12,11 @@ def generate_dataset(source_dir, target_dir, matcher):
del_events = pd.read_csv(delet_path, usecols=['id', 'retracted']) del_events = pd.read_csv(delet_path, usecols=['id', 'retracted'])
del_events = del_events.query('retracted == 0').copy() del_events = del_events.query('retracted == 0').copy()
del_events.drop(columns=['retracted'], inplace=True) del_events.drop(columns=['retracted'], inplace=True)
del_events['label'] = 'toxic' del_events['label'] = 'deleted'
ban_path = join(source_dir, 'ban_events.csv') ban_path = join(source_dir, 'ban_events.csv')
ban_events = pd.read_csv(ban_path, usecols=['authorChannelId', 'videoId']) ban_events = pd.read_csv(ban_path, usecols=['authorChannelId', 'videoId'])
ban_events['label'] = 'spam' ban_events['label'] = 'hidden'
for f in sorted(iglob(join(source_dir, matcher))): for f in sorted(iglob(join(source_dir, matcher))):
period_string = splitext(basename(f))[0].split('_')[1] period_string = splitext(basename(f))[0].split('_')[1]
@@ -48,12 +48,13 @@ def generate_dataset(source_dir, target_dir, matcher):
# apply mods # apply mods
print('>>> Merging deletion') print('>>> Merging deletion')
chats = pd.merge(chats, del_events, on='id', how='left') chats.loc[chats['id'].isin(del_events['id']), 'label'] = 'deleted'
# fill NA label # apply safe
chats['label'].fillna('safe', inplace=True) print('>>> Applying safe')
chats['label'].fillna('nonflagged', inplace=True)
isFlagged = chats['label'] != 'safe' isFlagged = chats['label'] != 'nonflagged'
flagged = chats[isFlagged].copy() flagged = chats[isFlagged].copy()
# to make balanced dataset # to make balanced dataset