mirror of
https://github.com/holodata/sensai-dataset.git
synced 2025-03-15 20:10:32 +09:00
fix: add label
This commit is contained in:
parent
dfe4980698
commit
6cd6370a71
@ -35,11 +35,11 @@ Ban and deletion are equivalent to `markChatItemsByAuthorAsDeletedAction` and `m
|
|||||||
### Chats (`chats_%Y-%m.csv`)
|
### Chats (`chats_%Y-%m.csv`)
|
||||||
|
|
||||||
| column | type | description |
|
| column | type | description |
|
||||||
| --------------- | ------ | ---------------------------- |
|
| --------------- | ------ | ----------------------------- |
|
||||||
| body | string | chat message |
|
| body | string | chat message |
|
||||||
| authorChannelId | string | anonymized author channel id |
|
| authorChannelId | string | anonymized author channel id |
|
||||||
| channelId | string | source channel id |
|
| channelId | string | source channel id |
|
||||||
| label | enum | toxic,spam,safe |
|
| label | string | {deleted, hidden, nonflagged} |
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
@ -12,11 +12,11 @@ def generate_dataset(source_dir, target_dir, matcher):
|
|||||||
del_events = pd.read_csv(delet_path, usecols=['id', 'retracted'])
|
del_events = pd.read_csv(delet_path, usecols=['id', 'retracted'])
|
||||||
del_events = del_events.query('retracted == 0').copy()
|
del_events = del_events.query('retracted == 0').copy()
|
||||||
del_events.drop(columns=['retracted'], inplace=True)
|
del_events.drop(columns=['retracted'], inplace=True)
|
||||||
del_events['label'] = 'toxic'
|
del_events['label'] = 'deleted'
|
||||||
|
|
||||||
ban_path = join(source_dir, 'ban_events.csv')
|
ban_path = join(source_dir, 'ban_events.csv')
|
||||||
ban_events = pd.read_csv(ban_path, usecols=['authorChannelId', 'videoId'])
|
ban_events = pd.read_csv(ban_path, usecols=['authorChannelId', 'videoId'])
|
||||||
ban_events['label'] = 'spam'
|
ban_events['label'] = 'hidden'
|
||||||
|
|
||||||
for f in sorted(iglob(join(source_dir, matcher))):
|
for f in sorted(iglob(join(source_dir, matcher))):
|
||||||
period_string = splitext(basename(f))[0].split('_')[1]
|
period_string = splitext(basename(f))[0].split('_')[1]
|
||||||
@ -48,12 +48,13 @@ def generate_dataset(source_dir, target_dir, matcher):
|
|||||||
|
|
||||||
# apply mods
|
# apply mods
|
||||||
print('>>> Merging deletion')
|
print('>>> Merging deletion')
|
||||||
chats = pd.merge(chats, del_events, on='id', how='left')
|
chats.loc[chats['id'].isin(del_events['id']), 'label'] = 'deleted'
|
||||||
|
|
||||||
# fill NA label
|
# fill remaining NA labels with 'nonflagged'
|
||||||
chats['label'].fillna('safe', inplace=True)
|
print('>>> Applying safe')
|
||||||
|
chats['label'].fillna('nonflagged', inplace=True)
|
||||||
|
|
||||||
isFlagged = chats['label'] != 'safe'
|
isFlagged = chats['label'] != 'nonflagged'
|
||||||
flagged = chats[isFlagged].copy()
|
flagged = chats[isFlagged].copy()
|
||||||
|
|
||||||
# to make balanced dataset
|
# to make balanced dataset
|
||||||
|
Loading…
x
Reference in New Issue
Block a user