sensai/examples/pandas.ipynb

3.9 KiB

In [2]:
import pandas as pd
from os.path import join
import os
from glob import glob

# Directory holding the dataset files. The DATASET_DIR environment variable
# takes precedence; otherwise fall back to the default relative path.
DATASET_DIR = os.getenv("DATASET_DIR", "../input/sensai")
print("DATASET_DIR", DATASET_DIR)
DATASET_DIR /home/uetchy/repos/src/github.com/holodata/sensai-huggingface
In [3]:
# Collect every parquet file directly under DATASET_DIR. glob() returns paths
# in arbitrary filesystem order, so sort them to keep the concatenated row
# order (and therefore the regenerated index) reproducible between runs.
parquet_paths = sorted(glob(join(DATASET_DIR, '*.parquet')))
if not parquet_paths:
    # Without this guard pd.concat([]) fails with the opaque
    # "No objects to concatenate" error, hiding the real cause.
    raise FileNotFoundError(f"no *.parquet files found in {DATASET_DIR!r}")

# Read each file and stack the frames into one, discarding the per-file
# indices in favour of a single fresh 0..n-1 index.
df = pd.concat(
    [pd.read_parquet(path) for path in parquet_paths],
    ignore_index=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10677038 entries, 0 to 10677037
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   body    object
 1   label   object
dtypes: object(2)
memory usage: 162.9+ MB
In [11]:
# Preview a few random rows. A fixed random_state makes the preview identical
# on every Restart & Run All instead of changing with each execution.
df.sample(5, random_state=42)
Out[11]:
body label
6229407 Blessed stream hidden
7406071 RIP nonflagged
920434 cute nonflagged
6146625 GACHA lets gooo hidden
8259711 hidden
In [ ]: