sensai/examples/pandas.ipynb

165 lines
3.9 KiB
Plaintext
Raw Permalink Normal View History

2022-06-03 16:36:32 +09:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DATASET_DIR /home/uetchy/repos/src/github.com/holodata/sensai-huggingface\n"
]
}
],
"source": [
"import pandas as pd\n",
"from os.path import join\n",
"import os\n",
"from glob import glob\n",
"\n",
"# Root directory of the dataset; set the DATASET_DIR environment variable\n",
"# to override the default relative path.\n",
"DATASET_DIR = os.getenv(\"DATASET_DIR\", \"../input/sensai\")\n",
"print(\"DATASET_DIR\", DATASET_DIR)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 10677038 entries, 0 to 10677037\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 body object\n",
" 1 label object\n",
"dtypes: object(2)\n",
"memory usage: 162.9+ MB\n"
]
}
],
"source": [
"# Sort the matches: glob() makes no ordering guarantee, and with\n",
"# ignore_index=True the file order determines every row's index.\n",
"parquet_paths = sorted(glob(join(DATASET_DIR, '*.parquet')))\n",
"if not parquet_paths:\n",
"    # pd.concat([]) would otherwise fail with an opaque 'No objects to concatenate'.\n",
"    raise FileNotFoundError(f'no *.parquet files found in {DATASET_DIR!r}')\n",
"\n",
"# Load every shard and stack them into one frame with a fresh 0..N-1 index.\n",
"df = pd.concat(\n",
"    [pd.read_parquet(path) for path in parquet_paths],\n",
"    ignore_index=True)\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6229407</th>\n",
" <td>Blessed stream</td>\n",
" <td>hidden</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7406071</th>\n",
" <td>RIP</td>\n",
" <td>nonflagged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>920434</th>\n",
" <td>cute</td>\n",
" <td>nonflagged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6146625</th>\n",
" <td>GACHA lets gooo</td>\n",
" <td>hidden</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8259711</th>\n",
" <td>草</td>\n",
" <td>hidden</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body label\n",
"6229407 Blessed stream hidden\n",
"7406071 RIP nonflagged\n",
"920434 cute nonflagged\n",
"6146625 GACHA lets gooo hidden\n",
"8259711 草 hidden"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview five randomly drawn rows (unseeded, so the selection differs per run).\n",
"df.sample(n=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "24403c88b9bd347a0b41a9fc3e3175e2948f6e2f45c79c9df260f2a479a817d3"
},
"kernelspec": {
"display_name": "Python 3.8.6 64-bit ('.venv': poetry)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}