sensai/examples/pandas.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DATASET_DIR /home/uetchy/repos/src/github.com/holodata/sensai-huggingface\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from os.path import join\n",
    "import os\n",
    "from glob import glob\n",
    "\n",
    "DATASET_DIR = os.environ.get(\"DATASET_DIR\", \"../input/sensai\")\n",
    "print(\"DATASET_DIR\", DATASET_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 10677038 entries, 0 to 10677037\n",
      "Data columns (total 2 columns):\n",
      " #   Column  Dtype \n",
      "---  ------  ----- \n",
      " 0   body    object\n",
      " 1   label   object\n",
      "dtypes: object(2)\n",
      "memory usage: 162.9+ MB\n"
     ]
    }
   ],
   "source": [
    "df = pd.concat(\n",
    "    [pd.read_parquet(x) for x in glob(join(DATASET_DIR, '*.parquet'))],\n",
    "    ignore_index=True)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>body</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6229407</th>\n",
       "      <td>Blessed stream</td>\n",
       "      <td>hidden</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7406071</th>\n",
       "      <td>RIP</td>\n",
       "      <td>nonflagged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>920434</th>\n",
       "      <td>cute</td>\n",
       "      <td>nonflagged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6146625</th>\n",
       "      <td>GACHA lets gooo</td>\n",
       "      <td>hidden</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8259711</th>\n",
       "      <td>草</td>\n",
       "      <td>hidden</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    body       label\n",
       "6229407   Blessed stream      hidden\n",
       "7406071              RIP  nonflagged\n",
       "920434              cute  nonflagged\n",
       "6146625  GACHA lets gooo      hidden\n",
       "8259711                草      hidden"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "24403c88b9bd347a0b41a9fc3e3175e2948f6e2f45c79c9df260f2a479a817d3"
  },
  "kernelspec": {
   "display_name": "Python 3.8.6 64-bit ('.venv': poetry)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}