{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fait du poireautage¹ de données comme un pro\n",
    "\n",
    "\n",
    "\n",
    "----\n",
    "\n",
    "1: Traduction approximative"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![leek](https://media.giphy.com/media/d8KkDupA6b3o9UQlbv/source.gif)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit\n",
    "from sklearn.metrics import cohen_kappa_score, make_scorer\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Experiment configuration: everything a reader might want to tune.\n",
    "n_samples = 100\n",
    "n_features = 3072\n",
    "n_classes = 5\n",
    "n_features_selection = 927\n",
    "random_seed = 0  # seeds the synthetic data and the CV splitter below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Synthetic multi-class problem; seeded so X and y are identical on every run.\n",
    "X, y = make_classification(\n",
    "    n_samples,\n",
    "    n_features,\n",
    "    n_classes=n_classes,\n",
    "    n_informative=10,\n",
    "    n_redundant=0,\n",
    "    n_repeated=0,\n",
    "    class_sep=3,\n",
    "    flip_y=0.01,\n",
    "    random_state=random_seed,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100, 3072)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100,)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A RandomState instance (rather than an int) keeps the run reproducible while\n",
    "# still drawing fresh splits on every cross_val_score call; an int seed would\n",
    "# make every repetition in benchmark() reuse the same two splits.\n",
    "kf = StratifiedShuffleSplit(n_splits=2, random_state=np.random.RandomState(random_seed))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Stratified Shuffle](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_0091.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
"data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "302160b95f4b4f58aba21f5ca8e42c74",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "𝛋 = 0.19 ± 0.19\n"
     ]
    }
   ],
   "source": [
    "def benchmark(X, n_repeats=25):\n",
    "    \"\"\"Print the mean ± std Cohen's kappa of a random forest over repeated CV.\n",
    "\n",
    "    Scores ``X`` against the notebook-level labels ``y`` using the notebook-level\n",
    "    splitter ``kf``; the statistics are taken over every fold of every repetition.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X : array-like\n",
    "        Feature matrix, one row per entry of ``y``.\n",
    "    n_repeats : int, default=25\n",
    "        Number of times the cross-validation is repeated.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``n_repeats`` is smaller than 1 (there would be nothing to average).\n",
    "    \"\"\"\n",
    "    if n_repeats < 1:\n",
    "        raise ValueError('n_repeats must be >= 1, got {}'.format(n_repeats))\n",
    "\n",
    "    # The scorer does not change between repetitions, so build it once.\n",
    "    kappa_scorer = make_scorer(cohen_kappa_score)\n",
    "    scores = np.array([\n",
    "        cross_val_score(\n",
    "            # One seed per repetition: repeatable forests that still differ between repetitions.\n",
    "            RandomForestClassifier(random_state=repeat),\n",
    "            X, y, cv=kf, n_jobs=-1, scoring=kappa_scorer,\n",
    "        )\n",
    "        for repeat in tqdm(range(n_repeats))\n",
    "    ])\n",
    "    print('𝛋 = {:.2f} ± {:.2f}'.format(scores.mean(), scores.std()))\n",
    "\n",
    "benchmark(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def select_best_features(X):\n",
    "    \"\"\"Return the ``n_features_selection`` columns of ``X`` a random forest ranks highest.\n",
    "\n",
    "    NOTE: the forest is fitted on every row of ``X`` together with the notebook-level\n",
    "    labels ``y``, i.e. including the samples that ``benchmark`` later holds out.\n",
    "    Selecting features outside the cross-validation loop like this leaks label\n",
    "    information into the evaluation. That appears to be the point of this notebook\n",
    "    (see the closing takeaway 'Feature selection should be done in CV'), so the\n",
    "    behaviour is kept as is.\n",
    "    \"\"\"\n",
    "    clf = RandomForestClassifier(random_state=0)\n",
    "    clf.fit(X, y)\n",
    "\n",
    "    # argsort is ascending, so the tail holds the highest-importance columns.\n",
    "    ibest = np.argsort(clf.feature_importances_)[-n_features_selection:]\n",
    "\n",
    "    return X[:, ibest]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100, 927)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "select_best_features(X).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "28ec83d68b1849e4abc56f240708fa7c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "𝛋 = 0.52 ± 0.19\n"
     ]
    }
   ],
   "source": [
    "benchmark(select_best_features(X))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Who cares about data quality anyway"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now the same experiment, but with a fully random `X`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs":
[],
   "source": [
    "# Uniform noise in [0, 1), generated independently of y; seeded so the noise\n",
    "# matrix is identical on every run.\n",
    "Xr = np.random.default_rng(0).random((n_samples, n_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100, 3072)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Xr.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8a1163d81cf34c919ba23b3b41ab74a6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "𝛋 = 0.05 ± 0.14\n"
     ]
    }
   ],
   "source": [
    "benchmark(Xr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0d1b80a130ed4e6fade6173e4752c1d3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "𝛋 = 0.18 ± 0.17\n"
     ]
    }
   ],
   "source": [
    "benchmark(select_best_features(Xr))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](https://media1.tenor.com/images/766adcaf4097f8c68aa069ad0fbaeb79/tenor.gif)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- \n",
    "- \n",
    "- Part 7.10.2 (p. 264) of \n",
    "- \n",
    "- "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RESPECTEZ LES GESTES BARRIÈRES"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. **If train/val then test is your god**\n",
    "2. Feature selection should be done in CV\n",
    "3. _For nihilists_: get more training data.\n",
    "4. 
_Get better data._" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }