# ---
# Provenance: notebook "Data Leakage.ipynb", added in commit
# 91934136cac60901b16851a3ae0f0d9b9a4ceefb ("Add leakage", author Karamaz0V1,
# Tue Jan 19 14:03:41 2021 +0100). Notebook metadata: nbformat 4.4, kernel
# "Python 3" (python3), Python 3.8.5.
#
# This is the notebook rendered as a percent-format script: `# %%` starts a
# code cell, `# %% [markdown]` a markdown cell. Cell outputs recorded in the
# original notebook are quoted in comments next to the cell that produced them;
# they came from an UNSEEDED run, so a seeded re-run will give different digits
# but should tell the same story.
# ---

# %% [markdown]
# # Fais du poireautage¹ de données comme un pro
#
# (English: "Do data leek-age¹ like a pro" - a pun on *data leakage*;
# "poireau" is French for "leek".)
#
# ----
#
# 1: Traduction approximative (approximate translation)

# %% [markdown]
# ![leek](https://media.giphy.com/media/d8KkDupA6b3o9UQlbv/source.gif)

# %%
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.metrics import cohen_kappa_score, make_scorer
from tqdm.auto import tqdm

# %%
# Configuration: everything a reader might want to tune lives here.
SEED = 0                      # single source of randomness for the whole notebook
n_samples = 100               # rows in the data set (deliberately tiny)
n_features = 3072             # columns; vastly more features than samples
n_classes = 5
n_features_selection = 927    # number of columns kept by select_best_features

# %%
# Synthetic classification problem: only 10 of the 3072 features carry signal.
# random_state makes the data set identical on every Restart & Run All.
X, y = make_classification(n_samples, n_features,
                           n_classes=n_classes,
                           n_informative=10,
                           n_redundant=0,
                           n_repeated=0,
                           class_sep=3,
                           flip_y=0.01,
                           random_state=SEED)

# %%
# Recorded output of the original cell was ((100, 3072),) because of a stray
# trailing comma ("X.shape, "); the bare attribute displays (100, 3072).
X.shape

# %%
# Recorded output: (100,)
y.shape

# %%
# Two stratified random train/test splits per cross-validation run.
# A RandomState *instance* (rather than an int) is passed on purpose: the
# splitter then draws fresh splits on every call to .split(), so the repeated
# runs inside benchmark() see different partitions, while the overall sequence
# is still reproducible from SEED.
kf = StratifiedShuffleSplit(n_splits=2, random_state=np.random.RandomState(SEED))

# %% [markdown]
# ![Stratified Shuffle](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_0091.png)

# %%
def benchmark(X, labels=None, n_repeats=25, seed=SEED):
    """Print the mean and standard deviation of Cohen's kappa for a random forest on X.

    Runs ``cross_val_score`` ``n_repeats`` times with a fresh
    ``RandomForestClassifier`` and the module-level splitter ``kf``, then
    prints ``mean ± std`` computed over every fold score of every repetition.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``cross_val_score``.
    labels : array-like, optional
        Target vector. Defaults to the notebook-level ``y`` so the original
        call form ``benchmark(X)`` keeps working.
    n_repeats : int, default 25
        Number of cross-validation repetitions.
    seed : int, default SEED
        Seed from which one forest seed per repetition is derived.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    if labels is None:
        labels = y

    # Build the scorer once instead of once per repetition.
    kappa = make_scorer(cohen_kappa_score)

    # One distinct, reproducible forest seed per repetition.
    forest_seeds = np.random.RandomState(seed).randint(
        np.iinfo(np.int32).max, size=n_repeats
    )

    scores = np.array([
        cross_val_score(
            RandomForestClassifier(random_state=int(forest_seed)),
            X, labels, cv=kf, n_jobs=-1, scoring=kappa,
        )
        for forest_seed in tqdm(forest_seeds)
    ])
    print('𝛋 = {:.2f} ± {:.2f}'.format(scores.mean(), scores.std()))

# %%
# Baseline: all 3072 features, no selection.
# Recorded output of the original (unseeded) run: 𝛋 = 0.19 ± 0.19
benchmark(X)

# %%
def select_best_features(X, labels=None, n_keep=None):
    """Return the columns of X that a random forest ranks as most important.

    WARNING - this function is *deliberately leaky*: the forest is fitted on
    every row of ``X`` (including rows that later end up in a test fold), so
    the labels of future test samples influence which columns survive. That
    leak is the subject of this notebook; do not copy this into a real
    pipeline - do the selection inside the cross-validation loop instead.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix; columns are selected with ``X[:, idx]``.
    labels : array-like, optional
        Target vector used to fit the forest. Defaults to the notebook-level ``y``.
    n_keep : int, optional
        Number of columns to keep. Defaults to ``n_features_selection``.

    Returns
    -------
    numpy.ndarray
        ``X`` restricted to the ``n_keep`` columns with the highest
        ``feature_importances_``, ordered by increasing importance.
    """
    if labels is None:
        labels = y
    if n_keep is None:
        n_keep = n_features_selection

    forest = RandomForestClassifier(random_state=0)
    forest.fit(X, labels)

    ranking = np.argsort(forest.feature_importances_)
    # Explicit start index instead of ranking[-n_keep:], which would return
    # *every* column for n_keep == 0.
    top = ranking[max(ranking.size - n_keep, 0):]

    return X[:, top]

# %%
# Recorded output: (100, 927)
select_best_features(X).shape

# %%
# Same benchmark after (leaky) feature selection on the full data set.
# Recorded output of the original (unseeded) run: 𝛋 = 0.52 ± 0.19
benchmark(select_best_features(X))

# %% [markdown]
# # Who cares about data quality anyway

# %% [markdown]
# Now the same experiment, but with a fully random X: the features carry no
# information about y at all.

# %%
# Uniform noise in [0, 1), same shape as X; seeded for reproducibility.
Xr = np.random.default_rng(SEED).random((n_samples, n_features))

# %%
# Recorded output: (100, 3072)
Xr.shape

# %%
# Pure noise, no selection.
# Recorded output of the original (unseeded) run: 𝛋 = 0.05 ± 0.14
benchmark(Xr)

# %%
# Pure noise after leaky selection: the score moves away from zero although
# there is nothing to learn - that gain is produced entirely by the leak.
# Recorded output of the original (unseeded) run: 𝛋 = 0.18 ± 0.17
benchmark(select_best_features(Xr))

# %% [markdown]
# ![](https://media1.tenor.com/images/766adcaf4097f8c68aa069ad0fbaeb79/tenor.gif)

# %% [markdown]
# - 
# - 
# - Part 7.10.2 (p. 264) of 
# - 
# - 
#
# NOTE(review): the link targets of this reference list are missing in this
# copy of the notebook. The third item presumably points to "The Elements of
# Statistical Learning" (its section on the wrong and right way to do
# cross-validation) - TODO confirm and restore all five links.

# %% [markdown]
# ## RESPECTEZ LES GESTES BARRIÈRES
#
# (English: "Observe the protective measures".)

# %% [markdown]
# 1. **If train/val then test is your god**
# 2. Feature selection should be done in CV
# 3. _For nihilists_: get more training data.
# 4. _Get better data._