commit 91934136cac60901b16851a3ae0f0d9b9a4ceefb
Author: Karamaz0V1 <florent.guiotte@gmail.com>
Date:   Tue Jan 19 14:03:41 2021 +0100

    Add leakage

diff --git a/Data Leakage.ipynb b/Data Leakage.ipynb
new file mode 100644
index 0000000..d9ca7ae
--- /dev/null
+++ b/Data Leakage.ipynb	
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Fais du poireautage¹ de données comme un pro\n",
+    "\n",
+    "\n",
+    "\n",
+    "----\n",
+    "\n",
+    "1: Traduction approximative"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![leek](https://media.giphy.com/media/d8KkDupA6b3o9UQlbv/source.gif)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.datasets import make_classification\n",
+    "from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit\n",
+    "from sklearn.metrics import cohen_kappa_score, make_scorer\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_samples = 100\n",
+    "n_features = 3072\n",
+    "n_classes = 5\n",
+    "n_features_selection = 927"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = make_classification(n_samples, n_features, \n",
+    "                           n_classes=n_classes,\n",
+    "                           n_informative=10,\n",
+    "                           n_redundant=0,\n",
+    "                           n_repeated=0,\n",
+    "                           class_sep=3,\n",
+    "                           flip_y=0.01)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((100, 3072),)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape, "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(100,)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kf = StratifiedShuffleSplit(n_splits=2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Stratified Shuffle](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_0091.png)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "302160b95f4b4f58aba21f5ca8e42c74",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "𝛋 = 0.19 ± 0.19\n"
+     ]
+    }
+   ],
+   "source": [
+    "def benchmark(X):\n",
+    "    s = []\n",
+    "    for i in tqdm(range(25)):\n",
+    "        s += [cross_val_score(RandomForestClassifier(), X, y, cv=kf, n_jobs=-1, scoring=make_scorer(cohen_kappa_score))]\n",
+    "    s = np.array(s)\n",
+    "    print('𝛋 = {:.2f} ± {:.2f}'.format(s.mean(), s.std()))\n",
+    "\n",
+    "benchmark(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def select_best_features(X):\n",
+    "    clf = RandomForestClassifier(random_state=0)\n",
+    "    clf.fit(X, y)\n",
+    "\n",
+    "    ibest = np.argsort(clf.feature_importances_)[-n_features_selection:]\n",
+    "\n",
+    "    return X[:,ibest]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(100, 927)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "select_best_features(X).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "28ec83d68b1849e4abc56f240708fa7c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "𝛋 = 0.52 ± 0.19\n"
+     ]
+    }
+   ],
+   "source": [
+    "benchmark(select_best_features(X))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Who cares about data quality anyway?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now the same experiment, but with a fully random `X` (pure noise, unrelated to `y`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Xr = np.random.random((n_samples, n_features))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(100, 3072)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Xr.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8a1163d81cf34c919ba23b3b41ab74a6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "𝛋 = 0.05 ± 0.14\n"
+     ]
+    }
+   ],
+   "source": [
+    "benchmark(Xr)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0d1b80a130ed4e6fade6173e4752c1d3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "𝛋 = 0.18 ± 0.17\n"
+     ]
+    }
+   ],
+   "source": [
+    "benchmark(select_best_features(Xr))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![](https://media1.tenor.com/images/766adcaf4097f8c68aa069ad0fbaeb79/tenor.gif)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- <https://www.nature.com/articles/s41598-019-56185-5>\n",
+    "- <https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-019-56185-5/MediaObjects/41598_2019_56185_MOESM1_ESM.pdf>\n",
+    "- Part 7.10.2 (p. 264) of <https://web.stanford.edu/~hastie/ElemStatLearn/>\n",
+    "- <https://towardsdatascience.com/data-leakage-in-machine-learning-10bdd3eec742>\n",
+    "- <https://www.reddit.com/r/MachineLearning/comments/jviymn/d_i_keep_running_across_studies_designed_this_way/>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## RESPECTEZ LES GESTES BARRIÈRES"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. **If you use train/val splits, then the held-out test set is your god.**\n",
+    "2. Feature selection should be done inside each CV fold, on the training split only.\n",
+    "3. _For nihilists_: get more training data.\n",
+    "4. _Get better data._"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}