398 lines
8.2 KiB
Plaintext
398 lines
8.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"# Fais du poireautage¹ de données comme un pro\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"----\n",
|
|
"\n",
|
|
"1: Traduction approximative"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
"from sklearn.datasets import make_classification\n",
|
|
"from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit\n",
|
|
"from sklearn.metrics import cohen_kappa_score, make_scorer\n",
|
|
"from tqdm.auto import tqdm"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"n_samples = 100  # number of rows in X (and in the random matrix Xr)\n",
|
|
"n_features = 3072  # number of columns in X and Xr\n",
|
|
"n_classes = 5  # passed to make_classification as n_classes\n",
|
|
"n_features_selection = 927  # how many top-importance columns select_best_features keeps"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Synthetic problem: few samples, thousands of features, only 10 of them informative.\n",
"# random_state pins the draw so Restart & Run All regenerates the same X, y.\n",
"X, y = make_classification(n_samples=n_samples,\n",
"                           n_features=n_features,\n",
"                           n_classes=n_classes,\n",
"                           n_informative=10,\n",
"                           n_redundant=0,\n",
"                           n_repeated=0,\n",
"                           class_sep=3,\n",
"                           flip_y=0.01,\n",
"                           random_state=0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
"(100, 3072)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(100,)"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"y.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# A RandomState *instance* (not an int) makes a fresh-kernel run reproducible while\n",
"# still letting every cross_val_score call draw a different pair of splits.\n",
"kf = StratifiedShuffleSplit(n_splits=2, random_state=np.random.RandomState(0))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "302160b95f4b4f58aba21f5ca8e42c74",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"𝛋 = 0.19 ± 0.19\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"def benchmark(X):\n",
"    \"\"\"Print the mean ± std Cohen's kappa of a random forest on X.\n",
"\n",
"    Runs 25 rounds of cross_val_score with a default RandomForestClassifier,\n",
"    using the notebook-level labels `y` and splitter `kf`, and pools every\n",
"    fold score before summarising. Prints the summary; returns nothing.\n",
"    \"\"\"\n",
"    kappa = make_scorer(cohen_kappa_score)\n",
"    rounds = [\n",
"        cross_val_score(RandomForestClassifier(), X, y, cv=kf, n_jobs=-1, scoring=kappa)\n",
"        for _ in tqdm(range(25))\n",
"    ]\n",
"    scores = np.array(rounds)\n",
"    print('𝛋 = {:.2f} ± {:.2f}'.format(scores.mean(), scores.std()))\n",
|
|
"\n",
|
|
"benchmark(X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def select_best_features(X):\n",
"    \"\"\"Keep the `n_features_selection` columns of X a random forest ranks highest.\n",
"\n",
"    The forest (random_state=0) is fitted on every row of X against the\n",
"    notebook-level labels `y`, so the ranking is computed outside any CV split.\n",
"    Columns come back ordered by increasing importance.\n",
"    \"\"\"\n",
"    forest = RandomForestClassifier(random_state=0).fit(X, y)\n",
"    ranking = np.argsort(forest.feature_importances_)\n",
"    keep = ranking[-n_features_selection:]\n",
"    return X[:, keep]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(100, 927)"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"select_best_features(X).shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "28ec83d68b1849e4abc56f240708fa7c",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"𝛋 = 0.52 ± 0.19\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"benchmark(select_best_features(X))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"# Who cares about data quality anyway?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"Now the same two benchmarks, but on a fully random `X` (uniform noise with no relation to `y`)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Seeded generator so the noise matrix is identical on every fresh run.\n",
"Xr = np.random.default_rng(0).random((n_samples, n_features))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(100, 3072)"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"Xr.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "8a1163d81cf34c919ba23b3b41ab74a6",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"𝛋 = 0.05 ± 0.14\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"benchmark(Xr)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "0d1b80a130ed4e6fade6173e4752c1d3",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"𝛋 = 0.18 ± 0.17\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"benchmark(select_best_features(Xr))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"- <https://www.nature.com/articles/s41598-019-56185-5>\n",
|
|
"- <https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-019-56185-5/MediaObjects/41598_2019_56185_MOESM1_ESM.pdf>\n",
|
|
"- Part 7.10.2 (p. 264) of <https://web.stanford.edu/~hastie/ElemStatLearn/>\n",
|
|
"- <https://towardsdatascience.com/data-leakage-in-machine-learning-10bdd3eec742>\n",
|
|
"- <https://www.reddit.com/r/MachineLearning/comments/jviymn/d_i_keep_running_across_studies_designed_this_way/>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## RESPECTEZ LES GESTES BARRIÈRES"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"1. **If you tune on train/val splits, then the held-out test set is your god**\n",
|
"2. Feature selection should be done inside the CV loop (fitted on each training fold only)\n",
|
|
"3. _For nihilists_: get more training data.\n",
|
|
"4. _Get better data._"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|