Saved figures and wrote the first version of the model evaluations section

This commit is contained in:
2025-10-25 18:11:07 +02:00
parent 62ac682385
commit c0ffaa45c9
13 changed files with 213 additions and 52 deletions

View File

@@ -16,17 +16,13 @@
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "cannot unpack non-iterable DecisionTreeClassifier object",
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 46\u001b[39m\n\u001b[32m 41\u001b[39m model = Pipeline([\n\u001b[32m 42\u001b[39m (DecisionTreeClassifier(random_state=\u001b[32m42\u001b[39m)) \u001b[38;5;66;03m# Train Decision Tree Regressor\u001b[39;00m\n\u001b[32m 43\u001b[39m ])\n\u001b[32m 45\u001b[39m \u001b[38;5;66;03m# Train the model\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m46\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 47\u001b[39m y_pred = model.predict(X_val)\n\u001b[32m 49\u001b[39m \u001b[38;5;66;03m# Visualize the decision tree\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/Documents/MLP/Projects/MLPproject/.venv/lib/python3.12/site-packages/sklearn/base.py:1365\u001b[39m, in \u001b[36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m 1358\u001b[39m estimator._validate_params()\n\u001b[32m 1360\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 1361\u001b[39m skip_parameter_validation=(\n\u001b[32m 1362\u001b[39m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 1363\u001b[39m )\n\u001b[32m 1364\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1365\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/Documents/MLP/Projects/MLPproject/.venv/lib/python3.12/site-packages/sklearn/pipeline.py:654\u001b[39m, in \u001b[36mPipeline.fit\u001b[39m\u001b[34m(self, X, y, **params)\u001b[39m\n\u001b[32m 647\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _routing_enabled() \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.transform_input \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 648\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 649\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mThe `transform_input` parameter can only be set if metadata \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 650\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mrouting is enabled. You can enable metadata routing using \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 651\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m`sklearn.set_config(enable_metadata_routing=True)`.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 652\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m654\u001b[39m routed_params = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_check_method_params\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfit\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprops\u001b[49m\u001b[43m=\u001b[49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 655\u001b[39m Xt = \u001b[38;5;28mself\u001b[39m._fit(X, y, routed_params, raw_params=params)\n\u001b[32m 656\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[33m\"\u001b[39m\u001b[33mPipeline\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m._log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m.steps) - \u001b[32m1\u001b[39m)):\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/Documents/MLP/Projects/MLPproject/.venv/lib/python3.12/site-packages/sklearn/pipeline.py:454\u001b[39m, in \u001b[36mPipeline._check_method_params\u001b[39m\u001b[34m(self, method, props, **kwargs)\u001b[39m\n\u001b[32m 449\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m routed_params\n\u001b[32m 450\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 451\u001b[39m fit_params_steps = Bunch(\n\u001b[32m 452\u001b[39m **{\n\u001b[32m 453\u001b[39m name: Bunch(**{method: {} \u001b[38;5;28;01mfor\u001b[39;00m method \u001b[38;5;129;01min\u001b[39;00m METHODS})\n\u001b[32m--> \u001b[39m\u001b[32m454\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m name, step \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.steps\n\u001b[32m 455\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m step \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 456\u001b[39m }\n\u001b[32m 457\u001b[39m )\n\u001b[32m 458\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m pname, pval \u001b[38;5;129;01min\u001b[39;00m props.items():\n\u001b[32m 459\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33m__\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m pname:\n",
"\u001b[31mTypeError\u001b[39m: cannot unpack non-iterable DecisionTreeClassifier object"
"\u001b[1;31mRunning cells with '.venv (Python 3.13.7)' requires the ipykernel package.\n",
"\u001b[1;31mInstall 'ipykernel' into the Python environment. \n",
"\u001b[1;31mCommand: '/home/jaknyst/Documents/MLPproject/.venv/bin/python -m pip install ipykernel -U --force-reinstall'"
]
}
],
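The TypeError in the removed traceback above comes from passing a bare estimator to Pipeline: each step must be a (name, estimator) tuple, which is what Pipeline tries to unpack. A minimal sketch of the fix, assuming the X_train, y_train and X_val variables defined earlier in the notebook:

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Each Pipeline step is a (name, estimator) tuple; a bare estimator
# raises "cannot unpack non-iterable DecisionTreeClassifier object".
model = Pipeline([
    ("tree", DecisionTreeClassifier(random_state=42)),
])

model.fit(X_train, y_train)
y_pred = model.predict(X_val)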
@@ -598,7 +594,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
"version": "3.13.7"
}
},
"nbformat": 4,

View File

@@ -23,17 +23,42 @@
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Handling missing values}{1}{subsection.2.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{1}{subsection.2.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Model selection}{1}{section.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {5}Model Evaluations}{2}{section.5}\protected@file@percent }
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:featureImportanceDT}{{1(a)}{2}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][2][]2}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{2}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][2][]2}}
\newlabel{fig:featureImportanceRF}{{1(b)}{2}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][2][]2}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{2}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][2][]2}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The confusion matrices of the Decision Tree model and the Random Forest model on the test data.\relax }}{2}{figure.caption.1}\protected@file@percent }
\newlabel{fig:}{{1}{2}{The confusion matrices of the Decision Tree model and the Random Forest model on the test data.\relax }{figure.caption.1}{}}
\newlabel{fig:@cref}{{[figure][1][]1}{[1][2][]2}}
\bibstyle{model1-num-names}
\bibcite{Steinhaus:Mathematical}{1}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces The performance metrics of the models on the validation data.\relax }}{3}{table.caption.2}\protected@file@percent }
\newlabel{perfmetric}{{1}{3}{The performance metrics of the models on the validation data.\relax }{table.caption.2}{}}
\newlabel{perfmetric@cref}{{[table][1][]1}{[1][2][]3}}
\@writefile{toc}{\contentsline {section}{References}{3}{figure.caption.3}\protected@file@percent }
\newlabel{fig:featureImportanceDT}{{2(a)}{3}{\relax }{figure.caption.3}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][3][]3}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{3}{\relax }{figure.caption.3}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][3][]3}}
\newlabel{fig:featureImportanceRF}{{2(b)}{3}{\relax }{figure.caption.3}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][3][]3}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{3}{\relax }{figure.caption.3}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][3][]3}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }}{3}{figure.caption.3}\protected@file@percent }
\newlabel{fig:}{{2}{3}{The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }{figure.caption.3}{}}
\newlabel{fig:@cref}{{[figure][2][]2}{[1][3][]3}}
\bibcite{Greivenkamp:FieldGuide}{2}
\bibcite{Pedrotti:Introduction}{3}
\bibcite{Davis:ChemWiki}{4}
\@writefile{toc}{\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {5}Model Evaluations}{2}{section.5}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}}{2}{section.6}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{References}{2}{section.6}\protected@file@percent }
\ttl@finishall
\newlabel{LastPage}{{}{2}{}{page.2}{}}
\xdef\lastpage@lastpage{2}
\xdef\lastpage@lastpageHy{2}
\gdef \@abspage@last{2}
\newlabel{LastPage}{{}{4}{}{page.4}{}}
\xdef\lastpage@lastpage{4}
\xdef\lastpage@lastpageHy{4}
\gdef \@abspage@last{4}

View File

@@ -1,6 +1,6 @@
# Fdb version 4
["pdflatex"] 1761133861.19469 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761133862.86789 0
"/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761133860.76481 9644 b114a2966591f23d2a80de0b71664c7d ""
["pdflatex"] 1761408421.82051 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761408422.38409 0
"/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761408421.10156 15551 7c91f156b6c79cad294fb22ca0c64f64 ""
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc" 1721433600 4850 80dc9bab7f31fb78a000ccfed0e27cab ""
"/usr/share/texlive/texmf-dist/fonts/map/fontname/texfonts.map" 1577235249 3524 cb3e574dea2d1052e39280babc910dc8 ""
"/usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb7t.tfm" 1136768653 2240 eb56c13537f4d8a0bd3fafc25572b1bd ""
@@ -132,11 +132,14 @@
"/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1721433600 40900 887e0dc8cac988a9e9c574af364cf837 ""
"/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map" 1760290233.68077 4602002 62dba5fc29055c16380d7393a2adb07a ""
"/var/lib/texmf/web2c/pdftex/pdflatex.fmt" 1760289849 7753794 892d611f76aecccd13eb485815d0543e ""
"MLPproject.aux" 1761133862.68081 2207 83760043bb554d7220df5afc55d586db "pdflatex"
"MLPproject.out" 1761133862.68281 1614 a2f9c909152198446a03f747ac01e9f8 "pdflatex"
"MLPproject.tex" 1761133860.76481 9644 b114a2966591f23d2a80de0b71664c7d ""
"MLPproject.toc" 1761133862.68381 896 462daffa7f338139a1f72b531978e0ba "pdflatex"
"MLPproject.aux" 1761408422.32348 4598 98311111e5f17f774f4333b6899ad750 "pdflatex"
"MLPproject.out" 1761408422.3244 1585 96a5cbe86300c20e741bb675b1dad6de "pdflatex"
"MLPproject.tex" 1761408421.10156 15551 7c91f156b6c79cad294fb22ca0c64f64 ""
"MLPproject.toc" 1761408422.32537 847 58496dd675955cac8bd2874a312fe024 "pdflatex"
"SelfArx.cls" 1761123180.54708 7316 506603b27aab6da8087bc0f1ee693041 ""
"confusionMatrix.png" 1761329088.9659 21007 cfab042955bc386bac85d53bfe5a6793 ""
"featureImportanceDT.png" 1761328898.24566 60078 4a2e56e2a45ae2ae5e41b9830c1bbcea ""
"featureImportanceRF.png" 1761328962.51602 61794 6b3eefc625dd3da8a3dbf302174c614c ""
(generated)
"MLPproject.aux"
"MLPproject.log"

View File

@@ -1023,6 +1023,25 @@ INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb8r.tfm
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr7t.vf
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr8r.tfm
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr8c.vf
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./featureImportanceDT.png
INPUT ./featureImportanceDT.png
INPUT featureImportanceDT.png
INPUT ./featureImportanceDT.png
INPUT ./featureImportanceDT.png
INPUT ./featureImportanceRF.png
INPUT ./featureImportanceRF.png
INPUT featureImportanceRF.png
INPUT ./featureImportanceRF.png
INPUT ./featureImportanceRF.png
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr7t.tfm
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr7t.tfm
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmb7t.vf

View File

@@ -1,4 +1,4 @@
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 22 OCT 2025 13:51
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 25 OCT 2025 18:07
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
@@ -494,7 +494,38 @@ LaTeX Info: Redefining \labelcpageref on input line 2370.
LaTeX Font Info: Trying to load font information for OT1+ptm on input line 54.
(/usr/share/texlive/texmf-dist/tex/latex/psnfss/ot1ptm.fd
File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm.
) (./MLPproject.aux)
) (./MLPproject.aux
LaTeX Warning: Label `fig:featureImportanceDT' multiply defined.
LaTeX Warning: Label `fig:featureImportanceDT@cref' multiply defined.
LaTeX Warning: Label `sub@fig:featureImportanceDT' multiply defined.
LaTeX Warning: Label `sub@fig:featureImportanceDT@cref' multiply defined.
LaTeX Warning: Label `fig:featureImportanceRF' multiply defined.
LaTeX Warning: Label `fig:featureImportanceRF@cref' multiply defined.
LaTeX Warning: Label `sub@fig:featureImportanceRF' multiply defined.
LaTeX Warning: Label `sub@fig:featureImportanceRF@cref' multiply defined.
LaTeX Warning: Label `fig:' multiply defined.
LaTeX Warning: Label `fig:@cref' multiply defined.
)
\openout1 = `MLPproject.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 54.
@@ -658,53 +689,78 @@ File: ts1ptm.fd 2001/06/04 font definitions for TS1/ptm.
]
Underfull \hbox (badness 1448) in paragraph at lines 119--123
<confusionMatrix.png, id=73, 388.0899pt x 328.8285pt>
File: confusionMatrix.png Graphic file (type png)
<use confusionMatrix.png>
Package pdftex.def Info: confusionMatrix.png used on input line 98.
(pdftex.def) Requested size: 218.17422pt x 184.8602pt.
File: confusionMatrix.png Graphic file (type png)
<use confusionMatrix.png>
Package pdftex.def Info: confusionMatrix.png used on input line 105.
(pdftex.def) Requested size: 218.17422pt x 184.8602pt.
[2 <./confusionMatrix.png>]
<featureImportanceDT.png, id=87, 416.2752pt x 393.8715pt>
File: featureImportanceDT.png Graphic file (type png)
<use featureImportanceDT.png>
Package pdftex.def Info: featureImportanceDT.png used on input line 141.
(pdftex.def) Requested size: 218.17422pt x 206.43103pt.
<featureImportanceRF.png, id=88, 422.0568pt x 393.8715pt>
File: featureImportanceRF.png Graphic file (type png)
<use featureImportanceRF.png>
Package pdftex.def Info: featureImportanceRF.png used on input line 148.
(pdftex.def) Requested size: 218.17422pt x 203.60634pt.
Underfull \vbox (badness 10000) has occurred while \output is active []
[3 <./featureImportanceDT.png> <./featureImportanceRF.png>]
Underfull \hbox (badness 1448) in paragraph at lines 183--187
[]\OT1/ptm/m/n/10 (+20) UC Davis ChemWiki, Prop-a-ga-tion of Er-ror, Avail-
[]
Underfull \hbox (badness 7649) in paragraph at lines 119--123
Underfull \hbox (badness 7649) in paragraph at lines 183--187
\OT1/ptm/m/n/10 (+20) able at: [][]$https : / / chem . libretexts . org / Textbook[]Maps /
[]
Underfull \hbox (badness 10000) in paragraph at lines 119--123
Underfull \hbox (badness 10000) in paragraph at lines 183--187
\OT1/ptm/m/n/10 (+20) Analytical[]Chemistry / Supplemental[]Modules[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 119--123
Underfull \hbox (badness 10000) in paragraph at lines 183--187
\OT1/ptm/m/n/10 (+20) (Analytical[]Chemistry ) /Quantifying[]Nature /
[]
Underfull \hbox (badness 10000) in paragraph at lines 119--123
Underfull \hbox (badness 10000) in paragraph at lines 183--187
\OT1/ptm/m/n/10 (+20) Signi^^Lcant[]Digits / Propagation[]of[]Error$[][], (Ac-cessed:
[]
[4
Package caption Warning: Unused \captionsetup[subfigure] on input line 32.
See the caption package documentation for explanation.
[2]
]
enddocument/afterlastpage: lastpage setting LastPage.
(./MLPproject.aux)
LaTeX Warning: There were multiply-defined labels.
Package rerunfilecheck Info: File `MLPproject.out' has not changed.
(rerunfilecheck) Checksum: A2F9C909152198446A03F747AC01E9F8;1614.
(rerunfilecheck) Checksum: 96A5CBE86300C20E741BB675B1DAD6DE;1585.
)
Here is how much of TeX's memory you used:
18943 strings out of 476041
320588 string characters out of 5793173
19026 strings out of 476041
322014 string characters out of 5793173
1876388 words of memory out of 6000000
38855 multiletter control sequences out of 15000+600000
569328 words of font info for 263 fonts, out of 8000000 for 9000
38900 multiletter control sequences out of 15000+600000
570518 words of font info for 283 fonts, out of 8000000 for 9000
1137 hyphenation exceptions out of 8191
75i,12n,77p,1476b,472s stack positions out of 10000i,1000n,20000p,200000b,200000s
75i,12n,77p,1611b,627s stack positions out of 10000i,1000n,20000p,200000b,200000s
</usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvro8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmr8a.pfb>
Output written on MLPproject.pdf (2 pages, 64293 bytes).
Output written on MLPproject.pdf (4 pages, 159792 bytes).
PDF statistics:
121 PDF objects out of 1000 (max. 8388607)
95 compressed objects within 1 object stream
17 named destinations out of 1000 (max. 500000)
98393 words of extra memory for PDF output out of 106986 (max. 10000000)
140 PDF objects out of 1000 (max. 8388607)
106 compressed objects within 2 object streams
21 named destinations out of 1000 (max. 500000)
98400 words of extra memory for PDF output out of 106986 (max. 10000000)

View File

@@ -7,5 +7,4 @@
\BOOKMARK [1][-]{section.3}{\376\377\000M\000o\000d\000e\000l\000\040\000s\000e\000l\000e\000c\000t\000i\000o\000n}{}% 7
\BOOKMARK [1][-]{section.4}{\376\377\000M\000o\000d\000e\000l\000\040\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000H\000y\000p\000e\000r\000p\000a\000r\000a\000m\000e\000t\000e\000r\000\040\000T\000u\000n\000i\000n\000g}{}% 8
\BOOKMARK [1][-]{section.5}{\376\377\000M\000o\000d\000e\000l\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n\000s}{}% 9
\BOOKMARK [1][-]{section.6}{}{}% 10
\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000f\000e\000r\000e\000n\000c\000e\000s}{}% 11
\BOOKMARK [1][-]{figure.caption.3}{\376\377\000R\000e\000f\000e\000r\000e\000n\000c\000e\000s}{}% 10

Binary file not shown.

Binary file not shown.

View File

@@ -72,7 +72,7 @@
\subsection{Dataset}
%https://www.kaggle.com/datasets/mosapabdelghany/adult-income-prediction-dataset
The dataset we decided to study is a labeled income prediction dataset. This dataset includes 14 features with information about the people in the srudy and a label with the income as either more than 50 000\$ per year or less than or equal to 50 000 \$ per year. This means that we are looking at a binary classification problem. A lot of the features are discrete where only a set number of options available. This includes features such as marital status, education and working class. The dataset features around 32500 data points.
The dataset we decided to study is a labeled income-prediction dataset. It includes 14 features with information about the people in the study and a label giving the income as either more than \$50 000 per year or less than or equal to \$50 000 per year. This means that we are looking at a binary classification problem. Many of the features are categorical, with only a fixed set of options available; this includes features such as marital status, education and working class. The dataset contains around 32 500 data points.
\subsection{Data cleaning and feature engineering}
There were a couple of things in our dataset that had to be modified for it to be usable in our ML application. We find that some of the features are redundant or not interesting for our project. We remove the redundant feature education, since there is another, already numerically encoded feature containing the same data. We also chose to remove the feature 'fnlwgt', since it is an already calculated number used by the Census Bureau to estimate population statistics. Since we want to estimate the population statistics based on the other features, and not on the already calculated weight, we remove this feature. We have a mix of numerical and non-numerical features in our dataset. Since the machine learning models cannot use non-numerical data, we have to encode the non-numerical data into corresponding numbers. This is done with the label encoder built into scikit-learn, applied to all non-numerical data (see the sketch below).
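A minimal sketch of this cleaning and encoding step, assuming the dataset is loaded into a pandas DataFrame (the file name adult.csv is hypothetical):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("adult.csv")                  # hypothetical file name
df = df.drop(columns=["education", "fnlwgt"])  # redundant features, see text

# Encode each non-numerical column with its own label encoder.
for col in df.select_dtypes(include="object").columns:
    df[col] = LabelEncoder().fit_transform(df[col])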
@@ -86,8 +86,72 @@ When selecting the model to use for this project we have to limit us to using mo
\section{Model Training and Hyperparameter Tuning}
During the model training there are some important changes we can make to improve the accuracy of our model. One thing we implement is cross-validation. Since there is a great spread in our data, we choose to use randomized search for the hyperparameter tuning. %Add more here and change type of x-val if needed. How many folds?
Another very important part of the model training is finding the optimal hyperparameters. This is an important step in minimizing the risk of overfitting. Some important hyperparameters in our decision trees are the maximum depth and the minimum sample split. The maximum-depth hyperparameter decides how deep the tree is allowed to go; if a tree is allowed to go very deep, there is a high risk of overfitting. We therefore test multiple different depths and see which values give the best training and validation accuracy. This ensures that we use the optimal depth for our tree. The minimum sample split states how many data points there have to be for a new split to be created. This is also a good measure against overfitting, since if it is very low we risk training on the noise of the data instead of the general trend, and end up overfitting. It is also important that it is not too large, since we then lose information and underfit instead. For the Random Forest there is also the hyperparameter of how many estimators to use, which decides how many trees the forest is built from (see the sketch below).
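A sketch of this search with scikit-learn's RandomizedSearchCV; the parameter ranges, the number of sampled configurations, and the fold count are illustrative assumptions, since the text does not state the exact values used:

from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Illustrative ranges -- the exact grids are not stated in the text.
param_dist = {
    "n_estimators": randint(50, 500),    # number of trees (RF only)
    "max_depth": randint(2, 30),
    "min_samples_split": randint(2, 50),
}
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,      # sampled configurations (assumption)
    cv=5,           # assumed fold count
    random_state=42,
)
search.fit(X_train, y_train)
print(search.best_params_)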
\section{Model Evaluations}
\section{}
There are two interesting parts to look at after our analysis. One part is to analyze how well the models actually performed and to compare the two models we have chosen to study. We fine-tuned our models using the validation part of the data; after running them on the test data we can see how well they actually perform. A great way to get a quick overview of how well a model classifies is to look at the confusion matrix.
\begin{figure}[!hptb]
\centering
\begin{subfigure}[b]{0.9\columnwidth}
\centering
\includegraphics[width=\textwidth]{confusionMatrix.png}
\caption{}
\label{fig:confusionMatrixDT}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.9\columnwidth}
\centering
\includegraphics[width=\textwidth]{confusionMatrix.png}
\caption{}
\label{fig:confusionMatrixRF}
\end{subfigure}
\caption{The confusion matrices of the Decision Tree model and the Random Forest model on the test data.}
\label{fig:confusionMatrices}
\end{figure}
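The matrices can be produced directly from the fitted estimators; a minimal sketch, where best_dt and best_rf are assumed names for the tuned models and X_test, y_test the held-out split described above:

from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# One confusion matrix per fitted model on the test split.
for name, clf in (("Decision Tree", best_dt), ("Random Forest", best_rf)):
    disp = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
    disp.ax_.set_title(name)
plt.show()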
As we can see in the confusion matrices, there is not that big a difference between the models. Both did an overall good job at identifying the two classes, although there is a difference in how well they did on each class: overall they performed a lot better at classifying the poor than the rich. % Add more about the exact numbers!!!
This is a very interesting result, and maybe not as strange as it first seems. There were a lot more poor people than rich people in our training data set, which would of course train our model to be better at classifying the poor. As well as looking at the confusion matrices, it is interesting to look at the actual performance metrics that can be calculated from them. These metrics can be seen in Table~\ref{perfmetric}. Of note is that all of these metrics are calculated as weighted metrics, which means that they account for the class imbalance seen in the confusion matrices.
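Concretely, a weighted metric averages the per-class scores weighted by the class supports (this matches scikit-learn's average="weighted" option); for example, for the F1 score,
\[
\mathrm{F1}_{\mathrm{weighted}} = \sum_{c} \frac{n_c}{n}\,\mathrm{F1}_c,
\qquad
\mathrm{F1}_c = \frac{2\,P_c R_c}{P_c + R_c},
\]
where $P_c$ and $R_c$ are the precision and recall of class $c$, $n_c$ is the number of samples in class $c$, and $n$ is the total number of samples.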
\begin{table}[!htbp]
\centering
\caption{The performance metrics of the models on the validation data.}
\label{perfmetric}
\resizebox{\columnwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
Model&Accuracy&Precision&Recall&F1 Score&Total Time (s)\\
\hline
RF &0.8589&0.8535&0.8589&0.8534&150.8154\\
\hline
DT&0.8483&0.8449&0.8483&0.8462&6.7357
\end{tabular}}
\end{table}
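A sketch of how these weighted metrics can be computed, assuming y_val holds the validation labels and y_pred the corresponding model predictions:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# average="weighted" weights each class by its support, accounting
# for the class imbalance discussed above.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val, y_pred, average="weighted"
)
accuracy = accuracy_score(y_val, y_pred)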
Looking at the values, we see that the difference between our models is not that large: the Random Forest model is on average about 1 percentage point better than the Decision Tree, and all metrics sit at about 0.85, so neither model is highly accurate. Which model is better depends a lot on the priorities. While it is clear that the Random Forest has the better performance, even if only by a little, it is also significantly slower. So for this dataset, was it really worth over 20 times the computational time (cf. Table~\ref{perfmetric}) to get a slightly better result? We are not really sure. The extra computational time is a definite negative, but at the size of this dataset we are only talking about a couple of minutes, which is not too bad. For another dataset the results may be different, and it might be clearer which model is really preferred.
At a first glance at both the confusion matrices and the performance metrics, the models do not look to be that good. But what has to be considered is the data we are analyzing. We are looking at what possible indicators there are for a person earning more than a certain amount of money. This is real-world data, and in the real world there are a lot of different ways of earning money. While there certainly are some indicators that clearly tell that somebody is earning a lot of money, other factors are not as telling. This means that some features are less important than others, which can be seen in the feature-importance graphs in Figures~\ref{fig:featureImportanceDT} and \ref{fig:featureImportanceRF}. It also means that there will be plenty of outliers in the data. No matter how good the model is, it cannot possibly catch all of these outliers; if it did, it would be overfitted. We simply cannot expect a model to have very high accuracy on this type of data set.
Taking a closer look at the feature-importance graphs of the two models, we notice an interesting difference. The Decision Tree, being a single tree, relies on only a few main features, one of which dominates; the rest are used very little or almost not at all. The Random Forest uses a far wider range of features. The two models also rank the features somewhat differently, and the most important feature for one model is not the most important for the other. We considered removing the worst-performing features to see if it would make a difference in performance, but since the models have different worst-performing features, we reasoned that, to keep the comparison as fair as possible, it would be more interesting to leave the features as they are.
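Such a graph comes directly from the fitted model's feature_importances_ attribute; a minimal sketch, assuming X_train is a pandas DataFrame so that column names are available and best_rf is the tuned Random Forest:

import matplotlib.pyplot as plt
import pandas as pd

# feature_importances_ is available on fitted tree-based estimators.
importances = pd.Series(
    best_rf.feature_importances_, index=X_train.columns
).sort_values()
importances.plot.barh()
plt.xlabel("Importance")
plt.tight_layout()
plt.savefig("featureImportanceRF.png")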
% Compare test and validation to verify that we are not overfitting
We spent some time tuning the hyperparameters to ensure that we did not overfit. We can also verify this by comparing the performance on the validation and test sets: if the scores are similar, the model has not been overfitted to the validation data during tuning (a minimal check is sketched below).
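A minimal version of this check, assuming the tuned model and the three splits from Section 2.4 (variable names are assumptions):

from sklearn.metrics import accuracy_score

# Similar validation and test accuracy indicates the tuning did not
# overfit to the validation split.
val_acc = accuracy_score(y_val, best_rf.predict(X_val))
test_acc = accuracy_score(y_test, best_rf.predict(X_test))
print(f"validation: {val_acc:.4f}  test: {test_acc:.4f}")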
\begin{figure}[!hptb]
\centering
\begin{subfigure}[b]{0.9\columnwidth}
\centering
\includegraphics[width=\textwidth]{featureImportanceDT.png}
\caption{}
\label{fig:featureImportanceDT}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.9\columnwidth}
\centering
\includegraphics[width=\textwidth]{featureImportanceRF.png}
\caption{}
\label{fig:featureImportanceRF}
\end{subfigure}
\caption{The feature importance graphs for the Decision Tree model and the Random Forest model.}
\label{fig:featureImportance}
\end{figure}
%----------------------------

View File

@@ -8,6 +8,5 @@
\contentsline {section}{\numberline {3}Model selection}{1}{section.3}%
\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}%
\contentsline {section}{\numberline {5}Model Evaluations}{2}{section.5}%
\contentsline {section}{\numberline {6}}{2}{section.6}%
\contentsline {section}{References}{2}{section.6}%
\contentsline {section}{References}{3}{figure.caption.3}%
\contentsfinish

BIN
Report/confusionMatrix.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB