Maybe done?

This commit is contained in:
2025-10-31 15:36:32 +01:00
parent d587a7064e
commit 52a78af447
14 changed files with 352 additions and 331 deletions

File diff suppressed because one or more lines are too long

Binary file not shown.

Before

Width:  |  Height:  |  Size: 85 KiB

After

Width:  |  Height:  |  Size: 95 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 87 KiB

After

Width:  |  Height:  |  Size: 96 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 86 KiB

After

Width:  |  Height:  |  Size: 96 KiB

View File

@@ -23,49 +23,52 @@
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Handling missing values}{2}{subsection.2.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{2}{subsection.2.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Model selection}{2}{section.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Data cleaning and feature engineering}{2}{subsection.3.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Handling missing values}{2}{subsection.3.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Training, validation and test sets}{2}{subsection.3.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}Model selection}{2}{section.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {5}Model Training and Hyperparameter Tuning}{3}{section.5}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Models and methods used}{3}{subsection.5.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Caveats and restrictions}{3}{subsection.5.2}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}Model Evaluations}{3}{section.6}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Analyzing the Confusion Matricies}{3}{subsection.6.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Models and methods used}{2}{subsection.4.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Validation Results}{2}{subsection.4.2}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces The weighted averages of the performance metrics of the models on the validation data.\relax }}{2}{table.caption.1}\protected@file@percent }
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:featureImportanceDT}{{1(a)}{4}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][3][]4}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{4}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][3][]4}}
\newlabel{fig:featureImportanceRF}{{1(b)}{4}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][3][]4}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{4}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][3][]4}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The confusion matricies of the Decision Tree model and the Random Forest model on the test data.\relax }}{4}{figure.caption.1}\protected@file@percent }
\newlabel{fig:}{{1}{4}{The confusion matricies of the Decision Tree model and the Random Forest model on the test data.\relax }{figure.caption.1}{}}
\newlabel{fig:@cref}{{[figure][1][]1}{[1][3][]4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.2}Analyzing Weighted Performance Metrics}{4}{subsection.6.2}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces The performance metrics of the models on the validation data.\relax }}{4}{table.caption.2}\protected@file@percent }
\newlabel{perfmetric}{{1}{4}{The performance metrics of the models on the validation data.\relax }{table.caption.2}{}}
\newlabel{perfmetric@cref}{{[table][1][]1}{[1][3][]4}}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces The performance metrics of the models on the test data.\relax }}{4}{table.caption.3}\protected@file@percent }
\newlabel{perfmetrictest}{{2}{4}{The performance metrics of the models on the test data.\relax }{table.caption.3}{}}
\newlabel{perfmetrictest@cref}{{[table][2][]2}{[1][3][]4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.3}Analyzing the Performance}{4}{subsection.6.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {6.4}Overfitting and Underfitting}{5}{subsection.6.4}\protected@file@percent }
\newlabel{fig:featureImportanceDT}{{2(a)}{5}{\relax }{figure.caption.4}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][5][]5}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{5}{\relax }{figure.caption.4}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][5][]5}}
\newlabel{fig:featureImportanceRF}{{2(b)}{5}{\relax }{figure.caption.4}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][5][]5}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{5}{\relax }{figure.caption.4}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][5][]5}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }}{5}{figure.caption.4}\protected@file@percent }
\newlabel{fig:}{{2}{5}{The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }{figure.caption.4}{}}
\newlabel{fig:@cref}{{[figure][2][]2}{[1][5][]5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.5}Feature Importance}{5}{subsection.6.5}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {7}Summary}{5}{section.7}\protected@file@percent }
\newlabel{perfmetric}{{1}{2}{The weighted averages of the performance metrics of the models on the validation data.\relax }{table.caption.1}{}}
\newlabel{perfmetric@cref}{{[table][1][]1}{[1][2][]2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Caveats and restrictions}{2}{subsection.4.3}\protected@file@percent }
\newlabel{fig:featureImportanceDT}{{1(a)}{3}{\relax }{figure.caption.2}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][3][]3}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{3}{\relax }{figure.caption.2}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][3][]3}}
\newlabel{fig:featureImportanceRF}{{1(b)}{3}{\relax }{figure.caption.2}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][3][]3}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{3}{\relax }{figure.caption.2}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][3][]3}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The confusion matricies of the Decision Tree model and the Random Forest model on the test data.\relax }}{3}{figure.caption.2}\protected@file@percent }
\newlabel{fig:}{{1}{3}{The confusion matricies of the Decision Tree model and the Random Forest model on the test data.\relax }{figure.caption.2}{}}
\newlabel{fig:@cref}{{[figure][1][]1}{[1][3][]3}}
\@writefile{toc}{\contentsline {section}{\numberline {5}Model Evaluations}{3}{section.5}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Analyzing the Confusion Matricies}{3}{subsection.5.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Analyzing Weighted Performance Metrics}{3}{subsection.5.2}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces The weighted averages of the performance metrics of the models on the test data.\relax }}{3}{table.caption.3}\protected@file@percent }
\newlabel{perfmetrictest}{{2}{3}{The weighted averages of the performance metrics of the models on the test data.\relax }{table.caption.3}{}}
\newlabel{perfmetrictest@cref}{{[table][2][]2}{[1][3][]3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Analyzing the Performance}{4}{subsection.5.3}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Class-wise performance metrics of the Decision Tree.\relax }}{4}{table.caption.4}\protected@file@percent }
\newlabel{dt_metrics}{{3}{4}{Class-wise performance metrics of the Decision Tree.\relax }{table.caption.4}{}}
\newlabel{dt_metrics@cref}{{[table][3][]3}{[1][4][]4}}
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Class-wise performance metrics of the Random Forest.\relax }}{4}{table.caption.5}\protected@file@percent }
\newlabel{rf_metrics}{{4}{4}{Class-wise performance metrics of the Random Forest.\relax }{table.caption.5}{}}
\newlabel{rf_metrics@cref}{{[table][4][]4}{[1][4][]4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Overfitting and Underfitting}{4}{subsection.5.4}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}Feature Importance}{4}{subsection.5.5}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}Summary}{4}{section.6}\protected@file@percent }
\newlabel{fig:featureImportanceDT}{{2(a)}{5}{\relax }{figure.caption.6}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][4][]5}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{5}{\relax }{figure.caption.6}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][4][]5}}
\newlabel{fig:featureImportanceRF}{{2(b)}{5}{\relax }{figure.caption.6}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][4][]5}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{5}{\relax }{figure.caption.6}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][4][]5}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The feature importance graphs for the Decision Tree model and the Random Forest model based on the validation data.\relax }}{5}{figure.caption.6}\protected@file@percent }
\newlabel{fig:}{{2}{5}{The feature importance graphs for the Decision Tree model and the Random Forest model based on the validation data.\relax }{figure.caption.6}{}}
\newlabel{fig:@cref}{{[figure][2][]2}{[1][4][]5}}
\ttl@finishall
\newlabel{LastPage}{{}{5}{}{page.5}{}}
\xdef\lastpage@lastpage{5}

View File

@@ -1,7 +1,7 @@
# Fdb version 4
["pdflatex"] 1761826831.99817 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761826834.06411 0
"/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761826831.67703 25484 b909380ef4186262cc37c215d1d67a9a ""
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc" 1721433600 4850 80dc9bab7f31fb78a000ccfed0e27cab ""
["pdflatex"] 1761920867.71702 "/home/petrus/Documents/MLP/Projects/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761920868.83079 0
"/home/petrus/Documents/MLP/Projects/MLPproject/Report/MLPproject.tex" 1761920867.50658 24533 58032bad0234d994ba6556d7acc5212e ""
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc" 1737590400 4850 80dc9bab7f31fb78a000ccfed0e27cab ""
"/usr/share/texlive/texmf-dist/fonts/map/fontname/texfonts.map" 1577235249 3524 cb3e574dea2d1052e39280babc910dc8 ""
"/usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb7t.tfm" 1136768653 2240 eb56c13537f4d8a0bd3fafc25572b1bd ""
"/usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb8r.tfm" 1136768653 4484 b828043cbd581d289d955903c1339981 ""
@@ -29,7 +29,7 @@
"/usr/share/texlive/texmf-dist/fonts/vf/adobe/helvetic/phvro7t.vf" 1136768653 1372 9948cedecdb0445a3b5cf1b8a8082ab8 ""
"/usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr7t.vf" 1136768653 1380 0ea3a3370054be6da6acd929ec569f06 ""
"/usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr8c.vf" 1136768653 3556 8a9a6dcbcd146ef985683f677f4758a6 ""
"/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii" 1721433600 71627 94eb9990bed73c364d7f53f960cc8c5b ""
"/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii" 1737590400 71627 94eb9990bed73c364d7f53f960cc8c5b ""
"/usr/share/texlive/texmf-dist/tex/generic/atbegshi/atbegshi.sty" 1575674566 24708 5584a51a7101caf7e6bbf1fc27d8f7b1 ""
"/usr/share/texlive/texmf-dist/tex/generic/babel-english/english.ldf" 1496785618 7008 9ff5fdcc865b01beca2b0fe4a46231d4 ""
"/usr/share/texlive/texmf-dist/tex/generic/babel/babel.sty" 1676321701 151363 1f5971af3ef874d432e8fb43e0edb71d ""
@@ -59,13 +59,13 @@
"/usr/share/texlive/texmf-dist/tex/latex/amsmath/amstext.sty" 1654720880 2444 70065bddd85997dc1fd0bb7ae634e5fa ""
"/usr/share/texlive/texmf-dist/tex/latex/atveryend/atveryend.sty" 1576191570 19336 ce7ae9438967282886b3b036cfad1e4d ""
"/usr/share/texlive/texmf-dist/tex/latex/auxhook/auxhook.sty" 1576625391 3935 57aa3c3e203a5c2effb4d2bd2efbc323 ""
"/usr/share/texlive/texmf-dist/tex/latex/base/article.cls" 1721433600 20144 d5ecf0a5140c8d8d8b72cbe86e320eff ""
"/usr/share/texlive/texmf-dist/tex/latex/base/atbegshi-ltx.sty" 1721433600 3052 30236f0cc243a8651b82240dfd2e8b9d ""
"/usr/share/texlive/texmf-dist/tex/latex/base/atveryend-ltx.sty" 1721433600 2462 8ce5f9a9c63002f2c1af03c262cf29af ""
"/usr/share/texlive/texmf-dist/tex/latex/base/fleqn.clo" 1721433600 4807 d162528c27809003cc96755db6ca2bef ""
"/usr/share/texlive/texmf-dist/tex/latex/base/ifthen.sty" 1721433600 5319 48d7f3cfa322abd2788e3c09d624b922 ""
"/usr/share/texlive/texmf-dist/tex/latex/base/inputenc.sty" 1721433600 5048 84b05796b49b69e2d4257d537721c960 ""
"/usr/share/texlive/texmf-dist/tex/latex/base/size10.clo" 1721433600 8448 c33a4e1cb35cee9b33c2b21033b73e39 ""
"/usr/share/texlive/texmf-dist/tex/latex/base/article.cls" 1737590400 20144 d5ecf0a5140c8d8d8b72cbe86e320eff ""
"/usr/share/texlive/texmf-dist/tex/latex/base/atbegshi-ltx.sty" 1737590400 3052 30236f0cc243a8651b82240dfd2e8b9d ""
"/usr/share/texlive/texmf-dist/tex/latex/base/atveryend-ltx.sty" 1737590400 2462 8ce5f9a9c63002f2c1af03c262cf29af ""
"/usr/share/texlive/texmf-dist/tex/latex/base/fleqn.clo" 1737590400 4807 d162528c27809003cc96755db6ca2bef ""
"/usr/share/texlive/texmf-dist/tex/latex/base/ifthen.sty" 1737590400 5319 48d7f3cfa322abd2788e3c09d624b922 ""
"/usr/share/texlive/texmf-dist/tex/latex/base/inputenc.sty" 1737590400 5048 84b05796b49b69e2d4257d537721c960 ""
"/usr/share/texlive/texmf-dist/tex/latex/base/size10.clo" 1737590400 8448 c33a4e1cb35cee9b33c2b21033b73e39 ""
"/usr/share/texlive/texmf-dist/tex/latex/booktabs/booktabs.sty" 1579038678 6078 f1cb470c9199e7110a27851508ed7a5c ""
"/usr/share/texlive/texmf-dist/tex/latex/caption/caption.sty" 1678653221 55778 14d5c99aa26410e440820bb9ea5b8b3a ""
"/usr/share/texlive/texmf-dist/tex/latex/caption/caption3.sty" 1678653221 71836 1a735454ad10692452eb2f2fc37f3865 ""
@@ -126,18 +126,18 @@
"/usr/share/texlive/texmf-dist/tex/latex/tools/calc.sty" 1654720880 10214 de3e21cfc0eccc98ca7f8dac0ef263d2 ""
"/usr/share/texlive/texmf-dist/tex/latex/url/url.sty" 1388531844 12796 8edb7d69a20b857904dd0ea757c14ec9 ""
"/usr/share/texlive/texmf-dist/tex/latex/xcolor/xcolor.sty" 1655066402 56148 51a9a8571c07b9921892ae11063ae853 ""
"/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1721433600 40900 887e0dc8cac988a9e9c574af364cf837 ""
"/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map" 1760290233.68077 4602002 62dba5fc29055c16380d7393a2adb07a ""
"/var/lib/texmf/web2c/pdftex/pdflatex.fmt" 1760289849 7753794 892d611f76aecccd13eb485815d0543e ""
"CM_dt.png" 1761561428.73434 87433 ef7840e96e2e4e7d41f9d29d01517aa6 ""
"CM_rf.png" 1761561428.73495 88928 d3d0474bb68254ae0bba2e635ab99231 ""
"MLPproject.aux" 1761826833.91004 6260 0b50cad3e5dbdb87ae5a15918247acfb "pdflatex"
"MLPproject.out" 1761826833.91204 3758 71b69fe4b092934da11f6db2b9fc27bd "pdflatex"
"MLPproject.tex" 1761826831.67703 25484 b909380ef4186262cc37c215d1d67a9a ""
"MLPproject.toc" 1761826833.91325 1866 d9b68267ee2cdd579efd7907196d8c4b "pdflatex"
"SelfArx.cls" 1761123180.54708 7316 506603b27aab6da8087bc0f1ee693041 ""
"featureImportanceDT.png" 1761328898.24566 60078 4a2e56e2a45ae2ae5e41b9830c1bbcea ""
"featureImportanceRF.png" 1761328962.51602 61794 6b3eefc625dd3da8a3dbf302174c614c ""
"/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1737590400 40900 887e0dc8cac988a9e9c574af364cf837 ""
"/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map" 1761127463.65456 4602601 5d02a5c5d52d7237566d144856366042 ""
"/var/lib/texmf/web2c/pdftex/pdflatex.fmt" 1761127067 7753793 c9f4d2c19ab997188c605d7179b0cdc0 ""
"CM_dt.png" 1761920482.34887 97023 ce9f07bdb4551ffd7f80782b99a54328 ""
"CM_rf.png" 1761920484.96582 98726 a24b8d53317f0e7e65e41ed83ef8fae5 ""
"MLPproject.aux" 1761920868.72356 6698 e699ab45a2056e84f281588212bdf2ec "pdflatex"
"MLPproject.out" 1761920868.72456 3113 d57c5f2b0e6699323b0a2645b9706cce "pdflatex"
"MLPproject.tex" 1761920867.50658 24533 58032bad0234d994ba6556d7acc5212e ""
"MLPproject.toc" 1761920868.72456 1587 d275c5e85ba45c005c3baf7931c510a7 "pdflatex"
"SelfArx.cls" 1761125830.98333 7316 506603b27aab6da8087bc0f1ee693041 ""
"featureImportanceDT.png" 1761403205.10917 60078 4a2e56e2a45ae2ae5e41b9830c1bbcea ""
"featureImportanceRF.png" 1761403205.11075 61794 6b3eefc625dd3da8a3dbf302174c614c ""
(generated)
"MLPproject.aux"
"MLPproject.log"

View File

@@ -1,8 +1,8 @@
PWD /home/jaknyst/Documents/MLPproject/Report
PWD /home/petrus/Documents/MLP/Projects/MLPproject/Report
INPUT /usr/share/texlive/texmf-dist/web2c/texmf.cnf
INPUT /usr/share/texlive/texmf-dist/web2c/texmf.cnf
INPUT /var/lib/texmf/web2c/pdftex/pdflatex.fmt
INPUT /home/jaknyst/Documents/MLPproject/Report/MLPproject.tex
INPUT /home/petrus/Documents/MLP/Projects/MLPproject/Report/MLPproject.tex
OUTPUT MLPproject.log
INPUT ./SelfArx.cls
INPUT ./SelfArx.cls

View File

@@ -1,10 +1,10 @@
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 30 OCT 2025 13:20
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.22) 31 OCT 2025 15:27
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
%&-line parsing enabled.
**/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex
(/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex
**/home/petrus/Documents/MLP/Projects/MLPproject/Report/MLPproject.tex
(/home/petrus/Documents/MLP/Projects/MLPproject/Report/MLPproject.tex
LaTeX2e <2022-11-01> patch level 1
L3 programming layer <2023-02-22> (./SelfArx.cls
Document Class: SelfArx 25/01/2012, v1.0
@@ -689,53 +689,49 @@ File: ts1ptm.fd 2001/06/04 font definitions for TS1/ptm.
] [2]
<CM_dt.png, id=145, 462.528pt x 346.896pt>
<CM_dt.png, id=129, 462.528pt x 346.896pt>
File: CM_dt.png Graphic file (type png)
<use CM_dt.png>
Package pdftex.def Info: CM_dt.png used on input line 112.
Package pdftex.def Info: CM_dt.png used on input line 123.
(pdftex.def) Requested size: 230.29584pt x 172.7224pt.
<CM_rf.png, id=147, 462.528pt x 346.896pt>
<CM_rf.png, id=131, 462.528pt x 346.896pt>
File: CM_rf.png Graphic file (type png)
<use CM_rf.png>
Package pdftex.def Info: CM_rf.png used on input line 119.
Package pdftex.def Info: CM_rf.png used on input line 130.
(pdftex.def) Requested size: 230.29584pt x 172.7224pt.
LaTeX Warning: `!h' float specifier changed to `!ht'.
[3] [4 <./CM_dt.png> <./CM_rf.png>]
<featureImportanceDT.png, id=169, 416.2752pt x 393.8715pt>
[3 <./CM_dt.png> <./CM_rf.png>]
<featureImportanceDT.png, id=147, 416.2752pt x 393.8715pt>
File: featureImportanceDT.png Graphic file (type png)
<use featureImportanceDT.png>
Package pdftex.def Info: featureImportanceDT.png used on input line 179.
Package pdftex.def Info: featureImportanceDT.png used on input line 209.
(pdftex.def) Requested size: 206.0563pt x 194.96999pt.
<featureImportanceRF.png, id=170, 422.0568pt x 393.8715pt>
<featureImportanceRF.png, id=148, 422.0568pt x 393.8715pt>
File: featureImportanceRF.png Graphic file (type png)
<use featureImportanceRF.png>
Package pdftex.def Info: featureImportanceRF.png used on input line 186.
Package pdftex.def Info: featureImportanceRF.png used on input line 216.
(pdftex.def) Requested size: 206.0563pt x 192.29555pt.
[5 <./featureImportanceDT.png> <./featureImportanceRF.png>]
[4] [5 <./featureImportanceDT.png> <./featureImportanceRF.png>]
enddocument/afterlastpage: lastpage setting LastPage.
(./MLPproject.aux)
LaTeX Warning: There were multiply-defined labels.
Package rerunfilecheck Info: File `MLPproject.out' has not changed.
(rerunfilecheck) Checksum: 71B69FE4B092934DA11F6DB2B9FC27BD;3758.
(rerunfilecheck) Checksum: D57C5F2B0E6699323B0A2645B9706CCE;3113.
)
Here is how much of TeX's memory you used:
19063 strings out of 476041
322261 string characters out of 5793173
19066 strings out of 476041
322326 string characters out of 5793173
1878388 words of memory out of 6000000
38908 multiletter control sequences out of 15000+600000
569282 words of font info for 295 fonts, out of 8000000 for 9000
38909 multiletter control sequences out of 15000+600000
569401 words of font info for 297 fonts, out of 8000000 for 9000
1137 hyphenation exceptions out of 8191
75i,12n,77p,1644b,605s stack positions out of 10000i,1000n,20000p,200000b,200000s
75i,12n,77p,1656b,605s stack positions out of 10000i,1000n,20000p,200000b,200000s
</usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvro8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmr8a.pfb>
Output written on MLPproject.pdf (5 pages, 290735 bytes).
Output written on MLPproject.pdf (5 pages, 305630 bytes).
PDF statistics:
205 PDF objects out of 1000 (max. 8388607)
159 compressed objects within 2 object streams
31 named destinations out of 1000 (max. 500000)
92349 words of extra memory for PDF output out of 106986 (max. 10000000)
191 PDF objects out of 1000 (max. 8388607)
148 compressed objects within 2 object streams
30 named destinations out of 1000 (max. 500000)
92325 words of extra memory for PDF output out of 106986 (max. 10000000)

View File

@@ -5,17 +5,14 @@
\BOOKMARK [2][-]{subsection.2.3}{\376\377\000H\000a\000n\000d\000l\000i\000n\000g\000\040\000m\000i\000s\000s\000i\000n\000g\000\040\000v\000a\000l\000u\000e\000s}{section.2}% 5
\BOOKMARK [2][-]{subsection.2.4}{\376\377\000T\000r\000a\000i\000n\000i\000n\000g\000,\000\040\000v\000a\000l\000i\000d\000a\000t\000i\000o\000n\000\040\000a\000n\000d\000\040\000t\000e\000s\000t\000\040\000s\000e\000t\000s}{section.2}% 6
\BOOKMARK [1][-]{section.3}{\376\377\000M\000o\000d\000e\000l\000\040\000s\000e\000l\000e\000c\000t\000i\000o\000n}{}% 7
\BOOKMARK [2][-]{subsection.3.1}{\376\377\000D\000a\000t\000a\000\040\000c\000l\000e\000a\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000f\000e\000a\000t\000u\000r\000e\000\040\000e\000n\000g\000i\000n\000e\000e\000r\000i\000n\000g}{section.3}% 8
\BOOKMARK [2][-]{subsection.3.2}{\376\377\000H\000a\000n\000d\000l\000i\000n\000g\000\040\000m\000i\000s\000s\000i\000n\000g\000\040\000v\000a\000l\000u\000e\000s}{section.3}% 9
\BOOKMARK [2][-]{subsection.3.3}{\376\377\000T\000r\000a\000i\000n\000i\000n\000g\000,\000\040\000v\000a\000l\000i\000d\000a\000t\000i\000o\000n\000\040\000a\000n\000d\000\040\000t\000e\000s\000t\000\040\000s\000e\000t\000s}{section.3}% 10
\BOOKMARK [1][-]{section.4}{\376\377\000M\000o\000d\000e\000l\000\040\000s\000e\000l\000e\000c\000t\000i\000o\000n}{}% 11
\BOOKMARK [1][-]{section.5}{\376\377\000M\000o\000d\000e\000l\000\040\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000H\000y\000p\000e\000r\000p\000a\000r\000a\000m\000e\000t\000e\000r\000\040\000T\000u\000n\000i\000n\000g}{}% 12
\BOOKMARK [2][-]{subsection.5.1}{\376\377\000M\000o\000d\000e\000l\000s\000\040\000a\000n\000d\000\040\000m\000e\000t\000h\000o\000d\000s\000\040\000u\000s\000e\000d}{section.5}% 13
\BOOKMARK [2][-]{subsection.5.2}{\376\377\000C\000a\000v\000e\000a\000t\000s\000\040\000a\000n\000d\000\040\000r\000e\000s\000t\000r\000i\000c\000t\000i\000o\000n\000s}{section.5}% 14
\BOOKMARK [1][-]{section.6}{\376\377\000M\000o\000d\000e\000l\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n\000s}{}% 15
\BOOKMARK [2][-]{subsection.6.1}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000t\000h\000e\000\040\000C\000o\000n\000f\000u\000s\000i\000o\000n\000\040\000M\000a\000t\000r\000i\000c\000i\000e\000s}{section.6}% 16
\BOOKMARK [2][-]{subsection.6.2}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000W\000e\000i\000g\000h\000t\000e\000d\000\040\000P\000e\000r\000f\000o\000r\000m\000a\000n\000c\000e\000\040\000M\000e\000t\000r\000i\000c\000s}{section.6}% 17
\BOOKMARK [2][-]{subsection.6.3}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000t\000h\000e\000\040\000P\000e\000r\000f\000o\000r\000m\000a\000n\000c\000e}{section.6}% 18
\BOOKMARK [2][-]{subsection.6.4}{\376\377\000O\000v\000e\000r\000f\000i\000t\000t\000i\000n\000g\000\040\000a\000n\000d\000\040\000U\000n\000d\000e\000r\000f\000i\000t\000t\000i\000n\000g}{section.6}% 19
\BOOKMARK [2][-]{subsection.6.5}{\376\377\000F\000e\000a\000t\000u\000r\000e\000\040\000I\000m\000p\000o\000r\000t\000a\000n\000c\000e}{section.6}% 20
\BOOKMARK [1][-]{section.7}{\376\377\000S\000u\000m\000m\000a\000r\000y}{}% 21
\BOOKMARK [1][-]{section.4}{\376\377\000M\000o\000d\000e\000l\000\040\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000H\000y\000p\000e\000r\000p\000a\000r\000a\000m\000e\000t\000e\000r\000\040\000T\000u\000n\000i\000n\000g}{}% 8
\BOOKMARK [2][-]{subsection.4.1}{\376\377\000M\000o\000d\000e\000l\000s\000\040\000a\000n\000d\000\040\000m\000e\000t\000h\000o\000d\000s\000\040\000u\000s\000e\000d}{section.4}% 9
\BOOKMARK [2][-]{subsection.4.2}{\376\377\000V\000a\000l\000i\000d\000a\000t\000i\000o\000n\000\040\000R\000e\000s\000u\000l\000t\000s}{section.4}% 10
\BOOKMARK [2][-]{subsection.4.3}{\376\377\000C\000a\000v\000e\000a\000t\000s\000\040\000a\000n\000d\000\040\000r\000e\000s\000t\000r\000i\000c\000t\000i\000o\000n\000s}{section.4}% 11
\BOOKMARK [1][-]{section.5}{\376\377\000M\000o\000d\000e\000l\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n\000s}{}% 12
\BOOKMARK [2][-]{subsection.5.1}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000t\000h\000e\000\040\000C\000o\000n\000f\000u\000s\000i\000o\000n\000\040\000M\000a\000t\000r\000i\000c\000i\000e\000s}{section.5}% 13
\BOOKMARK [2][-]{subsection.5.2}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000W\000e\000i\000g\000h\000t\000e\000d\000\040\000P\000e\000r\000f\000o\000r\000m\000a\000n\000c\000e\000\040\000M\000e\000t\000r\000i\000c\000s}{section.5}% 14
\BOOKMARK [2][-]{subsection.5.3}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000t\000h\000e\000\040\000P\000e\000r\000f\000o\000r\000m\000a\000n\000c\000e}{section.5}% 15
\BOOKMARK [2][-]{subsection.5.4}{\376\377\000O\000v\000e\000r\000f\000i\000t\000t\000i\000n\000g\000\040\000a\000n\000d\000\040\000U\000n\000d\000e\000r\000f\000i\000t\000t\000i\000n\000g}{section.5}% 16
\BOOKMARK [2][-]{subsection.5.5}{\376\377\000F\000e\000a\000t\000u\000r\000e\000\040\000I\000m\000p\000o\000r\000t\000a\000n\000c\000e}{section.5}% 17
\BOOKMARK [1][-]{section.6}{\376\377\000S\000u\000m\000m\000a\000r\000y}{}% 18

Binary file not shown.

Binary file not shown.

View File

@@ -39,16 +39,16 @@
\PaperTitle{Write the title of your report here} % Article title
\Authors{John Smith\textsuperscript{1}*, Jennie Smith\textsuperscript{1}} % Authors
\Authors{Petrus Einarsson\textsuperscript{1}*, Jakob Nyström\textsuperscript{1}*} % Authors
\affiliation{\textsuperscript{1}\textit{Department of Physics, Umeå University, Umeå, Sweden}} % Author affiliation
\affiliation{*\textbf{Corresponding author}: john@smith.com} % Corresponding author
\affiliation{*\textbf{Supervisor}: joe@doe.com}
\Keywords{Optics --- Interference --- Diffraction} % Keywords - if you don't want any simply remove all the text between the curly brackets
\affiliation{*\textbf{Corresponding authors}: peei0011@student.umu.se, jany0047@student.umu.se} % Corresponding author
\affiliation{*\textbf{Supervisor}: shahab.fatemi@umu.se}
\Keywords{} % Keywords - if you don't want any simply remove all the text between the curly brackets
\newcommand{\keywordname}{Keywords} % Defines the keywords heading name
%----------------------------
% ABSTRACT
%----------------------------
\Abstract{We found a dataset that could be used for classification tasks. In order to be able to use this dataset we had to do some feature engineering, handle missing values and do some other data cleaning such as label encoding. We chose two applicable models, the Decision Tree and the Random Forst models. The dataset was divided into training, validation and testing. We tuned hyperparameters to get the best possible validation results and to avoid overfitting. When we were satisfied with our models we found that both models performed about tha same with the Random Forest having about on percentage point better results but with much higher training times. We argue that the weighted accuracies of about 85\% which at a glance might seem bad, actually are reasonable given the nature of our data sets and the choices we made.}
\Abstract{We found a dataset that could be used for classification tasks. In order to be able to use this dataset we had to do some feature engineering, handle missing values and do some other data cleaning such as label encoding. We chose two applicable models, the Decision Tree and the Random Forest models. The dataset was divided into training, validation and testing. We tuned hyperparameters to get the best possible validation results and to avoid overfitting. When we were satisfied with our models we found that both models performed about the same, with the Random Forest having about one percentage point better results but with much higher training times. We argue that the weighted accuracies of about 85\%, which at a glance might seem bad, actually are reasonable given the nature of our data sets and the choices we made.}
%----------------------------
\begin{document}
@@ -67,7 +67,7 @@
%----------------------------
\section{Introduction}
Machine learning techniques have plenty of practical use cases. In this report we find a real world, dataset and train two machine learning models on it to try and get the best results possible.
Machine learning techniques have plenty of practical use cases. An example of an application is using machine learning models to estimate the salary of individuals. This can be practical not only for commercial use, such as recommending relevant products to potential customers, but it can also, for example, be used to better understand what factors are responsible for wealth gaps within societies. In this report we find a real-world dataset covering salaries of adults and train two machine learning models on it to try to get the best results possible.
\section{Data analysis}
% @@ -77,31 +77,42 @@ (leftover diff marker — commented out so the document compiles)
The dataset we decided to study is a labeled income prediction dataset. This dataset includes 14 features with information about the people in the study and a label with the income as either more than \$50 000 per year or less than or equal to \$50 000 per year. This means that we are looking at a binary classification problem. A lot of the features are discrete with only a set number of options available. This includes features such as marital status, education and working class. The dataset features around 32500 data points.
\subsection{Data cleaning and feature engineering}
There were a couple of things with our dataset that had to be modified in order for it to be usable in our ML application. We find that some of the features are redundant or not interesting in our project. We remove the redundant feature 'education' since there is another already numerically encoded feature containing the same data. We also chose to remove the feature 'fnlwgt' since it is an already calculated number that is used by the Census Bureau to estimate population statistics. Since we want to estimate the population statistics based on the other features and not the already calculated weight we remove this feature. We have a mix of numerical and non-numerical features in our dataset. Since the machine learning models cannot use non-numerical data we have to encode the non-numerical data into corresponding numbers. This is done with the label encoder built into scikit-learn and used on all non-numerical data.
\subsection{Handling missing values}
With our numerical version of the dataset we found with the info function in pandas that around 2500 values were NaN values. We reasoned that filling these values with something such as the mean of the category does not make very much sense for our application. Since there are many discrete categories a mean value is meaningless, especially since we gave many categories arbitrary numbers. We therefore decided to only use complete data points. This resulted in removing about 6\% of the total amount of data points or about 2500 data points.
\subsection{Training, validation and test sets}
Before doing any sort of training or analysis on the data, we split it into training, test and validation data. We did this by first splitting a random 20\% of the data into test data. This data is reserved for the final testing of the model and will not be touched until the model is finished. Then we did a further split of the rest of the data where 25\% was designated as validation data. This data will be used for calibration of the model and hyperparameter tuning. The rest of the data which is 60\% of the total data or around 18000 data points will be used to train the model.
\section{Model selection}
When selecting the model to use for this project we have to limit ourselves to using models that are appropriate to the type of problem that we are trying to solve. The problem is a classification task so all models that are used for regression are immediately invalid. There are plenty of different types of classification models left to choose from. Many of them however, are good for data that has non-discrete features. This includes models such as logistic regression, KNN and other similar types of classification models. Also since we have so many features that are non-numerical and converted into arbitrary numbers these types of models would not be optimal. What is left is the Gaussian Naïve Bayes and the different tree based models. Naïve Bayes can be a bit troublesome for this dataset since we have found that some parameters are slightly correlated. However, this does not necessarily make it an inappropriate method as it has been found to perform well despite this strict assumption. Therefore we are left with the tree based models such as the decision tree and random forests. We decided to implement two different types of models. We first do a decision tree and see how good we can get that model to work. We then do a random forest which may not be the absolute best model but since it is a continuation on the decision tree it might be interesting to see if it performs better. We then do analysis on both methods and see if these models are good enough and if there is any meaningful difference between the two.
There were a couple of things with our dataset that had to be modified in order for it to be usable in our ML application. We find that some of the features are redundant or not interesting in our project. We remove the redundant feature 'education' since there is another already numerically encoded feature containing the same data. We also chose to remove the feature 'fnlwgt' since it is an already calculated number that is used by the Census Bureau to estimate population statistics. Since we want to estimate the population statistics based on the other features and not the already calculated weight we remove this feature. We have a mix of numerical and non-numerical features in our dataset. Since the machine learning models cannot use non-numerical data we have to encode the non-numerical data into corresponding numbers. This is done with the label encoder built into scikit-learn and used on all non-numerical data.
\subsection{Data cleaning and feature engineering}
There were a couple of things with our dataset that had to be modified in order for it to be usable in our ML application. We find that some of the features are redundant or not interesting in our project. We remove the redundant feature 'education' since there is another already numerically encoded feature containing the same data. We also chose to remove the feature 'fnlwgt' since it is an already calculated number that is used by the Census Bureau to estimate population statistics. Since we want to estimate the population statistics based on the other features and not the already calculated weight we remove this feature. We have a mix of numerical and non-numerical features in our dataset. Since the machine learning models cannot use non-numerical data we have to encode the non-numerical data into corresponding numbers. This is done with the label encoder built into scikit-learn and used on all non-numerical data.
\subsection{Handling missing values}
With our numerical version of the dataset we found with the info function in pandas that around 2500 values were NaN values. We reasoned that filling these values with something such as the mean of the category does not make very much sense for our application. Since there are many discrete categories a mean value is meaningless, especially since we gave many categories arbitrary numbers. We therefore decided to only use complete data points. This resulted in removing about 6\% of the total amount of data points or about 2500 data points.
\subsection{Training, validation and test sets}
Before doing any sort of training or analysis on the data, we split it into training, test and validation data. We did this by first splitting a random 20\% of the data into test data. This data is reserved for the final testing of the model and will not be touched until the model is finished. Then we did a further split of the rest of the data where 25\% was designated as validation data. This data will be used for calibration of the model and hyperparameter tuning. The rest of the data which is 60\% of the total data or around 18000 data points will be used to train the model.
\section{Model selection}
When selecting the model to use for this project we have to limit ourselves to using models that are appropriate to the type of problem that we are trying to solve. The problem is a classification task so all models that are used for regression are immediately invalid. There are plenty of different types of classification models left to choose from. Many of them however, are good for data that has non-discrete features. This includes models such as logistic regression, KNN and other similar types of classification models. Also since we have so many features that are non-numerical and converted into arbitrary numbers these types of models would not be optimal. At first glance, due to the many discrete features Naïve Bayes could be a possible contender. However, the dataset also includes some continuous features which complicates things. The different versions of Naïve Bayes are not really suited to a mix of discrete and continuous features. Therefore we are left with the tree based models such as the decision tree and random forests. We decided to implement two different types of models. We first do a decision tree and see how good we can get that model to work. We then do a random forest which may not be the absolute best model but since it is a continuation on the decision tree it might be interesting to see if it performs better. We then do analysis on both methods and see if these models are good enough and if there is any meaningful difference between the two.
\section{Model Training and Hyperparameter Tuning}
\subsection{Models and methods used}
During the model training there are some important changes we can make to improve the accuracy of our model. One of the most fundamental procedures was hyperparameter tuning which was performed inside a custom class which performs model optimization and comparison for different models. The class handles the full workflow of tuning the hyperparameters, training the models and recording evaluation metrics. More specifically the method used for hyperparameter tuning is Scikit Learn's GridSearchCV with accuracy as the scoring metric. This method tests different combinations of hyperparameters to establish the best ones. In addition it incorporates cross-validation to prevent overfitting and increase the reliability of the results. For the cross-validation, we used Scikit Learn's stratified k-fold. This type of cross validation is beneficial to use as it preserves the percentage of samples for the classes in each fold, making the model more robust. We used 10 folds for the cross validation, there is of course no "correct" number of folds to use as it's more of a trade off between performance and computational efficiency.
During the model training there are some important changes we can make to improve the accuracy of our model. One of the most fundamental procedures is hyperparameter tuning which was performed inside a custom class which performs model optimization and comparison for different models. The class handles the full workflow of tuning the hyperparameters, training the models and recording evaluation metrics. More specifically the method used for hyperparameter tuning is Scikit Learn's GridSearchCV with accuracy as the scoring metric. This method tests different combinations of hyperparameters to establish the best ones. In addition it incorporates cross-validation to prevent overfitting and increase the reliability of the results. For the cross-validation, we used Scikit Learn's stratified k-fold. This type of cross validation is beneficial to use as it preserves the percentage of samples for the classes in each fold, making the model more robust. We used 10 folds for the cross validation, there is of course no "correct" number of folds to use as it's more of a trade off between performance and computational efficiency.
The hyperparameters included in the grid for the decision tree were the maximum depth and the minimum sample split. The maximum depth hyperparameter decides how deep the tree is allowed to go. If a tree is allowed to go very deep there is a high risk of overfitting, on the contrary, a shallow tree will instead risk underfitting. The minimum sample split states how many data points there has to be for a new split to be created. This is also a good measure against overfitting since if it is very low we risk training the noise of the data instead of the general trend and end up overfitting the data. It is also important that it is not too large since we then lose information and underfit instead. For Random Forest the hyperparameters in the grid were maximum depth, minimum sample split and number of estimators, which decides how many trees are used in the Random Forest algorithm. % Something about XGBoost as well
The hyperparameters included in the grid for the decision tree were the maximum depth and the minimum sample split. The maximum depth hyperparameter decides how deep the tree is allowed to go. If a tree is allowed to go very deep there is a high risk of overfitting, on the contrary, a shallow tree will instead risk underfitting. The minimum sample split states how many data points there has to be for a new split to be created. This is also a good measure against overfitting since if it is very low we risk training the noise of the data instead of the general trend and end up overfitting the data. It is also important that it is not too large since we then lose information and underfit instead. For Random Forest the hyperparameters in the grid were maximum depth, minimum sample split and number of estimators, which decides how many trees are used in the Random Forest algorithm. % Something about XGBoost as well
When performing the hyperparameter tuning, we started out with a rough grid to get a decent estimate of the optimal configuration. From the results we then performed a finer grid search around the optimal configuration. This way we were able to inspect both a wide range and a more precise range without severely increasing the computational load.
\subsection{Validation Results}
Table~\ref{perfmetric} shows the weighted averages of the performance metrics of the validation data for both models.
\begin{table}[!htbp]
\centering
\caption{The weighted averages of the performance metrics of the models on the validation data.}
\label{perfmetric}
\resizebox{\columnwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
Model&Accuracy&Precision&Recall&F1 Score&Total Time\\
\hline
RF &0.8589&0.8535&0.8589&0.8534&150.8154\\
\hline
DT&0.8483&0.8449&0.8483&0.8462&6.7357
\end{tabular}}
\end{table}
\subsection{Caveats and restrictions}
Although the validation results produced from the script are quite promising there are a couple of important notes to make, not only to better understand the final models but also to avoid pitfalls in potential future projects. Firstly, in our script we decided to not use any standardization as this is a sort of unique case where the models used do not require it. However, it's extremely important to understand that if we were to introduce another model, we would need to standardize the data to ensure that the features contribute equally. Secondly, there are more hyperparameters that one might want to consider as we only used a few of them. The problem with expanding the number of hyperparameters in the grid is that it will exponentially increase the computational load. Therefore we picked a few that we thought were most important. Continuing, the scoring metric used is not always the best choice. We used accuracy, meaning the model tries to correctly label as many datapoints as possible and does not care about keeping a similar precision for both labels. Our goal for this project is somewhat arbitrary, we mainly want to train and compare models. However if such a model were to be used in a real world application, one might want to change the scoring to better adapt the model to the problem at hand. % Elaborate... Secondly, there are more hyperparameters that one might want to consider... Continuing, the scoring metric used is not always the best choice. In fact, the scoring metric one should use is highly dependent on what one's goal is...
% @@ -131,43 +142,62 @@ (leftover diff marker — commented out so the document compiles)
As we can see in the confusion matrices there is not that big of a difference between the models. Both did an overall good job at identifying the two classes. There is a difference in how well the models did in identifying the two different classes. Overall they performed a lot better at classifying the poor people than the rich. We can see that both models are pretty good at classifying the poor class and worse at the rich class. The Random forest model is slightly better than the Decision Tree. This is a very interesting result and maybe not so weird as it first seems. There were a lot more poor people in our training data set than rich people. This would of course train our model to be better at classifying the poor. As well as looking at the classification matrices it is interesting to look at the actual performance metrics that can be calculated from the matrices.
As we can see in the confusion matrices there is not that big of a difference between the models. Both did an overall good job at identifying the two classes. There is a difference in how well the models did in identifying the two different classes. Overall they performed a lot better at classifying the lower-earning people than the higher-earning. We can see that both models are pretty good at classifying the lower-earning class and worse at the higher-earning class. The Random forest model is slightly better than the Decision Tree. This is a very interesting result and maybe not so weird as it first seems. There were a lot more lower-earning people in our training data set than higher-earning people. This would of course train our model to be better at classifying the lower-earning individuals. As well as looking at the classification matrices it is interesting to look at the actual performance metrics that can be calculated from the matrices.
\subsection{Analyzing Weighted Performance Metrics}
We want to analyze two sets of metrics. First we have the validation metrics. These metrics can be seen in table~\ref{perfmetric}. Then we have the actual test metrics which is the result from our model. These can be seen in table~\ref{perfmetrictest}. Of note is that all of these metrics are calculated as weighted metrics which means that they account for the class imbalances seen in the confusion matrices.
\begin{table}[!htbp]
\centering
\caption{The performance metrics of the models on the validation data.}
\label{perfmetric}
\resizebox{\columnwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
Model&Accuracy&Precision&Recall&F1 Score&Total Time\\
\hline
RF &0.8589&0.8535&0.8589&0.8534&150.8154\\
\hline
DT&0.8483&0.8449&0.8483&0.8462&6.7357
\end{tabular}}
\end{table}
\begin{table}[!htbp]
\centering
\caption{The performance metrics of the models on the test data.}
\caption{The weighted averages of the performance metrics of the models on the test data.}
\label{perfmetrictest}
\resizebox{0.6\columnwidth}{!}{
\begin{tabular}{c|c|c|c}
Model&Precision&Recall&F1 Score\\
\resizebox{0.8\columnwidth}{!}{
\begin{tabular}{c|c|c|c|c}
Model&Accuracy&Precision&Recall&F1 Score\\
\hline
RF &0.86&0.86&0.86\\
RF &0.86&0.86&0.86&0.86\\
\hline
DT&0.84&0.85&0.84
DT &0.85&0.84&0.85&0.84
\end{tabular}}
\end{table}
Looking at the values we see that the difference between our models is not that large. The Random forest model is on average about 1 percentage point better than the Decision Tree. We can also see that all metrics are at about 0.85. This means that our models are not very accurate and that the differences between them is not that large at all. Which model that is better depends a lot on what is the priority. While it is clear that the Random Forest has the better performance, even by just a little bit, it is also significantly slower on the validation data. So for this dataset, was it really worth 30x the computational time to get a slightly better result? We are not really sure. The extra computational time is a definite negative but at the size of this dataset we are only talking about a couple of minutes which is not too bad. For another dataset the results may be different and it might be clearer which is really the preferred model.
Another thing to consider is the interpretability of the models. Here, there is quite a big difference that could possibly favor one model over the other. Starting with the Decision Tree, because the model's prediction process is quite simple, it is also highly interpretable. We can even plot the decision tree to see how the model handles every feature for a datapoint. This can be beneficial if we want to better understand the model. In contrast, Random Forest uses a more complicated method for prediction as it takes the averages over numerous decision trees with random subsets of features. This means that the model is more or less a black box. The importance of model interpretability is difficult to define as it will vary between different applications and there is even a subjective element to its importance. % Elaborate.
Another thing to consider is the interpretability of the models. Here, there is quite a big difference that could possibly favor one model over the other. Starting with the Decision Tree, because the model's prediction process is quite simple, it is also highly interpretable. We can even plot the decision tree to see how the model handles every feature for a datapoint. This can be beneficial if we want to better understand the model. In contrast, Random Forest uses a more complicated method for prediction as it takes the averages over numerous decision trees with random subsets of features. This means that the model is more or less a black box. The importance of model interpretability is difficult to define as it will vary between different applications. Nevertheless, it's important to understand that for the better performance of Random Forest we are sacrificing a lot of interpretability. % Elaborate.
\subsection{Analyzing the Performance}
Tables~\ref{dt_metrics} and~\ref{rf_metrics} show the class-wise metrics of the Decision Tree and Random Forest, respectively.
\begin{table}[!htbp]
\centering
\caption{Class-wise performance metrics of the Decision Tree.}
\label{dt_metrics}
\resizebox{0.7\columnwidth}{!}{
\begin{tabular}{c|c|c|c}
Class&Precision&Recall&F1 Score\\
\hline
Lower-earning &0.87&0.95&0.90\\
\hline
Higher-earning&0.77&0.56&0.65
\end{tabular}}
\end{table}
\begin{table}[!htbp]
\centering
\caption{Class-wise performance metrics of the Random Forest.}
\label{rf_metrics}
\resizebox{0.7\columnwidth}{!}{
\begin{tabular}{c|c|c|c}
Class&Precision&Recall&F1 Score\\
\hline
Lower-earning &0.89&0.94&0.91\\
\hline
Higher-earning&0.77&0.63&0.70
\end{tabular}}
\end{table}
At a first glance at both the confusion matrices and the performance metrics the models do not look to be that good. But what has to be considered is the data that we are analyzing. We are looking at what possible indicators there are for a person to earn more than a certain amount of money. This is real world data and in the real world there is a lot of unique ways of earning money. While there certainly are some indicators that will clearly tell that somebody is earning a lot of money, there are other factors that are not as telling. This means that some features are less important than others. This can be seen in our models in the feature importance graphs in figures~(\ref{fig:featureImportanceDT}) and (\ref{fig:featureImportanceRF}). This also means that there will be plenty of outliers in the data. No matter how good the model is, it cannot possibly catch all of these outliers. If it did it would be overfitted. We simply cannot expect a model to have very good accuracy on this type of data set.
An important thing to touch on is the poor fit on rich people by our model. We see that only 60--70\% were correctly identified which is quite bad. As we talked about above there may be many data reasons for this poor fit. Of note is that we have optimized this model to find the best accuracy on all data points. We therefore strive to classify as many total data points correctly as possible and not on getting the best average for the classes separately. Since there are more poor people in our dataset it is very reasonable for the model to have optimized for that as well since it gives the best weighted accuracy.
An important thing to touch on is the poor fit on higher-earning people by our model. We see that both models produce a precision of 77\% on the higher-earning individuals, which is quite bad compared to the precision of 87\% and 89\% on the lower-earning individuals. This means that out of all individuals predicted as higher-earning, only 77\% are correctly predicted. Even more notably, there is a very big discrepancy on the recall between the two classes. Recalls of 56\% and 63\% for the higher-earning class compared to 95\% and 94\% for the lower-earning class show that out of all the higher-earning individuals, the models are not good at correctly detecting them as higher-earning. As we talked about above there may be many reasons for this poor fit. Of note is that we have optimized this model to find the best accuracy on all data points. We therefore strive to classify as many total data points correctly as possible and not on getting the best average for the classes separately. Since there are more lower-earning people in our dataset it is very reasonable for the model to have optimized for that as well since it gives the best weighted accuracy. As previously stated, the scoring metrics used for training the models should be adapted based on the problem at hand. If the problem requires similar metrics across the classes, one should instead consider using scoring metrics such as balanced accuracy score, which are adapted to produce such results.
\subsection{Overfitting and Underfitting}
We spent some time tuning the hyperparameters to ensure that we did not overfit. If we compare the validation results with the test results we see that the performance metrics do not change much at all. This is what we want to see as this means that we have avoided overfitting the model. This means that our model could be used on other similar datasets and hopefully give similar performances. We also do not want our model to be underfit. This is a bit harder to validate as we want the errors to be as small as possible for both training and testing and as we stated before we believe that this is a difficult dataset to get a great fit to. Therefore we believe that we have found a model that has a decent enough balance between bias and variance.
% @@ -187,16 +217,16 @@ (leftover diff marker — commented out so the document compiles)
\caption{}
\label{fig:featureImportanceRF}
\end{subfigure}
\caption{The feature importance graphs for the Decision Tree model and the Random Forest model.}
\caption{The feature importance graphs for the Decision Tree model and the Random Forest model based on the validation data.}
\label{fig:}
\end{figure}
\subsection{Feature Importance}
Taking a closer look at the feature importance graphs of the two models we notice an interesting difference. The Decision tree which is only one tree has only a few main features where one is the most important. The rest are not used that much or almost not at all. The Random Forest uses a far wider range of features. They also rank the features a bit differently and the best feature for one model is not the best for the other. We considered removing the worst performing features to see if it would make a difference in the performances. But since they have different worst performing features we reasoned that to keep the comparison as fair as possible it would be more interesting to leave the features as is.
Taking a closer look at the feature importance graphs of the two models we notice an interesting difference. The Decision tree which is only one tree has only a few main features where one is the most important. The rest are not used that much or almost not at all. The Random Forest uses a far wider range of features. They also rank the features a bit differently and the best feature for one model is not the best for the other. We considered removing the worst performing features to see if it would make a difference in the performances. But since they have different results for the worst performing features we reasoned that to keep the comparison as fair as possible it would be more interesting to leave the features as is.
\section{Summary}
We have successfully trained two different but similar machine learning models on classifying the monetary status of people based on a bunch of different features. While some trade offs were made in regards to which features were kept and what we optimized the model for, we still managed to get a respectable result especially regarding the difficult type of data that we had to work with.
We have successfully trained two different but similar machine learning models on classifying the monetary status of people based on a bunch of different features. To help avoid overfitting, find optimal hyperparameters and generally produce a more reliable performance estimate, we performed a grid search combined with cross-validation on our data. Optimizing the models to produce the best accuracies generated a decent result for that specific metric. However, we did find that our models instead performed worse for the other metrics. Since we did not consider a specific application for the model, we argue that the scoring metric should instead be adapted based on one's specific goal.
%---------
% REFERENCE LIST
%----------------------------

% View File (leftover diff artifact — commented out so the document compiles)

@@ -6,18 +6,15 @@
\contentsline {subsection}{\numberline {2.3}Handling missing values}{2}{subsection.2.3}%
\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{2}{subsection.2.4}%
\contentsline {section}{\numberline {3}Model selection}{2}{section.3}%
\contentsline {subsection}{\numberline {3.1}Data cleaning and feature engineering}{2}{subsection.3.1}%
\contentsline {subsection}{\numberline {3.2}Handling missing values}{2}{subsection.3.2}%
\contentsline {subsection}{\numberline {3.3}Training, validation and test sets}{2}{subsection.3.3}%
\contentsline {section}{\numberline {4}Model selection}{2}{section.4}%
\contentsline {section}{\numberline {5}Model Training and Hyperparameter Tuning}{3}{section.5}%
\contentsline {subsection}{\numberline {5.1}Models and methods used}{3}{subsection.5.1}%
\contentsline {subsection}{\numberline {5.2}Caveats and restrictions}{3}{subsection.5.2}%
\contentsline {section}{\numberline {6}Model Evaluations}{3}{section.6}%
\contentsline {subsection}{\numberline {6.1}Analyzing the Confusion Matricies}{3}{subsection.6.1}%
\contentsline {subsection}{\numberline {6.2}Analyzing Weighted Performance Metrics}{4}{subsection.6.2}%
\contentsline {subsection}{\numberline {6.3}Analyzing the Performance}{4}{subsection.6.3}%
\contentsline {subsection}{\numberline {6.4}Overfitting and Underfitting}{5}{subsection.6.4}%
\contentsline {subsection}{\numberline {6.5}Feature Importance}{5}{subsection.6.5}%
\contentsline {section}{\numberline {7}Summary}{5}{section.7}%
\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}%
\contentsline {subsection}{\numberline {4.1}Models and methods used}{2}{subsection.4.1}%
\contentsline {subsection}{\numberline {4.2}Validation Results}{2}{subsection.4.2}%
\contentsline {subsection}{\numberline {4.3}Caveats and restrictions}{2}{subsection.4.3}%
\contentsline {section}{\numberline {5}Model Evaluations}{3}{section.5}%
\contentsline {subsection}{\numberline {5.1}Analyzing the Confusion Matricies}{3}{subsection.5.1}%
\contentsline {subsection}{\numberline {5.2}Analyzing Weighted Performance Metrics}{3}{subsection.5.2}%
\contentsline {subsection}{\numberline {5.3}Analyzing the Performance}{4}{subsection.5.3}%
\contentsline {subsection}{\numberline {5.4}Overfitting and Underfitting}{4}{subsection.5.4}%
\contentsline {subsection}{\numberline {5.5}Feature Importance}{4}{subsection.5.5}%
\contentsline {section}{\numberline {6}Summary}{4}{section.6}%
\contentsfinish

Binary file not shown.