Cleaned the analysis part, started on abstract, introduction and summary parts

This commit is contained in:
2025-10-28 17:08:45 +01:00
parent 55eca37d53
commit 56742bab1e
9 changed files with 133 additions and 110 deletions

View File

@@ -21,45 +21,51 @@
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Dataset}{1}{subsection.2.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Data cleaning and feature engineering}{1}{subsection.2.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Handling missing values}{1}{subsection.2.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{1}{subsection.2.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Model selection}{1}{section.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{2}{subsection.2.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Model selection}{2}{section.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {5}Model Evaluations}{2}{section.5}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Analyzing the Confusion Matrices}{2}{subsection.5.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Analyzing Weighted Performance Metrics}{2}{subsection.5.2}\protected@file@percent }
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:featureImportanceDT}{{1(a)}{2}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][2][]2}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{2}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][2][]2}}
\newlabel{fig:featureImportanceRF}{{1(b)}{2}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][2][]2}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{2}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][2][]2}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The confusion matrices of the Decision Tree model and the Random Forest model on the test data.\relax }}{2}{figure.caption.1}\protected@file@percent }
\newlabel{fig:}{{1}{2}{The confusion matrices of the Decision Tree model and the Random Forest model on the test data.\relax }{figure.caption.1}{}}
\newlabel{fig:@cref}{{[figure][1][]1}{[1][2][]2}}
\newlabel{fig:featureImportanceDT}{{1(a)}{3}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][2][]3}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{3}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][1]1(a)}{[1][2][]3}}
\newlabel{fig:featureImportanceRF}{{1(b)}{3}{\relax }{figure.caption.1}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][2][]3}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{3}{\relax }{figure.caption.1}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][1]1(b)}{[1][2][]3}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The confusion matrices of the Decision Tree model and the Random Forest model on the test data.\relax }}{3}{figure.caption.1}\protected@file@percent }
\newlabel{fig:}{{1}{3}{The confusion matrices of the Decision Tree model and the Random Forest model on the test data.\relax }{figure.caption.1}{}}
\newlabel{fig:@cref}{{[figure][1][]1}{[1][2][]3}}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces The performance metrics of the models on the validation data.\relax }}{3}{table.caption.2}\protected@file@percent }
\newlabel{perfmetric}{{1}{3}{The performance metrics of the models on the validation data.\relax }{table.caption.2}{}}
\newlabel{perfmetric@cref}{{[table][1][]1}{[1][2][]3}}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces The performance metrics of the models on the test data.\relax }}{3}{table.caption.3}\protected@file@percent }
\newlabel{perfmetrictest}{{2}{3}{The performance metrics of the models on the test data.\relax }{table.caption.3}{}}
\newlabel{perfmetrictest@cref}{{[table][2][]2}{[1][2][]3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Analyzing the Performance}{3}{subsection.5.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Overfitting and Underfitting}{3}{subsection.5.4}\protected@file@percent }
\bibstyle{model1-num-names}
\bibcite{Steinhaus:Mathematical}{1}
\bibcite{Greivenkamp:FieldGuide}{2}
\bibcite{Pedrotti:Introduction}{3}
\bibcite{Davis:ChemWiki}{4}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces The performance metrics of the models on the validation data.\relax }}{3}{table.caption.2}\protected@file@percent }
\newlabel{perfmetric}{{1}{3}{The performance metrics of the models on the validation data.\relax }{table.caption.2}{}}
\newlabel{perfmetric@cref}{{[table][1][]1}{[1][2][]3}}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces The performance metrics of the models on the test data.\relax }}{3}{table.caption.3}\protected@file@percent }
\newlabel{perfmetric}{{2}{3}{The performance metrics of the models on the test data.\relax }{table.caption.3}{}}
\newlabel{perfmetric@cref}{{[table][2][]2}{[1][2][]3}}
\newlabel{fig:featureImportanceDT}{{2(a)}{3}{\relax }{figure.caption.4}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][3][]3}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{3}{\relax }{figure.caption.4}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][3][]3}}
\newlabel{fig:featureImportanceRF}{{2(b)}{3}{\relax }{figure.caption.4}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][3][]3}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{3}{\relax }{figure.caption.4}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][3][]3}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }}{3}{figure.caption.4}\protected@file@percent }
\newlabel{fig:}{{2}{3}{The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }{figure.caption.4}{}}
\newlabel{fig:@cref}{{[figure][2][]2}{[1][3][]3}}
\@writefile{toc}{\contentsline {section}{References}{3}{figure.caption.4}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}Feature Importance}{4}{subsection.5.5}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}Summary}{4}{section.6}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{References}{4}{section.6}\protected@file@percent }
\newlabel{fig:featureImportanceDT}{{2(a)}{4}{\relax }{figure.caption.4}{}}
\newlabel{fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][4][]4}}
\newlabel{sub@fig:featureImportanceDT}{{(a)}{4}{\relax }{figure.caption.4}{}}
\newlabel{sub@fig:featureImportanceDT@cref}{{[subfigure][1][2]2(a)}{[1][4][]4}}
\newlabel{fig:featureImportanceRF}{{2(b)}{4}{\relax }{figure.caption.4}{}}
\newlabel{fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][4][]4}}
\newlabel{sub@fig:featureImportanceRF}{{(b)}{4}{\relax }{figure.caption.4}{}}
\newlabel{sub@fig:featureImportanceRF@cref}{{[subfigure][2][2]2(b)}{[1][4][]4}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }}{4}{figure.caption.4}\protected@file@percent }
\newlabel{fig:}{{2}{4}{The feature importance graphs for the Decision Tree model and the Random Forest model.\relax }{figure.caption.4}{}}
\newlabel{fig:@cref}{{[figure][2][]2}{[1][4][]4}}
\ttl@finishall
\newlabel{LastPage}{{}{4}{}{page.4}{}}
\xdef\lastpage@lastpage{4}

View File

@@ -1,6 +1,6 @@
# Fdb version 4
["pdflatex"] 1761561403.82895 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761561404.678 0
"/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761561402.77598 15966 7a773c81e3fe57345294b6a92b54398d ""
["pdflatex"] 1761667677.49056 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761667678.57369 0
"/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761667676.18911 19065 228243c289e2ad8172afdd84483c70d8 ""
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc" 1721433600 4850 80dc9bab7f31fb78a000ccfed0e27cab ""
"/usr/share/texlive/texmf-dist/fonts/map/fontname/texfonts.map" 1577235249 3524 cb3e574dea2d1052e39280babc910dc8 ""
"/usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb7t.tfm" 1136768653 2240 eb56c13537f4d8a0bd3fafc25572b1bd ""
@@ -132,12 +132,13 @@
"/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1721433600 40900 887e0dc8cac988a9e9c574af364cf837 ""
"/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map" 1760290233.68077 4602002 62dba5fc29055c16380d7393a2adb07a ""
"/var/lib/texmf/web2c/pdftex/pdflatex.fmt" 1760289849 7753794 892d611f76aecccd13eb485815d0543e ""
"MLPproject.aux" 1761561404.57999 4945 30825f194591ba7a2d8dc8b06523b8ac "pdflatex"
"MLPproject.out" 1761561404.5809 1585 799028b2b371d09d171b836f5fabb437 "pdflatex"
"MLPproject.tex" 1761561402.77598 15966 7a773c81e3fe57345294b6a92b54398d ""
"MLPproject.toc" 1761561404.58203 847 9491dde8a3fac8c42f1383615deb7f6b "pdflatex"
"CM_dt.png" 1761561428.73434 87433 ef7840e96e2e4e7d41f9d29d01517aa6 ""
"CM_rf.png" 1761561428.73495 88928 d3d0474bb68254ae0bba2e635ab99231 ""
"MLPproject.aux" 1761667678.47436 5727 94a1bcabbe387b476e26c782e4451f3d "pdflatex"
"MLPproject.out" 1761667678.47528 2690 efc895524c82e0378e07184e5720ac35 "pdflatex"
"MLPproject.tex" 1761667676.18911 19065 228243c289e2ad8172afdd84483c70d8 ""
"MLPproject.toc" 1761667678.47528 1375 6a3d7bab2edeea22735da861f6b6003c "pdflatex"
"SelfArx.cls" 1761123180.54708 7316 506603b27aab6da8087bc0f1ee693041 ""
"confusionMatrix.png" 1761329088.9659 21007 cfab042955bc386bac85d53bfe5a6793 ""
"featureImportanceDT.png" 1761328898.24566 60078 4a2e56e2a45ae2ae5e41b9830c1bbcea ""
"featureImportanceRF.png" 1761328962.51602 61794 6b3eefc625dd3da8a3dbf302174c614c ""
(generated)

View File

@@ -1023,15 +1023,16 @@ INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb8r.tfm
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr7t.vf
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr8r.tfm
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr8c.vf
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT confusionMatrix.png
INPUT ./confusionMatrix.png
INPUT ./CM_dt.png
INPUT ./CM_dt.png
INPUT CM_dt.png
INPUT ./CM_dt.png
INPUT ./CM_dt.png
INPUT ./CM_rf.png
INPUT ./CM_rf.png
INPUT CM_rf.png
INPUT ./CM_rf.png
INPUT ./CM_rf.png
INPUT ./featureImportanceDT.png
INPUT ./featureImportanceDT.png
INPUT featureImportanceDT.png

View File

@@ -1,4 +1,4 @@
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 27 OCT 2025 11:36
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 28 OCT 2025 17:07
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
@@ -496,12 +496,6 @@ LaTeX Font Info: Trying to load font information for OT1+ptm on input line 54
File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm.
) (./MLPproject.aux
LaTeX Warning: Label `perfmetric' multiply defined.
LaTeX Warning: Label `perfmetric@cref' multiply defined.
LaTeX Warning: Label `fig:featureImportanceDT' multiply defined.
@@ -688,82 +682,81 @@ LaTeX Font Info: Font shape `OT1/phv/m/it' in size <8> not available
\tf@toc=\write4
\openout4 = `MLPproject.toc'.
LaTeX Font Info: Trying to load font information for TS1+ptm on input line 75.
LaTeX Font Info: Trying to load font information for TS1+ptm on input line 77.
(/usr/share/texlive/texmf-dist/tex/latex/psnfss/ts1ptm.fd
File: ts1ptm.fd 2001/06/04 font definitions for TS1/ptm.
) [1{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}{/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc}
]
<confusionMatrix.png, id=73, 388.0899pt x 328.8285pt>
File: confusionMatrix.png Graphic file (type png)
<use confusionMatrix.png>
Package pdftex.def Info: confusionMatrix.png used on input line 98.
(pdftex.def) Requested size: 218.17422pt x 184.8602pt.
File: confusionMatrix.png Graphic file (type png)
<use confusionMatrix.png>
Package pdftex.def Info: confusionMatrix.png used on input line 105.
(pdftex.def) Requested size: 218.17422pt x 184.8602pt.
[2 <./confusionMatrix.png>]
<featureImportanceDT.png, id=87, 416.2752pt x 393.8715pt>
<CM_dt.png, id=108, 462.528pt x 346.896pt>
File: CM_dt.png Graphic file (type png)
<use CM_dt.png>
Package pdftex.def Info: CM_dt.png used on input line 101.
(pdftex.def) Requested size: 242.41745pt x 181.81612pt.
<CM_rf.png, id=110, 462.528pt x 346.896pt>
File: CM_rf.png Graphic file (type png)
<use CM_rf.png>
Package pdftex.def Info: CM_rf.png used on input line 108.
(pdftex.def) Requested size: 242.41745pt x 181.81612pt.
[2] [3 <./CM_dt.png> <./CM_rf.png>]
<featureImportanceDT.png, id=132, 416.2752pt x 393.8715pt>
File: featureImportanceDT.png Graphic file (type png)
<use featureImportanceDT.png>
Package pdftex.def Info: featureImportanceDT.png used on input line 154.
Package pdftex.def Info: featureImportanceDT.png used on input line 163.
(pdftex.def) Requested size: 218.17422pt x 206.43103pt.
<featureImportanceRF.png, id=88, 422.0568pt x 393.8715pt>
<featureImportanceRF.png, id=133, 422.0568pt x 393.8715pt>
File: featureImportanceRF.png Graphic file (type png)
<use featureImportanceRF.png>
Package pdftex.def Info: featureImportanceRF.png used on input line 161.
Package pdftex.def Info: featureImportanceRF.png used on input line 170.
(pdftex.def) Requested size: 218.17422pt x 203.60634pt.
Underfull \hbox (badness 1448) in paragraph at lines 196--200
Underfull \hbox (badness 1448) in paragraph at lines 206--210
[]\OT1/ptm/m/n/10 (+20) UC Davis ChemWiki, Prop-a-ga-tion of Er-ror, Avail-
[]
Underfull \hbox (badness 7649) in paragraph at lines 196--200
Underfull \hbox (badness 7649) in paragraph at lines 206--210
\OT1/ptm/m/n/10 (+20) able at: [][]$https : / / chem . libretexts . org / Textbook[]Maps /
[]
Underfull \hbox (badness 10000) in paragraph at lines 196--200
Underfull \hbox (badness 10000) in paragraph at lines 206--210
\OT1/ptm/m/n/10 (+20) Analytical[]Chemistry / Supplemental[]Modules[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 196--200
Underfull \hbox (badness 10000) in paragraph at lines 206--210
\OT1/ptm/m/n/10 (+20) (Analytical[]Chemistry ) /Quantifying[]Nature /
[]
Underfull \hbox (badness 10000) in paragraph at lines 196--200
Underfull \hbox (badness 10000) in paragraph at lines 206--210
\OT1/ptm/m/n/10 (+20) Signi^^Lcant[]Digits / Propagation[]of[]Error$[][], (Ac-cessed:
[]
[3 <./featureImportanceDT.png> <./featureImportanceRF.png>] [4
]
[4 <./featureImportanceDT.png> <./featureImportanceRF.png>]
enddocument/afterlastpage: lastpage setting LastPage.
(./MLPproject.aux)
LaTeX Warning: There were multiply-defined labels.
Package rerunfilecheck Info: File `MLPproject.out' has not changed.
(rerunfilecheck) Checksum: 799028B2B371D09D171B836F5FABB437;1585.
(rerunfilecheck) Checksum: EFC895524C82E0378E07184E5720AC35;2690.
)
Here is how much of TeX's memory you used:
19027 strings out of 476041
322029 string characters out of 5793173
19072 strings out of 476041
322455 string characters out of 5793173
1876388 words of memory out of 6000000
38900 multiletter control sequences out of 15000+600000
570518 words of font info for 283 fonts, out of 8000000 for 9000
38913 multiletter control sequences out of 15000+600000
571503 words of font info for 300 fonts, out of 8000000 for 9000
1137 hyphenation exceptions out of 8191
75i,12n,77p,1611b,619s stack positions out of 10000i,1000n,20000p,200000b,200000s
75i,12n,77p,1611b,605s stack positions out of 10000i,1000n,20000p,200000b,200000s
</usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvro8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmr8a.pfb>
Output written on MLPproject.pdf (4 pages, 160208 bytes).
Output written on MLPproject.pdf (4 pages, 294157 bytes).
PDF statistics:
142 PDF objects out of 1000 (max. 8388607)
108 compressed objects within 2 object streams
22 named destinations out of 1000 (max. 500000)
98400 words of extra memory for PDF output out of 106986 (max. 10000000)
181 PDF objects out of 1000 (max. 8388607)
139 compressed objects within 2 object streams
28 named destinations out of 1000 (max. 500000)
98453 words of extra memory for PDF output out of 106986 (max. 10000000)

View File

@@ -7,4 +7,10 @@
\BOOKMARK [1][-]{section.3}{\376\377\000M\000o\000d\000e\000l\000\040\000s\000e\000l\000e\000c\000t\000i\000o\000n}{}% 7
\BOOKMARK [1][-]{section.4}{\376\377\000M\000o\000d\000e\000l\000\040\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000H\000y\000p\000e\000r\000p\000a\000r\000a\000m\000e\000t\000e\000r\000\040\000T\000u\000n\000i\000n\000g}{}% 8
\BOOKMARK [1][-]{section.5}{\376\377\000M\000o\000d\000e\000l\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n\000s}{}% 9
\BOOKMARK [1][-]{figure.caption.4}{\376\377\000R\000e\000f\000e\000r\000e\000n\000c\000e\000s}{}% 10
\BOOKMARK [2][-]{subsection.5.1}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000t\000h\000e\000\040\000C\000o\000n\000f\000u\000s\000i\000o\000n\000\040\000M\000a\000t\000r\000i\000c\000e\000s}{section.5}% 10
\BOOKMARK [2][-]{subsection.5.2}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000W\000e\000i\000g\000h\000t\000e\000d\000\040\000P\000e\000r\000f\000o\000r\000m\000a\000n\000c\000e\000\040\000M\000e\000t\000r\000i\000c\000s}{section.5}% 11
\BOOKMARK [2][-]{subsection.5.3}{\376\377\000A\000n\000a\000l\000y\000z\000i\000n\000g\000\040\000t\000h\000e\000\040\000P\000e\000r\000f\000o\000r\000m\000a\000n\000c\000e}{section.5}% 12
\BOOKMARK [2][-]{subsection.5.4}{\376\377\000O\000v\000e\000r\000f\000i\000t\000t\000i\000n\000g\000\040\000a\000n\000d\000\040\000U\000n\000d\000e\000r\000f\000i\000t\000t\000i\000n\000g}{section.5}% 13
\BOOKMARK [2][-]{subsection.5.5}{\376\377\000F\000e\000a\000t\000u\000r\000e\000\040\000I\000m\000p\000o\000r\000t\000a\000n\000c\000e}{section.5}% 14
\BOOKMARK [1][-]{section.6}{\376\377\000S\000u\000m\000m\000a\000r\000y}{}% 15
\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000f\000e\000r\000e\000n\000c\000e\000s}{}% 16

Binary file not shown.

Binary file not shown.

View File

@@ -48,7 +48,7 @@
%----------------------------
% ABSTRACT
%----------------------------
\Abstract{}
\Abstract{We found a dataset that could be used for classification tasks. To make it usable we performed feature engineering, handled missing values and did further data cleaning such as label encoding. We chose two applicable models, the Decision Tree and the Random Forest. The dataset was divided into training, validation and test sets. We tuned hyperparameters to get the best possible validation results and to avoid overfitting. Once satisfied with our models we found that both performed about the same, with the Random Forest scoring roughly one percentage point better but with much higher training times. We argue that the weighted accuracies of about 85\%, which at a glance might seem bad, are actually reasonable given the nature of our dataset and the choices we made.}
%----------------------------
\begin{document}
@@ -67,6 +67,8 @@
%----------------------------
\section{Introduction}
Machine learning techniques have plenty of practical use cases. In this report we find a real-world dataset and train two machine learning models on it, trying to get the best results possible.
\section{Data analysis}
@@ -89,20 +91,21 @@ Another very important part of the model training is finding the optimal hyperpa
\section{Model Evaluations}
There are two interesting things to look at after our analysis: how well the actual models performed, and how the two models we have chosen to study compare. We fine-tuned our models using the validation part of the data; running them on the test data then shows how well they actually perform. A great way to get a quick overview of how well a model classifies is to look at the confusion matrix.
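Confusion matrices like the ones analyzed below can be produced with a few lines of scikit-learn. The sketch that follows is illustrative only, not our actual pipeline: it uses a synthetic imbalanced dataset and default models, and all names and parameters are placeholders.
\begin{verbatim}
# Sketch (not our exact pipeline): confusion-matrix figures for a
# Decision Tree and a Random Forest on a held-out test split.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Synthetic stand-in for the real data, with a 3:1 class imbalance.
X, y = make_classification(n_samples=2000, weights=[0.75], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          random_state=0)

for name, model in [("CM_dt", DecisionTreeClassifier(random_state=0)),
                    ("CM_rf", RandomForestClassifier(random_state=0))]:
    model.fit(X_tr, y_tr)
    disp = ConfusionMatrixDisplay.from_estimator(model, X_te, y_te)
    disp.figure_.savefig(name + ".png")
\end{verbatim}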
\subsection{Analyzing the Confusion Matrices}
\begin{figure}[!hptb]
\centering
\begin{subfigure}[b]{0.9\columnwidth}
\begin{subfigure}[b]{\columnwidth}
\centering
\includegraphics[width=\textwidth]{confusionMatrix.png}
\includegraphics[width=\textwidth]{CM_dt.png}
\caption{}
\label{fig:featureImportanceDT}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.9\columnwidth}
\begin{subfigure}[b]{\columnwidth}
\centering
\includegraphics[width=\textwidth]{confusionMatrix.png}
\includegraphics[width=\textwidth]{CM_rf.png}
\caption{}
\label{fig:featureImportanceRF}
\end{subfigure}
@@ -110,8 +113,9 @@ There are two interesting parts to look at after our analysis. One part is to an
\label{fig:}
\end{figure}
As we can see in the confusion matrices there is not that big of a difference between the models. Both did an overall good job at identifying the two classes. There is a difference in how well the models did in identifying the two different classes. Overall they performed a lot better at classifying the poor people than the rich. % Add more about the exact numbers!!!
This is a very interesting result and maybe not so weird as it first seems. There were a lot more poor people in our training data set than rich people. This would of course train our model to be better at classifying the poor. As well as looking at the classification matrices it is interesting to look at the actual performance metrics that can be calculated from the matrices. These metrics can be seen in table(\ref{perfmetric}). Of note is that all of these metrics are calculated as weighted metrics which means that they account for the class imbalances seen in the confusion matrices.
As we can see in the confusion matrices, there is not that big a difference between the models. Both did an overall good job at identifying the two classes, but not equally well: both are pretty good at classifying the poor class and noticeably worse at the rich class, with the Random Forest slightly ahead of the Decision Tree. This is an interesting result, and perhaps not as strange as it first seems. There were far more poor people than rich people in our training data, which naturally trains the models to be better at classifying the poor. Besides the confusion matrices themselves, it is also interesting to look at the performance metrics that can be calculated from them.
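For intuition: if, hypothetically, three out of four people in the data were poor, a trivial classifier that always predicts ``poor'' would already reach $75\%$ accuracy, so some asymmetry in per-class performance is to be expected under class imbalance.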
\subsection{Analyzing Weighted Performance Metrics}
We want to analyze two sets of metrics. First we have the validation metrics, which can be seen in table(\ref{perfmetric}). Then we have the actual test metrics, the final results of our models, which can be seen in table(\ref{perfmetrictest}). Of note is that all of these metrics are calculated as weighted metrics, which means that they account for the class imbalances seen in the confusion matrices.
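Concretely, assuming the standard class-frequency weighting (e.g.\ scikit-learn's \texttt{average='weighted'}), each weighted metric is the per-class metric averaged by class support: with $n_c$ samples of class $c$ out of $N$ in total,
\begin{equation}
M_{\mathrm{weighted}} = \sum_{c} \frac{n_c}{N}\, M_c, \qquad M \in \{\mathrm{Precision},\ \mathrm{Recall},\ \mathrm{F1}\}.
\end{equation}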
\begin{table}[!htbp]
\centering
\caption{The performance metrics of the models on the validation data.}
@@ -128,24 +132,29 @@ This is a very interesting result and maybe not so weird as it first seems. Ther
\begin{table}[!htbp]
\centering
\caption{The performance metrics of the models on the test data.}
\label{perfmetric}
\resizebox{\columnwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
Model&Accuracy&Precision&Recall&F1 Score&Total Time\\
\label{perfmetrictest}
\resizebox{0.6\columnwidth}{!}{
\begin{tabular}{c|c|c|c}
Model&Precision&Recall&F1 Score\\
\hline
RF &0.8589&0.8535&0.8589&0.8534&150.8154\\
RF &0.86&0.86&0.86\\
\hline
DT&0.8483&0.8449&0.8483&0.8462&6.7357
DT&0.84&0.85&0.84
\end{tabular}}
\end{table}
Looking at the values we see that the difference between our models is not that large. The Random Forest model is on average about one percentage point better than the Decision Tree. We can also see that all metrics are at about 0.85. This means that our models are not very accurate and that the differences between them are not that large at all. Which model is better depends a lot on what the priority is. While it is clear that the Random Forest has the better performance, even by just a little bit, it is also significantly slower. So for this dataset, was it really worth 30x the computational time to get a slightly better result? We are not really sure. The extra computational time is a definite negative but at the size of this dataset we are only talking about a couple of minutes, which is not too bad. For another dataset the results may be different and it might be clearer which is really the preferred model.
Looking at the values we see that the difference between our models is not that large. The Random Forest model is on average about one percentage point better than the Decision Tree. We can also see that all metrics sit at about 0.85. This means that our models are not very accurate and that the differences between them are small. Which model is better depends a lot on what the priority is. While it is clear that the Random Forest has the better performance, even by just a little bit, it is also significantly slower on the validation data. So for this dataset, was it really worth 30x the computational time to get a slightly better result? We are not really sure. The extra computational time is a definite negative but at the size of this dataset we are only talking about a couple of minutes, which is not too bad. For another dataset the results may be different and it might be clearer which is really the preferred model.
\subsection{Analyzing the Performance}
At first glance at both the confusion matrices and the performance metrics, the models do not look that good. But what has to be considered is the data we are analyzing. We are looking at possible indicators for a person earning more than a certain amount of money. This is real-world data, and in the real world there are a lot of unique ways of earning money. While some indicators clearly tell that somebody is earning a lot of money, other factors are not as telling. This means that some features are less important than others, which can be seen in the feature importance graphs in figure(\ref{fig:featureImportanceDT}) and (\ref{fig:featureImportanceRF}). It also means that there will be plenty of outliers in the data. No matter how good the model is, it cannot possibly catch all of these outliers; if it did, it would be overfitted. We simply cannot expect a model to have very good accuracy on this type of dataset.
An important thing to touch on is our models' poor fit on rich people: only 60--70\% were correctly identified, which is quite bad. As discussed above, there may be many reasons in the data for this poor fit. Of note is that we optimized the models for the best accuracy over all data points. We therefore strive to classify as many total data points correctly as possible, not to get the best average for the classes separately. Since there are more poor people in our dataset, it is very reasonable for the models to have optimized for that class as well, since it gives the best weighted accuracy.
\subsection{Overfitting and Underfitting}
We spent some time tuning the hyperparameters to ensure that we did not overfit. If we compare the validation results with the test results we see that the performance metrics barely change at all. This is what we want to see, as it means that we have avoided overfitting the model, and that our models could be used on other similar datasets and hopefully give similar performances. We also do not want our models to be underfit. This is a bit harder to validate, since we want the errors to be as small as possible for both training and testing and, as stated before, we believe this is a difficult dataset to get a great fit to. We therefore believe that we have found models with a decent enough balance between bias and variance.
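This comparison is easy to script; a minimal sketch assuming scikit-learn, where \texttt{model}, \texttt{X\_val}, \texttt{y\_val}, \texttt{X\_test} and \texttt{y\_test} are placeholders for a fitted classifier and our validation and test splits:
\begin{verbatim}
# Sketch: a large validation-to-test gap would suggest that the
# hyperparameter tuning overfitted the validation set.
from sklearn.metrics import f1_score

val_f1 = f1_score(y_val, model.predict(X_val), average="weighted")
test_f1 = f1_score(y_test, model.predict(X_test), average="weighted")
print(f"val={val_f1:.4f} test={test_f1:.4f} gap={val_f1 - test_f1:+.4f}")
\end{verbatim}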
\subsection{Feature Importance}
Taking a closer look at the feature importance graphs of the two models we notice an interesting difference. The Decision Tree, being a single tree, relies on only a few main features, with one clearly the most important; the rest are used very little or almost not at all. The Random Forest uses a far wider range of features. The models also rank the features somewhat differently, and the most important feature for one model is not the most important for the other. We considered removing the worst-performing features to see if it would make a difference in the performances, but since the models have different worst-performing features, we reasoned that keeping the features as they are makes for the fairest comparison.
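Assuming the graphs were produced from scikit-learn's impurity-based \texttt{feature\_importances\_} attribute, ranking the features takes only a few lines; \texttt{model} and \texttt{feature\_names} below are placeholders for a fitted tree or forest and our column names:
\begin{verbatim}
# Sketch: list features from most to least important for a fitted
# DecisionTreeClassifier or RandomForestClassifier.
import numpy as np

importances = model.feature_importances_
for i in np.argsort(importances)[::-1]:
    print(f"{feature_names[i]:<25} {importances[i]:.3f}")
\end{verbatim}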
% Compare test and validation to verify that we are not overfitting
We spent some time tuning the hyperparameters to ensure that we did not overfit. We can also see if we
\begin{figure}[!hptb]
\centering
@@ -166,8 +175,9 @@ We spent some time tuning the hyperparameters to ensure that we did not overfit.
\label{fig:}
\end{figure}
%----------------------------
\section{Summary}
We have successfully trained two different but similar machine learning models to classify the monetary status of people based on a range of different features. While some trade-offs were made regarding which features were kept and what we optimized the models for, we still managed to get a respectable result, especially considering the difficult type of data we had to work with.
%---------
% REFERENCE LIST
%----------------------------
\bibliographystyle{model1-num-names}

View File

@@ -4,9 +4,15 @@
\contentsline {subsection}{\numberline {2.1}Dataset}{1}{subsection.2.1}%
\contentsline {subsection}{\numberline {2.2}Data cleaning and feature engineering}{1}{subsection.2.2}%
\contentsline {subsection}{\numberline {2.3}Handling missing values}{1}{subsection.2.3}%
\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{1}{subsection.2.4}%
\contentsline {section}{\numberline {3}Model selection}{1}{section.3}%
\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{2}{subsection.2.4}%
\contentsline {section}{\numberline {3}Model selection}{2}{section.3}%
\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}%
\contentsline {section}{\numberline {5}Model Evaluations}{2}{section.5}%
\contentsline {section}{References}{3}{figure.caption.4}%
\contentsline {subsection}{\numberline {5.1}Analyzing the Confusion Matrices}{2}{subsection.5.1}%
\contentsline {subsection}{\numberline {5.2}Analyzing Weighted Performance Metrics}{2}{subsection.5.2}%
\contentsline {subsection}{\numberline {5.3}Analyzing the Performance}{3}{subsection.5.3}%
\contentsline {subsection}{\numberline {5.4}Overfitting and Underfitting}{3}{subsection.5.4}%
\contentsline {subsection}{\numberline {5.5}Feature Importance}{4}{subsection.5.5}%
\contentsline {section}{\numberline {6}Summary}{4}{section.6}%
\contentsline {section}{References}{4}{section.6}%
\contentsfinish