Continued work on report until Model training part
This commit is contained in:
@@ -15,22 +15,23 @@
|
||||
\gdef\HyperFirstAtBeginDocument#1{#1}
|
||||
\providecommand\HyField@AuxAddToFields[1]{}
|
||||
\providecommand\HyField@AuxAddToCoFields[2]{}
|
||||
\bibstyle{model1-num-names}
|
||||
\bibcite{Steinhaus:Mathematical}{1}
|
||||
\bibcite{Greivenkamp:FieldGuide}{2}
|
||||
\bibcite{Pedrotti:Introduction}{3}
|
||||
\bibcite{Davis:ChemWiki}{4}
|
||||
\babel@aux{english}{}
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {2}Data analysis}{1}{section.2}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Dataset}{1}{subsection.2.1}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Data cleaning and feature engineering}{1}{subsection.2.2}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Handling missing values}{1}{subsection.2.3}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{1}{subsection.2.4}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {3}Model selection}{1}{section.3}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{1}{section.4}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {5}Model Evaluations}{1}{section.5}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {6}}{1}{section.6}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{References}{1}{section.6}\protected@file@percent }
|
||||
\bibstyle{model1-num-names}
|
||||
\bibcite{Steinhaus:Mathematical}{1}
|
||||
\bibcite{Greivenkamp:FieldGuide}{2}
|
||||
\bibcite{Pedrotti:Introduction}{3}
|
||||
\bibcite{Davis:ChemWiki}{4}
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {5}Model Evaluations}{2}{section.5}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {6}}{2}{section.6}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{References}{2}{section.6}\protected@file@percent }
|
||||
\ttl@finishall
|
||||
\newlabel{LastPage}{{}{2}{}{page.2}{}}
|
||||
\xdef\lastpage@lastpage{2}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Fdb version 4
|
||||
["pdflatex"] 1761129238.9062 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761129240.39559 0
|
||||
"/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761129236.61965 6049 c7e7910f66e6a5624dd2e411a4d86264 ""
|
||||
["pdflatex"] 1761133861.19469 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761133862.86789 0
|
||||
"/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761133860.76481 9644 b114a2966591f23d2a80de0b71664c7d ""
|
||||
"/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc" 1721433600 4850 80dc9bab7f31fb78a000ccfed0e27cab ""
|
||||
"/usr/share/texlive/texmf-dist/fonts/map/fontname/texfonts.map" 1577235249 3524 cb3e574dea2d1052e39280babc910dc8 ""
|
||||
"/usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb7t.tfm" 1136768653 2240 eb56c13537f4d8a0bd3fafc25572b1bd ""
|
||||
@@ -132,10 +132,10 @@
|
||||
"/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1721433600 40900 887e0dc8cac988a9e9c574af364cf837 ""
|
||||
"/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map" 1760290233.68077 4602002 62dba5fc29055c16380d7393a2adb07a ""
|
||||
"/var/lib/texmf/web2c/pdftex/pdflatex.fmt" 1760289849 7753794 892d611f76aecccd13eb485815d0543e ""
|
||||
"MLPproject.aux" 1761129240.22236 2066 62a9c5cb48c4e643fe8b2fb1bba9b77b "pdflatex"
|
||||
"MLPproject.out" 1761129240.22436 1374 b4a2caeadb43696bbe5350199c3331b3 "pdflatex"
|
||||
"MLPproject.tex" 1761129236.61965 6049 c7e7910f66e6a5624dd2e411a4d86264 ""
|
||||
"MLPproject.toc" 1761129240.22436 796 63fff7a313a297867c97aa2dfbafdb7f "pdflatex"
|
||||
"MLPproject.aux" 1761133862.68081 2207 83760043bb554d7220df5afc55d586db "pdflatex"
|
||||
"MLPproject.out" 1761133862.68281 1614 a2f9c909152198446a03f747ac01e9f8 "pdflatex"
|
||||
"MLPproject.tex" 1761133860.76481 9644 b114a2966591f23d2a80de0b71664c7d ""
|
||||
"MLPproject.toc" 1761133862.68381 896 462daffa7f338139a1f72b531978e0ba "pdflatex"
|
||||
"SelfArx.cls" 1761123180.54708 7316 506603b27aab6da8087bc0f1ee693041 ""
|
||||
(generated)
|
||||
"MLPproject.aux"
|
||||
|
||||
@@ -996,8 +996,6 @@ INPUT /usr/share/texlive/texmf-dist/tex/latex/psnfss/ts1ptm.fd
|
||||
INPUT /usr/share/texlive/texmf-dist/tex/latex/psnfss/ts1ptm.fd
|
||||
INPUT /usr/share/texlive/texmf-dist/tex/latex/psnfss/ts1ptm.fd
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr8c.tfm
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr7t.tfm
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr7t.tfm
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/helvetic/phvb7t.vf
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb8r.tfm
|
||||
INPUT /var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map
|
||||
@@ -1025,6 +1023,8 @@ INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb8r.tfm
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr7t.vf
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr8r.tfm
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmr8c.vf
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr7t.tfm
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmr7t.tfm
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/vf/adobe/times/ptmb7t.vf
|
||||
INPUT /usr/share/texlive/texmf-dist/fonts/tfm/adobe/times/ptmb8r.tfm
|
||||
INPUT MLPproject.aux
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 22 OCT 2025 12:33
|
||||
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 22 OCT 2025 13:51
|
||||
entering extended mode
|
||||
restricted \write18 enabled.
|
||||
file:line:error style messages enabled.
|
||||
@@ -654,60 +654,57 @@ LaTeX Font Info: Font shape `OT1/phv/m/it' in size <8> not available
|
||||
LaTeX Font Info: Trying to load font information for TS1+ptm on input line 75.
|
||||
(/usr/share/texlive/texmf-dist/tex/latex/psnfss/ts1ptm.fd
|
||||
File: ts1ptm.fd 2001/06/04 font definitions for TS1/ptm.
|
||||
)
|
||||
Underfull \hbox (badness 1448) in paragraph at lines 116--120
|
||||
) [1{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}{/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc}
|
||||
|
||||
|
||||
]
|
||||
Underfull \hbox (badness 1448) in paragraph at lines 119--123
|
||||
[]\OT1/ptm/m/n/10 (+20) UC Davis ChemWiki, Prop-a-ga-tion of Er-ror, Avail-
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 7649) in paragraph at lines 116--120
|
||||
Underfull \hbox (badness 7649) in paragraph at lines 119--123
|
||||
\OT1/ptm/m/n/10 (+20) able at: [][]$https : / / chem . libretexts . org / Textbook[]Maps /
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 116--120
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 119--123
|
||||
\OT1/ptm/m/n/10 (+20) Analytical[]Chemistry / Supplemental[]Modules[]
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 116--120
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 119--123
|
||||
\OT1/ptm/m/n/10 (+20) (Analytical[]Chemistry ) /Quantifying[]Nature /
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 116--120
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 119--123
|
||||
\OT1/ptm/m/n/10 (+20) Signi^^Lcant[]Digits / Propagation[]of[]Error$[][], (Ac-cessed:
|
||||
[]
|
||||
|
||||
[1{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}{/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc}
|
||||
|
||||
|
||||
]
|
||||
|
||||
Package caption Warning: Unused \captionsetup[subfigure] on input line 32.
|
||||
See the caption package documentation for explanation.
|
||||
|
||||
[2
|
||||
|
||||
]
|
||||
[2]
|
||||
enddocument/afterlastpage: lastpage setting LastPage.
|
||||
(./MLPproject.aux)
|
||||
Package rerunfilecheck Info: File `MLPproject.out' has not changed.
|
||||
(rerunfilecheck) Checksum: B4A2CAEADB43696BBE5350199C3331B3;1374.
|
||||
(rerunfilecheck) Checksum: A2F9C909152198446A03F747AC01E9F8;1614.
|
||||
)
|
||||
Here is how much of TeX's memory you used:
|
||||
18917 strings out of 476041
|
||||
320357 string characters out of 5793173
|
||||
1878388 words of memory out of 6000000
|
||||
38854 multiletter control sequences out of 15000+600000
|
||||
568376 words of font info for 247 fonts, out of 8000000 for 9000
|
||||
18943 strings out of 476041
|
||||
320588 string characters out of 5793173
|
||||
1876388 words of memory out of 6000000
|
||||
38855 multiletter control sequences out of 15000+600000
|
||||
569328 words of font info for 263 fonts, out of 8000000 for 9000
|
||||
1137 hyphenation exceptions out of 8191
|
||||
75i,12n,77p,1049b,626s stack positions out of 10000i,1000n,20000p,200000b,200000s
|
||||
75i,12n,77p,1476b,472s stack positions out of 10000i,1000n,20000p,200000b,200000s
|
||||
</usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/helvetic/uhvro8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmr8a.pfb>
|
||||
Output written on MLPproject.pdf (2 pages, 59091 bytes).
|
||||
Output written on MLPproject.pdf (2 pages, 64293 bytes).
|
||||
PDF statistics:
|
||||
115 PDF objects out of 1000 (max. 8388607)
|
||||
90 compressed objects within 1 object stream
|
||||
16 named destinations out of 1000 (max. 500000)
|
||||
98385 words of extra memory for PDF output out of 106986 (max. 10000000)
|
||||
121 PDF objects out of 1000 (max. 8388607)
|
||||
95 compressed objects within 1 object stream
|
||||
17 named destinations out of 1000 (max. 500000)
|
||||
98393 words of extra memory for PDF output out of 106986 (max. 10000000)
|
||||
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
\BOOKMARK [2][-]{subsection.2.1}{\376\377\000D\000a\000t\000a\000s\000e\000t}{section.2}% 3
|
||||
\BOOKMARK [2][-]{subsection.2.2}{\376\377\000D\000a\000t\000a\000\040\000c\000l\000e\000a\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000f\000e\000a\000t\000u\000r\000e\000\040\000e\000n\000g\000i\000n\000e\000e\000r\000i\000n\000g}{section.2}% 4
|
||||
\BOOKMARK [2][-]{subsection.2.3}{\376\377\000H\000a\000n\000d\000l\000i\000n\000g\000\040\000m\000i\000s\000s\000i\000n\000g\000\040\000v\000a\000l\000u\000e\000s}{section.2}% 5
|
||||
\BOOKMARK [1][-]{section.3}{\376\377\000M\000o\000d\000e\000l\000\040\000s\000e\000l\000e\000c\000t\000i\000o\000n}{}% 6
|
||||
\BOOKMARK [1][-]{section.4}{\376\377\000M\000o\000d\000e\000l\000\040\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000H\000y\000p\000e\000r\000p\000a\000r\000a\000m\000e\000t\000e\000r\000\040\000T\000u\000n\000i\000n\000g}{}% 7
|
||||
\BOOKMARK [1][-]{section.5}{\376\377\000M\000o\000d\000e\000l\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n\000s}{}% 8
|
||||
\BOOKMARK [1][-]{section.6}{}{}% 9
|
||||
\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000f\000e\000r\000e\000n\000c\000e\000s}{}% 10
|
||||
\BOOKMARK [2][-]{subsection.2.4}{\376\377\000T\000r\000a\000i\000n\000i\000n\000g\000,\000\040\000v\000a\000l\000i\000d\000a\000t\000i\000o\000n\000\040\000a\000n\000d\000\040\000t\000e\000s\000t\000\040\000s\000e\000t\000s}{section.2}% 6
|
||||
\BOOKMARK [1][-]{section.3}{\376\377\000M\000o\000d\000e\000l\000\040\000s\000e\000l\000e\000c\000t\000i\000o\000n}{}% 7
|
||||
\BOOKMARK [1][-]{section.4}{\376\377\000M\000o\000d\000e\000l\000\040\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000H\000y\000p\000e\000r\000p\000a\000r\000a\000m\000e\000t\000e\000r\000\040\000T\000u\000n\000i\000n\000g}{}% 8
|
||||
\BOOKMARK [1][-]{section.5}{\376\377\000M\000o\000d\000e\000l\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n\000s}{}% 9
|
||||
\BOOKMARK [1][-]{section.6}{}{}% 10
|
||||
\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000f\000e\000r\000e\000n\000c\000e\000s}{}% 11
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -77,12 +77,15 @@ The dataset we decided to study is a labeled income prediction dataset. This dat
|
||||
\subsection{Data cleaning and feature engineering}
|
||||
There were a couple of things in our dataset that had to be modified in order for it to be usable in our ML application. We find that some of the features are redundant or not interesting in our project. We remove the redundant feature education since there is another, already numerically encoded feature containing the same data. We also chose to remove the feature 'fnlwgt' since it is an already calculated number that is used by the Census Bureau to estimate population statistics. Since we want to estimate the population statistics based on the other features and not the already calculated weight, we remove this feature. We have a mix of numerical and non-numerical features in our dataset. Since the machine learning models cannot use non-numerical data, we have to encode the non-numerical data into corresponding numbers. This is done with the label encoder built into scikit-learn and applied to all non-numerical data.
|
||||
\subsection{Handling missing values}
|
||||
With our numerical version of the dataset we found with the info function in pandas that around 2500 values were NaN values. We reasoned that filling these values with something as the mean of the category does not make very much sense for our application. Since there are many discrete categories a mean value means nothing. Especially since we gave many categories arbitrary numbers
|
||||
|
||||
With our numerical version of the dataset we found, using the info function in pandas, that around 2500 values were NaN values. We reasoned that filling these values with something such as the mean of the category does not make very much sense for our application. Since there are many discrete categories, a mean value is not meaningful---especially since we gave many categories arbitrary numbers, the mean carries no information. We therefore decided to only use complete data points. This resulted in removing about 6\% of the total amount of data points, or about 2500 data points.
|
||||
\subsection{Training, validation and test sets}
|
||||
Before doing any sort of training or analysis on the data, we split it into training, test and validation data. We did this by first splitting off a random 20\% of the data as test data. This data is reserved for the final testing of the model and will not be touched until the model is finished. Then we did a further split of the rest of the data, where 25\% was designated as validation data. This data will be used for calibration of the model and hyperparameter tuning. The rest of the data, which is 60\% of the total data or around 18000 data points, will be used to train the model.
|
||||
\section{Model selection}
|
||||
When selecting the model to use for this project we have to limit ourselves to models that are appropriate to the type of problem that we are trying to solve. The problem is a classification task, so all models that are used for regression are immediately invalid. There are plenty of different types of classification models left to choose from. Many of them, however, are good for data that has non-discrete features. This includes models such as logistic regression, KNN and other similar types of classification models. Also, since we have so many features that are non-numerical and converted into arbitrary numbers, these types of models would not be optimal. What is left is the Gaussian Naïve Bayes and the different tree-based models. Since our data again is made up of arbitrary numbers, it is not possible to assume that we have normally distributed data. Therefore we are left with the tree-based models such as the decision tree and random forests. We decided to implement two different types of models. We first do a decision tree and see how good we can get that model to work. We then do a random forest, which may not be the absolute best model, but since it is a continuation of the decision tree it might be interesting to see if it performs better. We then do analysis on both methods and see if these models are good enough and if there is any meaningful difference between the two.
|
||||
|
||||
\section{Model Training and Hyperparameter Tuning}
|
||||
|
||||
During the model training there are some important changes we can make to improve the accuracy of our model. One thing we implement is cross validation. Since there is a great spread in our data we choose to use randomized search. %Add more here and change type of x-val if needed. How many folds?
|
||||
Another very important part of the model training is finding the optimal hyperparameters. This is an important step in minimizing the risk of overfitting. Some important hyperparameters in our decision trees are the maximum depth and the minimum sample split. The maximum depth hyperparameter decides how deep the tree is allowed to go. If a tree is allowed to go very deep, there is a high risk of overfitting. We therefore test multiple different depths and see which values give the best training and validation accuracy. This will ensure that we use the most optimal depth for our tree. The minimum sample split states how many data points there have to be for a new split to be created. This is also a good measure against overfitting, since if it is very low we risk training on the noise of the data instead of the general trend and end up overfitting the data. It is also important that it is not too large, since we then lose information and underfit instead. For the random forest there is also the hyperparameter of how many estimators to use. This decides how many trees to choose from.
|
||||
\section{Model Evaluations}
|
||||
\section{}
|
||||
|
||||
|
||||
@@ -4,9 +4,10 @@
|
||||
\contentsline {subsection}{\numberline {2.1}Dataset}{1}{subsection.2.1}%
|
||||
\contentsline {subsection}{\numberline {2.2}Data cleaning and feature engineering}{1}{subsection.2.2}%
|
||||
\contentsline {subsection}{\numberline {2.3}Handling missing values}{1}{subsection.2.3}%
|
||||
\contentsline {subsection}{\numberline {2.4}Training, validation and test sets}{1}{subsection.2.4}%
|
||||
\contentsline {section}{\numberline {3}Model selection}{1}{section.3}%
|
||||
\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{1}{section.4}%
|
||||
\contentsline {section}{\numberline {5}Model Evaluations}{1}{section.5}%
|
||||
\contentsline {section}{\numberline {6}}{1}{section.6}%
|
||||
\contentsline {section}{References}{1}{section.6}%
|
||||
\contentsline {section}{\numberline {4}Model Training and Hyperparameter Tuning}{2}{section.4}%
|
||||
\contentsline {section}{\numberline {5}Model Evaluations}{2}{section.5}%
|
||||
\contentsline {section}{\numberline {6}}{2}{section.6}%
|
||||
\contentsline {section}{References}{2}{section.6}%
|
||||
\contentsfinish
|
||||
|
||||
Reference in New Issue
Block a user