diff --git a/Decision_tree.ipynb b/Decision_tree.ipynb index d61600ef..811e6f4a 100644 --- a/Decision_tree.ipynb +++ b/Decision_tree.ipynb @@ -151,7 +151,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.12" + "version": "3.13.7" } }, "nbformat": 4, diff --git a/Report/MLPproject.fdb_latexmk b/Report/MLPproject.fdb_latexmk index df53450d..8d5be1e0 100644 --- a/Report/MLPproject.fdb_latexmk +++ b/Report/MLPproject.fdb_latexmk @@ -1,6 +1,6 @@ # Fdb version 4 -["pdflatex"] 1761125791.81013 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761125793.25759 0 - "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761125791.50152 4318 6372d460a7a87caa250e0c6d0d25be18 "" +["pdflatex"] 1761129092.48673 "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" "MLPproject.pdf" "MLPproject" 1761129093.81415 0 + "/home/jaknyst/Documents/MLPproject/Report/MLPproject.tex" 1761129092.12093 5920 9a4d74b92fae45bd664c915d5afebba0 "" "/usr/share/texlive/texmf-dist/fonts/enc/dvips/base/8r.enc" 1721433600 4850 80dc9bab7f31fb78a000ccfed0e27cab "" "/usr/share/texlive/texmf-dist/fonts/map/fontname/texfonts.map" 1577235249 3524 cb3e574dea2d1052e39280babc910dc8 "" "/usr/share/texlive/texmf-dist/fonts/tfm/adobe/helvetic/phvb7t.tfm" 1136768653 2240 eb56c13537f4d8a0bd3fafc25572b1bd "" @@ -132,10 +132,10 @@ "/usr/share/texlive/texmf-dist/web2c/texmf.cnf" 1721433600 40900 887e0dc8cac988a9e9c574af364cf837 "" "/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map" 1760290233.68077 4602002 62dba5fc29055c16380d7393a2adb07a "" "/var/lib/texmf/web2c/pdftex/pdflatex.fmt" 1760289849 7753794 892d611f76aecccd13eb485815d0543e "" - "MLPproject.aux" 1761125793.07252 2066 17a2ed4a62d84d4f27ea168ff2ae527f "pdflatex" - "MLPproject.out" 1761125793.07452 1374 b4a2caeadb43696bbe5350199c3331b3 "pdflatex" - "MLPproject.tex" 1761125791.50152 4318 6372d460a7a87caa250e0c6d0d25be18 "" - "MLPproject.toc" 1761125793.07552 796 
63fff7a313a297867c97aa2dfbafdb7f "pdflatex" + "MLPproject.aux" 1761129093.65693 2066 17a2ed4a62d84d4f27ea168ff2ae527f "pdflatex" + "MLPproject.out" 1761129093.65793 1374 b4a2caeadb43696bbe5350199c3331b3 "pdflatex" + "MLPproject.tex" 1761129092.12093 5920 9a4d74b92fae45bd664c915d5afebba0 "" + "MLPproject.toc" 1761129093.65793 796 63fff7a313a297867c97aa2dfbafdb7f "pdflatex" "SelfArx.cls" 1761123180.54708 7316 506603b27aab6da8087bc0f1ee693041 "" (generated) "MLPproject.aux" diff --git a/Report/MLPproject.log b/Report/MLPproject.log index fbf73a4b..67579ded 100644 --- a/Report/MLPproject.log +++ b/Report/MLPproject.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 22 OCT 2025 11:36 +This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2025.10.12) 22 OCT 2025 12:31 entering extended mode restricted \write18 enabled. file:line:error style messages enabled. @@ -651,31 +651,31 @@ LaTeX Font Info: Font shape `OT1/phv/m/it' in size <8> not available \tf@toc=\write4 \openout4 = `MLPproject.toc'. -LaTeX Font Info: Trying to load font information for TS1+ptm on input line 77. +LaTeX Font Info: Trying to load font information for TS1+ptm on input line 75. (/usr/share/texlive/texmf-dist/tex/latex/psnfss/ts1ptm.fd File: ts1ptm.fd 2001/06/04 font definitions for TS1/ptm. ) -Underfull \hbox (badness 1448) in paragraph at lines 114--118 +Underfull \hbox (badness 1448) in paragraph at lines 116--120 []\OT1/ptm/m/n/10 (+20) UC Davis ChemWiki, Prop-a-ga-tion of Er-ror, Avail- [] -Underfull \hbox (badness 7649) in paragraph at lines 114--118 +Underfull \hbox (badness 7649) in paragraph at lines 116--120 \OT1/ptm/m/n/10 (+20) able at: [][]$https : / / chem . libretexts . 
org / Textbook[]Maps / [] -Underfull \hbox (badness 10000) in paragraph at lines 114--118 +Underfull \hbox (badness 10000) in paragraph at lines 116--120 \OT1/ptm/m/n/10 (+20) Analytical[]Chemistry / Supplemental[]Modules[] [] -Underfull \hbox (badness 10000) in paragraph at lines 114--118 +Underfull \hbox (badness 10000) in paragraph at lines 116--120 \OT1/ptm/m/n/10 (+20) (Analytical[]Chemistry ) /Quantifying[]Nature / [] -Underfull \hbox (badness 10000) in paragraph at lines 114--118 +Underfull \hbox (badness 10000) in paragraph at lines 116--120 \OT1/ptm/m/n/10 (+20) Signi^^Lcant[]Digits / Propagation[]of[]Error$[][], (Ac-cessed: [] @@ -693,15 +693,15 @@ Package rerunfilecheck Info: File `MLPproject.out' has not changed. (rerunfilecheck) Checksum: B4A2CAEADB43696BBE5350199C3331B3;1374. ) Here is how much of TeX's memory you used: - 18880 strings out of 476041 - 320048 string characters out of 5793173 - 1873388 words of memory out of 6000000 - 38855 multiletter control sequences out of 15000+600000 - 566907 words of font info for 222 fonts, out of 8000000 for 9000 + 18916 strings out of 476041 + 320351 string characters out of 5793173 + 1878388 words of memory out of 6000000 + 38854 multiletter control sequences out of 15000+600000 + 568376 words of font info for 247 fonts, out of 8000000 for 9000 1137 hyphenation exceptions out of 8191 75i,9n,77p,1049b,470s stack positions out of 10000i,1000n,20000p,200000b,200000s -Output written on MLPproject.pdf (1 page, 56703 bytes). +Output written on MLPproject.pdf (1 page, 58446 bytes). PDF statistics: 110 PDF objects out of 1000 (max. 
8388607) 86 compressed objects within 1 object stream diff --git a/Report/MLPproject.pdf b/Report/MLPproject.pdf index bb1dbffe..35b45398 100644 Binary files a/Report/MLPproject.pdf and b/Report/MLPproject.pdf differ diff --git a/Report/MLPproject.synctex.gz b/Report/MLPproject.synctex.gz index 85866067..1c58c4da 100644 Binary files a/Report/MLPproject.synctex.gz and b/Report/MLPproject.synctex.gz differ diff --git a/Report/MLPproject.tex b/Report/MLPproject.tex index 5d55a95e..9bcb7795 100644 --- a/Report/MLPproject.tex +++ b/Report/MLPproject.tex @@ -72,10 +72,12 @@ \subsection{Dataset} %https://www.kaggle.com/datasets/mosapabdelghany/adult-income-prediction-dataset -The dataset we decided to study is a labeled income prediction dataset. This dataset includes 14 features with +The dataset we decided to study is a labeled income prediction dataset. This dataset includes 14 features with information about the people in the study and a label with the income as either more than 50 000\$ per year or less than or equal to 50 000\$ per year. This means that we are looking at a binary classification problem. A lot of the features are discrete, with only a fixed number of options available. This includes features such as marital status, education and working class. The dataset features around 32500 data points. + \subsection{Data cleaning and feature engineering} -§ +There were a couple of things with our dataset that had to be modified in order for it to be usable in our ML application. We find that some of the features are redundant or not interesting in our project. We remove the redundant feature education since there is another already numerically encoded feature containing the same data. We also chose to remove the feature 'fnlwgt' since it is an already calculated number that is used by the Census Bureau to estimate population statistics. 
Since we want to estimate the population statistics based on the other features and not the already calculated weight, we remove this feature. We have a mix of numerical and non-numerical features in our dataset. Since the machine learning models cannot use non-numerical data, we have to encode the non-numerical data into corresponding numbers. This is done with the label encoder built into scikit-learn, which we apply to all non-numerical features. \subsection{Handling missing values} +With our numerical version of the dataset, we found with the info function in pandas that around 2500 values were NaN. We reasoned that filling these values with something such as the mean of the category does not make much sense for our application. Since many of the features are discrete categories, a mean value is meaningless, especially since we assigned arbitrary numbers to many of the categories. \section{Model selection}