diff --git a/info.ipynb b/info.ipynb index 9f4fb4e3..833e2bee 100644 --- a/info.ipynb +++ b/info.ipynb @@ -7,480 +7,15 @@ "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ageworkclassfnlwgteducationeducation.nummarital.statusoccupationrelationshipracesexcapital.gaincapital.losshours.per.weeknative.country
3254171?287372Doctorate16Married-civ-spouse?HusbandWhiteMale0010United-States
3254245State-gov252208HS-grad9SeparatedAdm-clericalOwn-childWhiteFemale0040United-States
3254341?202822HS-grad9Separated?Not-in-familyBlackFemale0032United-States
3254472?129912HS-grad9Married-civ-spouse?HusbandWhiteMale0025United-States
3254545Local-gov119199Assoc-acdm12DivorcedProf-specialtyUnmarriedWhiteFemale0048United-States
3254631Private199655Masters14DivorcedOther-serviceNot-in-familyOtherFemale0030United-States
3254739Local-gov111499Assoc-acdm12Married-civ-spouseAdm-clericalWifeWhiteFemale0020United-States
3254837Private198216Assoc-acdm12DivorcedTech-supportNot-in-familyWhiteFemale0040United-States
3254943Private260761HS-grad9Married-civ-spouseMachine-op-inspctHusbandWhiteMale0040Mexico
3255043State-gov255835Some-college10DivorcedAdm-clericalOther-relativeWhiteFemale0040United-States
3255143Self-emp-not-inc27242Some-college10Married-civ-spouseCraft-repairHusbandWhiteMale0050United-States
3255232Private3406610th6Married-civ-spouseHandlers-cleanersHusbandAmer-Indian-EskimoMale0040United-States
3255343Private84661Assoc-voc11Married-civ-spouseSalesHusbandWhiteMale0045United-States
3255432Private116138Masters14Never-marriedTech-supportNot-in-familyAsian-Pac-IslanderMale0011Taiwan
3255553Private321865Masters14Married-civ-spouseExec-managerialHusbandWhiteMale0040United-States
3255622Private310152Some-college10Never-marriedProtective-servNot-in-familyWhiteMale0040United-States
3255727Private257302Assoc-acdm12Married-civ-spouseTech-supportWifeWhiteFemale0038United-States
3255840Private154374HS-grad9Married-civ-spouseMachine-op-inspctHusbandWhiteMale0040United-States
3255958Private151910HS-grad9WidowedAdm-clericalUnmarriedWhiteFemale0040United-States
3256022Private201490HS-grad9Never-marriedAdm-clericalOwn-childWhiteMale0020United-States
\n", - "
" - ], - "text/plain": [ - " age workclass fnlwgt education education.num \\\n", - "32541 71 ? 287372 Doctorate 16 \n", - "32542 45 State-gov 252208 HS-grad 9 \n", - "32543 41 ? 202822 HS-grad 9 \n", - "32544 72 ? 129912 HS-grad 9 \n", - "32545 45 Local-gov 119199 Assoc-acdm 12 \n", - "32546 31 Private 199655 Masters 14 \n", - "32547 39 Local-gov 111499 Assoc-acdm 12 \n", - "32548 37 Private 198216 Assoc-acdm 12 \n", - "32549 43 Private 260761 HS-grad 9 \n", - "32550 43 State-gov 255835 Some-college 10 \n", - "32551 43 Self-emp-not-inc 27242 Some-college 10 \n", - "32552 32 Private 34066 10th 6 \n", - "32553 43 Private 84661 Assoc-voc 11 \n", - "32554 32 Private 116138 Masters 14 \n", - "32555 53 Private 321865 Masters 14 \n", - "32556 22 Private 310152 Some-college 10 \n", - "32557 27 Private 257302 Assoc-acdm 12 \n", - "32558 40 Private 154374 HS-grad 9 \n", - "32559 58 Private 151910 HS-grad 9 \n", - "32560 22 Private 201490 HS-grad 9 \n", - "\n", - " marital.status occupation relationship \\\n", - "32541 Married-civ-spouse ? Husband \n", - "32542 Separated Adm-clerical Own-child \n", - "32543 Separated ? Not-in-family \n", - "32544 Married-civ-spouse ? 
Husband \n", - "32545 Divorced Prof-specialty Unmarried \n", - "32546 Divorced Other-service Not-in-family \n", - "32547 Married-civ-spouse Adm-clerical Wife \n", - "32548 Divorced Tech-support Not-in-family \n", - "32549 Married-civ-spouse Machine-op-inspct Husband \n", - "32550 Divorced Adm-clerical Other-relative \n", - "32551 Married-civ-spouse Craft-repair Husband \n", - "32552 Married-civ-spouse Handlers-cleaners Husband \n", - "32553 Married-civ-spouse Sales Husband \n", - "32554 Never-married Tech-support Not-in-family \n", - "32555 Married-civ-spouse Exec-managerial Husband \n", - "32556 Never-married Protective-serv Not-in-family \n", - "32557 Married-civ-spouse Tech-support Wife \n", - "32558 Married-civ-spouse Machine-op-inspct Husband \n", - "32559 Widowed Adm-clerical Unmarried \n", - "32560 Never-married Adm-clerical Own-child \n", - "\n", - " race sex capital.gain capital.loss hours.per.week \\\n", - "32541 White Male 0 0 10 \n", - "32542 White Female 0 0 40 \n", - "32543 Black Female 0 0 32 \n", - "32544 White Male 0 0 25 \n", - "32545 White Female 0 0 48 \n", - "32546 Other Female 0 0 30 \n", - "32547 White Female 0 0 20 \n", - "32548 White Female 0 0 40 \n", - "32549 White Male 0 0 40 \n", - "32550 White Female 0 0 40 \n", - "32551 White Male 0 0 50 \n", - "32552 Amer-Indian-Eskimo Male 0 0 40 \n", - "32553 White Male 0 0 45 \n", - "32554 Asian-Pac-Islander Male 0 0 11 \n", - "32555 White Male 0 0 40 \n", - "32556 White Male 0 0 40 \n", - "32557 White Female 0 0 38 \n", - "32558 White Male 0 0 40 \n", - "32559 White Female 0 0 40 \n", - "32560 White Male 0 0 20 \n", - "\n", - " native.country \n", - "32541 United-States \n", - "32542 United-States \n", - "32543 United-States \n", - "32544 United-States \n", - "32545 United-States \n", - "32546 United-States \n", - "32547 United-States \n", - "32548 United-States \n", - "32549 Mexico \n", - "32550 United-States \n", - "32551 United-States \n", - "32552 United-States \n", - "32553 United-States 
# Notebook cell: load the Adult census data, label-encode the categorical
# columns, make train/validation/test splits, and draw a pairwise scatter
# matrix of the first few training features coloured by the income class.
#
# NOTE(review): `pd` and `np` are used below but their imports are not visible
# in this chunk -- presumably they are imported at the top of the cell; confirm.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('./Datasets/adult.csv', comment='#')

# Work on a copy so the raw frame stays available for inspection.
df_encoded = df.copy()

# Columns holding string categories that must become integer codes.
categorical_columns = ['workclass', 'marital.status', 'occupation',
                       'relationship', 'race', 'sex', 'income']

# One encoder per column: refitting a single shared encoder would discard the
# code -> label mapping of every column except the last one fitted.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df_encoded[column] = encoder.fit_transform(df_encoded[column])
    label_encoders[column] = encoder

# Backward compatibility: the original cell left `label_encoder` fitted on the
# last column of the list ('income').
label_encoder = label_encoders['income']

# Features / target. 'education' and 'native.country' are dropped, not encoded.
X = df_encoded.drop(columns=['income', 'native.country', 'education'])
y = df_encoded['income']

# 80/20 train/test split, then 80/20 train/validation split of the train part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Positional slicing such as X_train[:, i] is not valid on a DataFrame, and
# `.values` is a property (calling it raised the TypeError recorded in this
# cell's old output). Take plain NumPy views for plotting and leave
# X_train / y_train themselves untouched for later cells.
X_train_values = X_train.to_numpy()
y_train_codes = y_train.to_numpy()

# One colour per encoded income class. The original referenced a `colors`
# name that is not defined in this cell; a local palette keeps the cell
# runnable on a fresh kernel.
class_colors = np.array(['tab:blue', 'tab:orange'])
assert y_train_codes.max() < len(class_colors), "more classes than colours"
point_colors = class_colors[y_train_codes]

# Never ask for more panels than there are feature columns.
n_features = min(10, X_train_values.shape[1])

fig = plt.figure(figsize=(15, 15))
plt_num = 1
for i in range(n_features):
    for j in range(n_features):
        ax = fig.add_subplot(n_features, n_features, plt_num)
        if i == j:
            # Diagonal: distribution of a single feature.
            ax.hist(X_train_values[:, i], bins=25, color='gray')
        else:
            # Off-diagonal: feature j (x-axis) against feature i (y-axis).
            ax.scatter(X_train_values[:, j], X_train_values[:, i],
                       c=point_colors, s=30, alpha=0.3)

        # Label only the outer edge of the grid to keep it readable.
        if i == n_features - 1:
            ax.set_xlabel(f'$x_{{{j}}}$', fontsize=22)

        if j == 0:
            ax.set_ylabel(f'$x_{{{i}}}$', fontsize=22)

        ax.grid(True)
        plt_num += 1

fig.tight_layout()
plt.show()