diff --git a/info.ipynb b/info.ipynb
index 9f4fb4e3..833e2bee 100644
--- a/info.ipynb
+++ b/info.ipynb
@@ -7,480 +7,15 @@
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " age | \n",
- " workclass | \n",
- " fnlwgt | \n",
- " education | \n",
- " education.num | \n",
- " marital.status | \n",
- " occupation | \n",
- " relationship | \n",
- " race | \n",
- " sex | \n",
- " capital.gain | \n",
- " capital.loss | \n",
- " hours.per.week | \n",
- " native.country | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 32541 | \n",
- " 71 | \n",
- " ? | \n",
- " 287372 | \n",
- " Doctorate | \n",
- " 16 | \n",
- " Married-civ-spouse | \n",
- " ? | \n",
- " Husband | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 10 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32542 | \n",
- " 45 | \n",
- " State-gov | \n",
- " 252208 | \n",
- " HS-grad | \n",
- " 9 | \n",
- " Separated | \n",
- " Adm-clerical | \n",
- " Own-child | \n",
- " White | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32543 | \n",
- " 41 | \n",
- " ? | \n",
- " 202822 | \n",
- " HS-grad | \n",
- " 9 | \n",
- " Separated | \n",
- " ? | \n",
- " Not-in-family | \n",
- " Black | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 32 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32544 | \n",
- " 72 | \n",
- " ? | \n",
- " 129912 | \n",
- " HS-grad | \n",
- " 9 | \n",
- " Married-civ-spouse | \n",
- " ? | \n",
- " Husband | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 25 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32545 | \n",
- " 45 | \n",
- " Local-gov | \n",
- " 119199 | \n",
- " Assoc-acdm | \n",
- " 12 | \n",
- " Divorced | \n",
- " Prof-specialty | \n",
- " Unmarried | \n",
- " White | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 48 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32546 | \n",
- " 31 | \n",
- " Private | \n",
- " 199655 | \n",
- " Masters | \n",
- " 14 | \n",
- " Divorced | \n",
- " Other-service | \n",
- " Not-in-family | \n",
- " Other | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 30 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32547 | \n",
- " 39 | \n",
- " Local-gov | \n",
- " 111499 | \n",
- " Assoc-acdm | \n",
- " 12 | \n",
- " Married-civ-spouse | \n",
- " Adm-clerical | \n",
- " Wife | \n",
- " White | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 20 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32548 | \n",
- " 37 | \n",
- " Private | \n",
- " 198216 | \n",
- " Assoc-acdm | \n",
- " 12 | \n",
- " Divorced | \n",
- " Tech-support | \n",
- " Not-in-family | \n",
- " White | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32549 | \n",
- " 43 | \n",
- " Private | \n",
- " 260761 | \n",
- " HS-grad | \n",
- " 9 | \n",
- " Married-civ-spouse | \n",
- " Machine-op-inspct | \n",
- " Husband | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " Mexico | \n",
- "
\n",
- " \n",
- " | 32550 | \n",
- " 43 | \n",
- " State-gov | \n",
- " 255835 | \n",
- " Some-college | \n",
- " 10 | \n",
- " Divorced | \n",
- " Adm-clerical | \n",
- " Other-relative | \n",
- " White | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32551 | \n",
- " 43 | \n",
- " Self-emp-not-inc | \n",
- " 27242 | \n",
- " Some-college | \n",
- " 10 | \n",
- " Married-civ-spouse | \n",
- " Craft-repair | \n",
- " Husband | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 50 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32552 | \n",
- " 32 | \n",
- " Private | \n",
- " 34066 | \n",
- " 10th | \n",
- " 6 | \n",
- " Married-civ-spouse | \n",
- " Handlers-cleaners | \n",
- " Husband | \n",
- " Amer-Indian-Eskimo | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32553 | \n",
- " 43 | \n",
- " Private | \n",
- " 84661 | \n",
- " Assoc-voc | \n",
- " 11 | \n",
- " Married-civ-spouse | \n",
- " Sales | \n",
- " Husband | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 45 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32554 | \n",
- " 32 | \n",
- " Private | \n",
- " 116138 | \n",
- " Masters | \n",
- " 14 | \n",
- " Never-married | \n",
- " Tech-support | \n",
- " Not-in-family | \n",
- " Asian-Pac-Islander | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 11 | \n",
- " Taiwan | \n",
- "
\n",
- " \n",
- " | 32555 | \n",
- " 53 | \n",
- " Private | \n",
- " 321865 | \n",
- " Masters | \n",
- " 14 | \n",
- " Married-civ-spouse | \n",
- " Exec-managerial | \n",
- " Husband | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32556 | \n",
- " 22 | \n",
- " Private | \n",
- " 310152 | \n",
- " Some-college | \n",
- " 10 | \n",
- " Never-married | \n",
- " Protective-serv | \n",
- " Not-in-family | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32557 | \n",
- " 27 | \n",
- " Private | \n",
- " 257302 | \n",
- " Assoc-acdm | \n",
- " 12 | \n",
- " Married-civ-spouse | \n",
- " Tech-support | \n",
- " Wife | \n",
- " White | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 38 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32558 | \n",
- " 40 | \n",
- " Private | \n",
- " 154374 | \n",
- " HS-grad | \n",
- " 9 | \n",
- " Married-civ-spouse | \n",
- " Machine-op-inspct | \n",
- " Husband | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32559 | \n",
- " 58 | \n",
- " Private | \n",
- " 151910 | \n",
- " HS-grad | \n",
- " 9 | \n",
- " Widowed | \n",
- " Adm-clerical | \n",
- " Unmarried | \n",
- " White | \n",
- " Female | \n",
- " 0 | \n",
- " 0 | \n",
- " 40 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- " | 32560 | \n",
- " 22 | \n",
- " Private | \n",
- " 201490 | \n",
- " HS-grad | \n",
- " 9 | \n",
- " Never-married | \n",
- " Adm-clerical | \n",
- " Own-child | \n",
- " White | \n",
- " Male | \n",
- " 0 | \n",
- " 0 | \n",
- " 20 | \n",
- " United-States | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " age workclass fnlwgt education education.num \\\n",
- "32541 71 ? 287372 Doctorate 16 \n",
- "32542 45 State-gov 252208 HS-grad 9 \n",
- "32543 41 ? 202822 HS-grad 9 \n",
- "32544 72 ? 129912 HS-grad 9 \n",
- "32545 45 Local-gov 119199 Assoc-acdm 12 \n",
- "32546 31 Private 199655 Masters 14 \n",
- "32547 39 Local-gov 111499 Assoc-acdm 12 \n",
- "32548 37 Private 198216 Assoc-acdm 12 \n",
- "32549 43 Private 260761 HS-grad 9 \n",
- "32550 43 State-gov 255835 Some-college 10 \n",
- "32551 43 Self-emp-not-inc 27242 Some-college 10 \n",
- "32552 32 Private 34066 10th 6 \n",
- "32553 43 Private 84661 Assoc-voc 11 \n",
- "32554 32 Private 116138 Masters 14 \n",
- "32555 53 Private 321865 Masters 14 \n",
- "32556 22 Private 310152 Some-college 10 \n",
- "32557 27 Private 257302 Assoc-acdm 12 \n",
- "32558 40 Private 154374 HS-grad 9 \n",
- "32559 58 Private 151910 HS-grad 9 \n",
- "32560 22 Private 201490 HS-grad 9 \n",
- "\n",
- " marital.status occupation relationship \\\n",
- "32541 Married-civ-spouse ? Husband \n",
- "32542 Separated Adm-clerical Own-child \n",
- "32543 Separated ? Not-in-family \n",
- "32544 Married-civ-spouse ? Husband \n",
- "32545 Divorced Prof-specialty Unmarried \n",
- "32546 Divorced Other-service Not-in-family \n",
- "32547 Married-civ-spouse Adm-clerical Wife \n",
- "32548 Divorced Tech-support Not-in-family \n",
- "32549 Married-civ-spouse Machine-op-inspct Husband \n",
- "32550 Divorced Adm-clerical Other-relative \n",
- "32551 Married-civ-spouse Craft-repair Husband \n",
- "32552 Married-civ-spouse Handlers-cleaners Husband \n",
- "32553 Married-civ-spouse Sales Husband \n",
- "32554 Never-married Tech-support Not-in-family \n",
- "32555 Married-civ-spouse Exec-managerial Husband \n",
- "32556 Never-married Protective-serv Not-in-family \n",
- "32557 Married-civ-spouse Tech-support Wife \n",
- "32558 Married-civ-spouse Machine-op-inspct Husband \n",
- "32559 Widowed Adm-clerical Unmarried \n",
- "32560 Never-married Adm-clerical Own-child \n",
- "\n",
- " race sex capital.gain capital.loss hours.per.week \\\n",
- "32541 White Male 0 0 10 \n",
- "32542 White Female 0 0 40 \n",
- "32543 Black Female 0 0 32 \n",
- "32544 White Male 0 0 25 \n",
- "32545 White Female 0 0 48 \n",
- "32546 Other Female 0 0 30 \n",
- "32547 White Female 0 0 20 \n",
- "32548 White Female 0 0 40 \n",
- "32549 White Male 0 0 40 \n",
- "32550 White Female 0 0 40 \n",
- "32551 White Male 0 0 50 \n",
- "32552 Amer-Indian-Eskimo Male 0 0 40 \n",
- "32553 White Male 0 0 45 \n",
- "32554 Asian-Pac-Islander Male 0 0 11 \n",
- "32555 White Male 0 0 40 \n",
- "32556 White Male 0 0 40 \n",
- "32557 White Female 0 0 38 \n",
- "32558 White Male 0 0 40 \n",
- "32559 White Female 0 0 40 \n",
- "32560 White Male 0 0 20 \n",
- "\n",
- " native.country \n",
- "32541 United-States \n",
- "32542 United-States \n",
- "32543 United-States \n",
- "32544 United-States \n",
- "32545 United-States \n",
- "32546 United-States \n",
- "32547 United-States \n",
- "32548 United-States \n",
- "32549 Mexico \n",
- "32550 United-States \n",
- "32551 United-States \n",
- "32552 United-States \n",
- "32553 United-States \n",
- "32554 Taiwan \n",
- "32555 United-States \n",
- "32556 United-States \n",
- "32557 United-States \n",
- "32558 United-States \n",
- "32559 United-States \n",
- "32560 United-States "
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
+ "ename": "TypeError",
+ "evalue": "'numpy.ndarray' object is not callable",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+ "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[25]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m 18\u001b[39m X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=\u001b[32m0.2\u001b[39m, random_state=\u001b[32m42\u001b[39m)\n\u001b[32m 19\u001b[39m X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=\u001b[32m0.2\u001b[39m, random_state=\u001b[32m42\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m X_train = \u001b[43mX_train\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 23\u001b[39m n_features = \u001b[32m10\u001b[39m\n\u001b[32m 24\u001b[39m fig=plt.figure( figsize=(\u001b[32m15\u001b[39m, \u001b[32m15\u001b[39m) )\n",
+ "\u001b[31mTypeError\u001b[39m: 'numpy.ndarray' object is not callable"
+ ]
}
],
"source": [
@@ -489,23 +24,51 @@
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
+ "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
"\n",
+ "df = pd.read_csv('./Datasets/adult.csv', comment = '#')\n",
"\n",
+ "# Encode on a copy so the raw frame `df` stays untouched when this cell is re-run\n",
+ "df_encoded = df.copy()\n",
"\n",
- "data = pd.read_csv('./Datasets/adult.csv', comment = '#')\n",
+ "label_encoders = {}  # column name -> fitted LabelEncoder, kept so integer codes can be decoded later\n",
+ "categorical_columns = ['workclass', 'marital.status', 'occupation',\n",
+ "                       'relationship', 'race', 'sex', 'income']\n",
"\n",
- "# Features\n",
- "X = data.drop(columns=['income'])\n",
- "\n",
- "# Labels\n",
- "y = data['income']\n",
+ "for column in categorical_columns:\n",
+ "    label_encoder = LabelEncoder()  # fresh encoder per column; after the loop this name holds the income encoder\n",
+ "    df_encoded[column] = label_encoder.fit_transform(df_encoded[column])\n",
+ "    label_encoders[column] = label_encoder\n",
"\n",
+ "# Features: drop the target and the two string columns that were not label-encoded above\n",
+ "X = df_encoded.drop(columns=['income', 'native.country', 'education'])\n",
+ "y = df_encoded['income']\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)\n",
"\n",
- "X.tail(20)\n",
- "\n"
+ "# Pair plot of the first n_features columns of X_train, one colour per encoded income label\n",
+ "X_plot = X_train.to_numpy()  # X_train is a DataFrame, which rejects positional X_train[:, i] slicing\n",
+ "class_colors = ['tab:blue', 'tab:orange']  # indexed by label; assumes income has two classes - TODO confirm\n",
+ "point_colors = [class_colors[label] for label in y_train]\n",
+ "n_features = min(10, X_plot.shape[1])  # never ask for more columns than X has\n",
+ "\n",
+ "fig = plt.figure(figsize=(15, 15))\n",
+ "plt_num = 1  # add_subplot positions are 1-based and advance row by row\n",
+ "for i in range(n_features):\n",
+ "    for j in range(n_features):\n",
+ "        ax = fig.add_subplot(n_features, n_features, plt_num)\n",
+ "        if i == j:\n",
+ "            ax.hist(X_plot[:, i], bins=25, color='gray')  # diagonal: distribution of feature i\n",
+ "        else:\n",
+ "            ax.scatter(X_plot[:, j], X_plot[:, i], c=point_colors, s=30, alpha=0.3)\n",
+ "        if i == n_features - 1:  # label the x axis on the bottom row only\n",
+ "            ax.set_xlabel(f'$x_{{{j}}}$', fontsize=22)\n",
+ "        if j == 0:  # label the y axis on the left column only\n",
+ "            ax.set_ylabel(f'$x_{{{i}}}$', fontsize=22)\n",
+ "        ax.grid(True)\n",
+ "        plt_num += 1\n"
]
}
],