97 lines
3.9 KiB
Plaintext
97 lines
3.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"id": "b6ea6c3b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "TypeError",
|
|
"evalue": "'numpy.ndarray' object is not callable",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[25]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m 18\u001b[39m X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=\u001b[32m0.2\u001b[39m, random_state=\u001b[32m42\u001b[39m)\n\u001b[32m 19\u001b[39m X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=\u001b[32m0.2\u001b[39m, random_state=\u001b[32m42\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m X_train = \u001b[43mX_train\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 23\u001b[39m n_features = \u001b[32m10\u001b[39m\n\u001b[32m 24\u001b[39m fig=plt.figure( figsize=(\u001b[32m15\u001b[39m, \u001b[32m15\u001b[39m) )\n",
|
|
"\u001b[31mTypeError\u001b[39m: 'numpy.ndarray' object is not callable"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
|
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|
"\n",
|
|
"# Load the Adult census CSV; '#' is treated as the comment character when parsing.\n",
"df = pd.read_csv('./Datasets/adult.csv', comment='#')\n",
"\n",
"# Work on a copy so the raw frame stays untouched and the cell can be re-run safely.\n",
"df_encoded = df.copy()\n",
"\n",
"# Integer-encode the categorical columns (this includes the 'income' target).\n",
"# One encoder is kept per column so that every mapping can be inverted later with\n",
"# label_encoders[column].inverse_transform(...). A single shared encoder only\n",
"# remembers the classes of the last column it was fitted on.\n",
"categorical_columns = ['workclass', 'marital.status', 'occupation',\n",
"                       'relationship', 'race', 'sex', 'income']\n",
"label_encoders = {}\n",
"for column in categorical_columns:\n",
"    label_encoders[column] = LabelEncoder()\n",
"    df_encoded[column] = label_encoders[column].fit_transform(df_encoded[column])\n",
"\n",
"# Backward-compatible alias: `label_encoder` ends up fitted on the last encoded\n",
"# column ('income'), exactly as it did when one encoder was reused in the loop.\n",
"label_encoder = label_encoders[categorical_columns[-1]]\n",
"\n",
"# Separate features and target. 'native.country' and 'education' are dropped from\n",
"# the feature matrix without being encoded.\n",
"X = df_encoded.drop(columns=['income', 'native.country', 'education'])\n",
"y = df_encoded['income']\n",
|
|
"\n",
|
|
"# Two-stage split: hold out 20% of the rows for testing, then carve 20% of the\n",
"# remaining rows off as a validation set. Both stages use the same fixed seed.\n",
"X_trainval, X_test, y_trainval, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42\n",
")\n",
"X_train, X_val, y_train, y_val = train_test_split(\n",
"    X_trainval, y_trainval, test_size=0.2, random_state=42\n",
")\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"# Pair-plot of the leading training features: a histogram of each feature on the\n",
"# diagonal and class-coloured scatter plots of every feature pair elsewhere.\n",
"\n",
"# Positional slicing such as data[:, i] needs NumPy arrays, not a DataFrame/Series.\n",
"# `.values` is a property, so calling `X_train.values()` raises TypeError.\n",
"# np.asarray accepts both pandas objects and arrays and leaves X_train / y_train\n",
"# themselves unchanged, so this cell can be re-run.\n",
"X_train_arr = np.asarray(X_train)\n",
"y_train_arr = np.asarray(y_train)\n",
"\n",
"# One colour per encoded target class.\n",
"# NOTE(review): assumes the encoded target takes only the values 0 and 1 - confirm.\n",
"colors = ['tab:blue', 'tab:orange']\n",
"point_colors = np.array(colors)[y_train_arr]  # loop-invariant, computed once\n",
"\n",
"# Plot at most 10 features; fewer if the feature matrix is narrower than that.\n",
"n_features = min(10, X_train_arr.shape[1])\n",
"fig = plt.figure(figsize=(15, 15))\n",
"plt_num = 1\n",
"for i in range(n_features):\n",
"    for j in range(n_features):\n",
"        ax = fig.add_subplot(n_features, n_features, plt_num)\n",
"        if i == j:\n",
"            ax.hist(X_train_arr[:, i], bins=25, color='gray')\n",
"        else:\n",
"            ax.scatter(X_train_arr[:, j], X_train_arr[:, i], c=point_colors, s=30, alpha=0.3)\n",
"\n",
"        # Label only the bottom row and the left-most column to limit clutter.\n",
"        if i == n_features - 1:\n",
"            ax.set_xlabel(f'$x_{{{j}}}$', fontsize=22)\n",
"        if j == 0:\n",
"            ax.set_ylabel(f'$x_{{{i}}}$', fontsize=22)\n",
"\n",
"        ax.grid(True)\n",
"        plt_num += 1\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|