{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": { "id": "mQSys62efhoG" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sb\n", "\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n", "from sklearn.feature_selection import SelectKBest, chi2\n", "from tqdm.notebook import tqdm\n", "from sklearn import metrics\n", "from sklearn.svm import SVC\n", "from xgboost import XGBClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "source": [ "# Location of the Parkinson's disease dataset CSV (Colab upload directory).\n", "DATA_PATH = '/content/parkinson_disease.csv'\n", "df = pd.read_csv(DATA_PATH)" ], "metadata": { "id": "U4xaCAKSgxsP" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "# Render at most 10 columns so previews of the wide frame stay readable.\n", "pd.set_option('display.max_columns', 10)\n", "# head() returns the first five rows by default.\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 236 }, "id": "hBeAwdnzg3io", "outputId": "b88ac480-3e2c-40c6-de3e-5b62d276e766" }, "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id gender PPE DFA RPDE ... tqwt_kurtosisValue_dec_33 \\\n", "0 0 1 0.85247 0.71826 0.57227 ... 4.4625 \n", "1 0 1 0.76686 0.69481 0.53966 ... 9.5082 \n", "2 0 1 0.85083 0.67604 0.58982 ... 4.8066 \n", "3 1 0 0.41121 0.79672 0.59257 ... 4.6857 \n", "4 1 0 0.32790 0.79782 0.53028 ... 
11.6891 \n", "\n", " tqwt_kurtosisValue_dec_34 tqwt_kurtosisValue_dec_35 \\\n", "0 2.6202 3.0004 \n", "1 6.5245 6.3431 \n", "2 2.9199 3.1495 \n", "3 4.8460 6.2650 \n", "4 8.2103 5.0559 \n", "\n", " tqwt_kurtosisValue_dec_36 class \n", "0 18.9405 1 \n", "1 45.1780 1 \n", "2 4.7666 1 \n", "3 4.0603 1 \n", "4 6.1164 1 \n", "\n", "[5 rows x 755 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenderPPEDFARPDE...tqwt_kurtosisValue_dec_33tqwt_kurtosisValue_dec_34tqwt_kurtosisValue_dec_35tqwt_kurtosisValue_dec_36class
0010.852470.718260.57227...4.46252.62023.000418.94051
1010.766860.694810.53966...9.50826.52456.343145.17801
2010.850830.676040.58982...4.80662.91993.14954.76661
3100.411210.796720.59257...4.68574.84606.26504.06031
4100.327900.797820.53028...11.68918.21035.05596.11641
\n", "

5 rows × 755 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df" } }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "source": [ "# (rows, columns) of the loaded table.\n", "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7zlWkzd8g4gY", "outputId": "a25c4022-f3d5-4497-9ef2-5c8e625f7f5f" }, "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(756, 755)" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [ "# Index range, column count, dtypes and memory usage.\n", "df.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Kb1NtiPONQDR", "outputId": "2ad3f8b6-8f9a-49b9-e2e3-f045c685f3c7" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 756 entries, 0 to 755\n", "Columns: 755 entries, id to class\n", "dtypes: float64(749), int64(6)\n", "memory usage: 4.4 MB\n" ] } ] }, { "cell_type": "code", "source": [ "# Per-column summary statistics, transposed so that each column is one row.\n", "df.describe().transpose()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "_cLS3vt6g5Yu", "outputId": "023640cc-3999-49ad-8d85-a306f698f515" }, "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " count mean std min 25% \\\n", "id 756.0 125.500000 72.793721 0.000000 62.750000 \n", "gender 756.0 0.515873 0.500079 0.000000 0.000000 \n", "PPE 756.0 0.746284 0.169294 0.041551 0.762833 \n", "DFA 756.0 0.700414 0.069718 0.543500 0.647053 \n", "RPDE 756.0 0.489058 0.137442 0.154300 0.386537 \n", "... ... ... ... ... ... 
\n", "tqwt_kurtosisValue_dec_33 756.0 12.375335 16.341665 1.628700 3.114375 \n", "tqwt_kurtosisValue_dec_34 756.0 14.799230 15.722502 1.861700 3.665925 \n", "tqwt_kurtosisValue_dec_35 756.0 14.751559 14.432979 1.955900 3.741275 \n", "tqwt_kurtosisValue_dec_36 756.0 31.481110 34.230991 2.364000 3.948750 \n", "class 756.0 0.746032 0.435568 0.000000 0.000000 \n", "\n", " 50% 75% max \n", "id 125.500000 188.250000 251.00000 \n", "gender 1.000000 1.000000 1.00000 \n", "PPE 0.809655 0.834315 0.90766 \n", "DFA 0.700525 0.754985 0.85264 \n", "RPDE 0.484355 0.586515 0.87123 \n", "... ... ... ... \n", "tqwt_kurtosisValue_dec_33 4.741450 12.201325 73.53220 \n", "tqwt_kurtosisValue_dec_34 6.725700 21.922050 62.00730 \n", "tqwt_kurtosisValue_dec_35 7.334250 22.495175 57.54430 \n", "tqwt_kurtosisValue_dec_36 10.637250 61.125325 156.42370 \n", "class 1.000000 1.000000 1.00000 \n", "\n", "[755 rows x 8 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
id756.0125.50000072.7937210.00000062.750000125.500000188.250000251.00000
gender756.00.5158730.5000790.0000000.0000001.0000001.0000001.00000
PPE756.00.7462840.1692940.0415510.7628330.8096550.8343150.90766
DFA756.00.7004140.0697180.5435000.6470530.7005250.7549850.85264
RPDE756.00.4890580.1374420.1543000.3865370.4843550.5865150.87123
...........................
tqwt_kurtosisValue_dec_33756.012.37533516.3416651.6287003.1143754.74145012.20132573.53220
tqwt_kurtosisValue_dec_34756.014.79923015.7225021.8617003.6659256.72570021.92205062.00730
tqwt_kurtosisValue_dec_35756.014.75155914.4329791.9559003.7412757.33425022.49517557.54430
tqwt_kurtosisValue_dec_36756.031.48111034.2309912.3640003.94875010.63725061.125325156.42370
class756.00.7460320.4355680.0000000.0000001.0000001.0000001.00000
\n", "

755 rows × 8 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df\",\n \"rows\": 755,\n \"fields\": [\n {\n \"column\": \"count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 756.0,\n \"max\": 756.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 756.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 374622395.71923405,\n \"min\": -9029632763.215609,\n \"max\": 12099556.09835979,\n \"num_unique_values\": 755,\n \"samples\": [\n -4143374.954075397\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 236810548.13221908,\n \"min\": 5.7338262951403425e-21,\n \"max\": 5703424049.268989,\n \"num_unique_values\": 755,\n \"samples\": [\n 487675.08280652296\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"min\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2578254179.0303726,\n \"min\": -62062235037.0,\n \"max\": 2213588.67,\n \"num_unique_values\": 743,\n \"samples\": [\n -0.3828\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"25%\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 473008476.38622814,\n \"min\": -11400959583.75,\n \"max\": 7183016.246,\n \"num_unique_values\": 753,\n \"samples\": [\n -0.9168375\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"50%\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 325558429.5620674,\n \"min\": -7849142473.5,\n \"max\": 10662958.870000001,\n \"num_unique_values\": 735,\n \"samples\": [\n 0.0016652\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"75%\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 212971693.58083302,\n \"min\": -5136087772.75,\n \"max\": 15134891.325000001,\n 
\"num_unique_values\": 752,\n \"samples\": [\n 0.07430725\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"max\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 58324013.06010493,\n \"min\": -1399384344.0,\n \"max\": 74325483.41,\n \"num_unique_values\": 752,\n \"samples\": [\n 0.50277\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "# Total number of missing values across the whole frame.\n", "df.isna().sum().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kFp2K49WQ8v7", "outputId": "d754220c-dbb2-436f-f09f-a00aa174cfe0" }, "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "np.int64(0)" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "# Collapse all rows that share an id into one row of per-column means.\n", "# The id is only the grouping key, so it is not kept as a column.\n", "df = df.groupby('id').mean().reset_index(drop=True)" ], "metadata": { "id": "9kTA7himg6la" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "source": [ "# Greedy redundancy filter: walk the features in column order and, for every\n", "# feature that is still kept, drop each other feature whose Pearson\n", "# correlation with it is above CORR_THRESHOLD.\n", "CORR_THRESHOLD = 0.7\n", "TARGET = 'class'\n", "\n", "feature_cols = [c for c in df.columns if c != TARGET]\n", "# One correlation matrix up front instead of a Series.corr call per pair.\n", "corr_matrix = df[feature_cols].corr()\n", "\n", "dropped = set()\n", "for col in feature_cols:\n", "    if col in dropped:\n", "        continue\n", "    col_corr = corr_matrix[col]\n", "    redundant = col_corr[col_corr > CORR_THRESHOLD].index\n", "    # A feature's correlation with itself is 1.0, so skip the reference column.\n", "    dropped.update(c for c in redundant if c != col)\n", "\n", "# The target is left out of the comparison, so it can never be filtered away.\n", "df = df[[c for c in df.columns if c not in dropped]]\n", "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vmb-STOFg7gT", "outputId": "de9f86d0-dbe0-45d8-aada-f9c15ca19445" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(252, 287)" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": 
"code", "source": [ "X = df.drop('class', axis=1)\n", "X_norm = MinMaxScaler().fit_transform(X)\n", "selector = SelectKBest(chi2, k=30)\n", "selector.fit(X_norm, df['class'])\n", "filtered_columns = selector.get_support()\n", "filtered_data = X.loc[:, filtered_columns]\n", "filtered_data['class'] = df['class']\n", "df = filtered_data\n", "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4vjTyVNJg8uy", "outputId": "62831fea-e502-4381-ce3a-f6ae0e9cbc9d" }, "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(252, 31)" ] }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "x = df['class'].value_counts()\n", "plt.pie(x.values,\n", "\t\tlabels = x.index,\n", "\t\tautopct='%1.1f%%')\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 406 }, "id": "twB_CTOsg911", "outputId": "d171a2b8-4f27-478f-d029-600097380eb7" }, "execution_count": 17, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "features = df.drop('class', axis=1)\n", "target = df['class']\n", "\n", "\n", "X_train, X_val, y_train, y_val = train_test_split(features, target,\n", " test_size=0.2,\n", " random_state=10)\n", "\n", "ros = RandomOverSampler(sampling_strategy=1.0, random_state=0)\n", "X_resampled, y_resampled = ros.fit_resample(X_train, y_train)\n", "\n", "X_resampled.shape, y_resampled.value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DoTwF-YLg-_B", "outputId": "466f48c6-0556-4e97-dba9-38972ad2389a" }, "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "((302, 30),\n", " class\n", " 1.0 151\n", " 0.0 151\n", " Name: count, dtype: int64)" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [ "from sklearn.metrics import roc_auc_score as ras\n", "\n", "# Models list\n", "models = [LogisticRegression(class_weight='balanced'), XGBClassifier(), SVC(kernel='rbf', probability=True)]\n", "\n", "# Model training and evaluation\n", "for model in models:\n", " model.fit(X_resampled, y_resampled)\n", "\n", " print(f'{model} : ')\n", "\n", " train_preds = model.predict(X_resampled)\n", " print('Training Accuracy : ', ras(y_resampled, train_preds))\n", "\n", " val_preds = model.predict(X_val)\n", " print('Validation Accuracy : ', ras(y_val, val_preds))\n", " print()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gzWQHuWVhBQ5", "outputId": "75598672-9c9f-4c31-9029-019fb3f43361" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "LogisticRegression(class_weight='balanced') : \n", "Training Accuracy : 0.7814569536423841\n", "Validation Accuracy : 0.8301158301158301\n", "\n", "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, 
device=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=None, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=None, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " multi_strategy=None, n_estimators=None, n_jobs=None,\n", " num_parallel_tree=None, random_state=None, ...) : \n", "Training Accuracy : 1.0\n", "Validation Accuracy : 0.6467181467181468\n", "\n", "SVC(probability=True) : \n", "Training Accuracy : 0.6258278145695364\n", "Validation Accuracy : 0.6457528957528957\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "from sklearn.metrics import ConfusionMatrixDisplay\n", "\n", "# Confusion matrix on the validation split for the first entry of `models`\n", "# (the logistic regression).\n", "log_reg = models[0]\n", "ConfusionMatrixDisplay.from_estimator(log_reg, X_val, y_val)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 455 }, "id": "b7pACb6fFoFs", "outputId": "3cca6a84-372b-4151-939c-9038b31fb693" }, "execution_count": 23, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "w1uFjdFVG8-N" }, "execution_count": null, "outputs": [] } ] }