Regression_with_an_Insurance_dataset

{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":84896,"databundleVersionId":10305135,"sourceType":"competition"}],"dockerImageVersionId":30804,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example,running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current 
session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:39.509798Z","iopub.execute_input":"2024-12-08T17:48:39.510142Z","iopub.status.idle":"2024-12-08T17:48:39.834528Z","shell.execute_reply.started":"2024-12-08T17:48:39.510112Z","shell.execute_reply":"2024-12-08T17:48:39.833634Z"}},"outputs":[{"name":"stdout","text":"/kaggle/input/playground-series-s4e12/sample_submission.csv\n/kaggle/input/playground-series-s4e12/train.csv\n/kaggle/input/playground-series-s4e12/test.csv\n","output_type":"stream"}],"execution_count":3},{"cell_type":"code","source":"train_df = pd.read_csv(\"/kaggle/input/playground-series-s4e12/train.csv\")\ntrain_df.head()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:39.835960Z","iopub.execute_input":"2024-12-08T17:48:39.836313Z","iopub.status.idle":"2024-12-08T17:48:44.967812Z","shell.execute_reply.started":"2024-12-08T17:48:39.836286Z","shell.execute_reply":"2024-12-08T17:48:44.966816Z"}},"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"   id   Age  Gender  Annual Income Marital Status  Number of Dependents  \\\n0   0  19.0  Female        10049.0        Married                   1.0   \n1   1  39.0  Female        31678.0       Divorced                   3.0   \n2   2  23.0    Male        25602.0       Divorced                   3.0   \n3   3  21.0    Male       141855.0        Married                   2.0   \n4   4  21.0    Male        39651.0         Single                   1.0   \n\n  Education Level     Occupation  Health Score  Location  ... Previous Claims  \\\n0      Bachelor's  Self-Employed     22.598761     Urban  ...             2.0   \n1        Master's            NaN     15.569731     Rural  ...             1.0   \n2     High School  Self-Employed     47.177549  Suburban  ...             
1.0   \n3      Bachelor's            NaN     10.938144     Rural  ...             1.0   \n4      Bachelor's  Self-Employed     20.376094     Rural  ...             0.0   \n\n   Vehicle Age  Credit Score  Insurance Duration           Policy Start Date  \\\n0         17.0         372.0                 5.0  2023-12-23 15:21:39.134960   \n1         12.0         694.0                 2.0  2023-06-12 15:21:39.111551   \n2         14.0           NaN                 3.0  2023-09-30 15:21:39.221386   \n3          0.0         367.0                 1.0  2024-06-12 15:21:39.226954   \n4          8.0         598.0                 4.0  2021-12-01 15:21:39.252145   \n\n  Customer Feedback Smoking Status Exercise Frequency Property Type  \\\n0              Poor             No             Weekly         House   \n1           Average            Yes            Monthly         House   \n2              Good            Yes             Weekly         House   \n3              Poor            Yes              Daily     Apartment   \n4              Poor            Yes             Weekly         House   \n\n  Premium Amount  \n0         2869.0  \n1         1483.0  \n2          567.0  \n3          765.0  \n4         2022.0  \n\n[5 rows x 21 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>Age</th>\n      <th>Gender</th>\n      <th>Annual Income</th>\n      <th>Marital Status</th>\n      <th>Number of Dependents</th>\n      <th>Education Level</th>\n      <th>Occupation</th>\n      <th>Health Score</th>\n      <th>Location</th>\n      <th>...</th>\n      <th>Previous Claims</th>\n      <th>Vehicle Age</th>\n      <th>Credit Score</th>\n    
  <th>Insurance Duration</th>\n      <th>Policy Start Date</th>\n      <th>Customer Feedback</th>\n      <th>Smoking Status</th>\n      <th>Exercise Frequency</th>\n      <th>Property Type</th>\n      <th>Premium Amount</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>19.0</td>\n      <td>Female</td>\n      <td>10049.0</td>\n      <td>Married</td>\n      <td>1.0</td>\n      <td>Bachelor's</td>\n      <td>Self-Employed</td>\n      <td>22.598761</td>\n      <td>Urban</td>\n      <td>...</td>\n      <td>2.0</td>\n      <td>17.0</td>\n      <td>372.0</td>\n      <td>5.0</td>\n      <td>2023-12-23 15:21:39.134960</td>\n      <td>Poor</td>\n      <td>No</td>\n      <td>Weekly</td>\n      <td>House</td>\n      <td>2869.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>39.0</td>\n      <td>Female</td>\n      <td>31678.0</td>\n      <td>Divorced</td>\n      <td>3.0</td>\n      <td>Master's</td>\n      <td>NaN</td>\n      <td>15.569731</td>\n      <td>Rural</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>12.0</td>\n      <td>694.0</td>\n      <td>2.0</td>\n      <td>2023-06-12 15:21:39.111551</td>\n      <td>Average</td>\n      <td>Yes</td>\n      <td>Monthly</td>\n      <td>House</td>\n      <td>1483.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>23.0</td>\n      <td>Male</td>\n      <td>25602.0</td>\n      <td>Divorced</td>\n      <td>3.0</td>\n      <td>High School</td>\n      <td>Self-Employed</td>\n      <td>47.177549</td>\n      <td>Suburban</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>14.0</td>\n      <td>NaN</td>\n      <td>3.0</td>\n      <td>2023-09-30 15:21:39.221386</td>\n      <td>Good</td>\n      <td>Yes</td>\n      <td>Weekly</td>\n      <td>House</td>\n      <td>567.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>21.0</td>\n      <td>Male</td>\n      <td>141855.0</td>\n      <td>Married</td>\n      <td>2.0</td>\n    
  <td>Bachelor's</td>\n      <td>NaN</td>\n      <td>10.938144</td>\n      <td>Rural</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>0.0</td>\n      <td>367.0</td>\n      <td>1.0</td>\n      <td>2024-06-12 15:21:39.226954</td>\n      <td>Poor</td>\n      <td>Yes</td>\n      <td>Daily</td>\n      <td>Apartment</td>\n      <td>765.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>21.0</td>\n      <td>Male</td>\n      <td>39651.0</td>\n      <td>Single</td>\n      <td>1.0</td>\n      <td>Bachelor's</td>\n      <td>Self-Employed</td>\n      <td>20.376094</td>\n      <td>Rural</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>8.0</td>\n      <td>598.0</td>\n      <td>4.0</td>\n      <td>2021-12-01 15:21:39.252145</td>\n      <td>Poor</td>\n      <td>Yes</td>\n      <td>Weekly</td>\n      <td>House</td>\n      <td>2022.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 21 columns</p>\n</div>"},"metadata":{}}],"execution_count":4},{"cell_type":"code","source":"# describe() returns a DataFrame, so it has to be printed explicitly here.\nprint(train_df.describe())\n# info() prints its report itself and returns None; wrapping it in print()\n# only added a stray 'None' line at the end of the output.\ntrain_df.info()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:44.969121Z","iopub.execute_input":"2024-12-08T17:48:44.969406Z","iopub.status.idle":"2024-12-08T17:48:46.154613Z","shell.execute_reply.started":"2024-12-08T17:48:44.969378Z","shell.execute_reply":"2024-12-08T17:48:46.153602Z"}},"outputs":[{"name":"stdout","text":"                 id           Age  Annual Income  Number of Dependents  \\\ncount  1.200000e+06  1.181295e+06   1.155051e+06          1.090328e+06   \nmean   5.999995e+05  4.114556e+01   3.274522e+04          2.009934e+00   \nstd    3.464103e+05  1.353995e+01   3.217951e+04          1.417338e+00   \nmin    0.000000e+00  1.800000e+01   1.000000e+00          0.000000e+00   \n25%    2.999998e+05  3.000000e+01   8.001000e+03          1.000000e+00   \n50%    5.999995e+05  4.100000e+01   2.391100e+04          2.000000e+00   \n75%    8.999992e+05  5.300000e+01   4.463400e+04          3.000000e+00 
  \nmax    1.199999e+06  6.400000e+01   1.499970e+05          4.000000e+00   \n\n       Health Score  Previous Claims   Vehicle Age  Credit Score  \\\ncount  1.125924e+06    835971.000000  1.199994e+06  1.062118e+06   \nmean   2.561391e+01         1.002689  9.569889e+00  5.929244e+02   \nstd    1.220346e+01         0.982840  5.776189e+00  1.499819e+02   \nmin    2.012237e+00         0.000000  0.000000e+00  3.000000e+02   \n25%    1.591896e+01         0.000000  5.000000e+00  4.680000e+02   \n50%    2.457865e+01         1.000000  1.000000e+01  5.950000e+02   \n75%    3.452721e+01         2.000000  1.500000e+01  7.210000e+02   \nmax    5.897591e+01         9.000000  1.900000e+01  8.490000e+02   \n\n       Insurance Duration  Premium Amount  \ncount        1.199999e+06    1.200000e+06  \nmean         5.018219e+00    1.102545e+03  \nstd          2.594331e+00    8.649989e+02  \nmin          1.000000e+00    2.000000e+01  \n25%          3.000000e+00    5.140000e+02  \n50%          5.000000e+00    8.720000e+02  \n75%          7.000000e+00    1.509000e+03  \nmax          9.000000e+00    4.999000e+03  \n<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 1200000 entries, 0 to 1199999\nData columns (total 21 columns):\n #   Column                Non-Null Count    Dtype  \n---  ------                --------------    -----  \n 0   id                    1200000 non-null  int64  \n 1   Age                   1181295 non-null  float64\n 2   Gender                1200000 non-null  object \n 3   Annual Income         1155051 non-null  float64\n 4   Marital Status        1181471 non-null  object \n 5   Number of Dependents  1090328 non-null  float64\n 6   Education Level       1200000 non-null  object \n 7   Occupation            841925 non-null   object \n 8   Health Score          1125924 non-null  float64\n 9   Location              1200000 non-null  object \n 10  Policy Type           1200000 non-null  object \n 11  Previous Claims       835971 non-null   float64\n 12  Vehicle Age  
         1199994 non-null  float64\n 13  Credit Score          1062118 non-null  float64\n 14  Insurance Duration    1199999 non-null  float64\n 15  Policy Start Date     1200000 non-null  object \n 16  Customer Feedback     1122176 non-null  object \n 17  Smoking Status        1200000 non-null  object \n 18  Exercise Frequency    1200000 non-null  object \n 19  Property Type         1200000 non-null  object \n 20  Premium Amount        1200000 non-null  float64\ndtypes: float64(9), int64(1), object(11)\nmemory usage: 192.3+ MB\nNone\n","output_type":"stream"}],"execution_count":5},{"cell_type":"code","source":"# Missing values per column: absolute count and share of all rows.\nnull_data = (\n    train_df.isnull().sum()\n    .to_frame('null_count')\n    .assign(null_ratio=lambda counts: (counts['null_count'] / len(train_df)).round(4))\n)\nnull_data","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.156860Z","iopub.execute_input":"2024-12-08T17:48:46.157206Z","iopub.status.idle":"2024-12-08T17:48:46.738189Z","shell.execute_reply.started":"2024-12-08T17:48:46.157168Z","shell.execute_reply":"2024-12-08T17:48:46.737207Z"}},"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"                      null_count  null_ratio\nid                             0      0.0000\nAge                        18705      0.0156\nGender                         0      0.0000\nAnnual Income              44949      0.0375\nMarital Status             18529      0.0154\nNumber of Dependents      109672      0.0914\nEducation Level                0      0.0000\nOccupation                358075      0.2984\nHealth Score               74076      0.0617\nLocation                       0      0.0000\nPolicy Type                    0      0.0000\nPrevious Claims           364029      0.3034\nVehicle Age                    6      0.0000\nCredit Score              137882      0.1149\nInsurance Duration             1      0.0000\nPolicy Start Date              0      0.0000\nCustomer Feedback          77824      
0.0649\nSmoking Status                 0      0.0000\nExercise Frequency             0      0.0000\nProperty Type                  0      0.0000\nPremium Amount                 0      0.0000","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>null_count</th>\n      <th>null_ratio</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>id</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Age</th>\n      <td>18705</td>\n      <td>0.0156</td>\n    </tr>\n    <tr>\n      <th>Gender</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Annual Income</th>\n      <td>44949</td>\n      <td>0.0375</td>\n    </tr>\n    <tr>\n      <th>Marital Status</th>\n      <td>18529</td>\n      <td>0.0154</td>\n    </tr>\n    <tr>\n      <th>Number of Dependents</th>\n      <td>109672</td>\n      <td>0.0914</td>\n    </tr>\n    <tr>\n      <th>Education Level</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Occupation</th>\n      <td>358075</td>\n      <td>0.2984</td>\n    </tr>\n    <tr>\n      <th>Health Score</th>\n      <td>74076</td>\n      <td>0.0617</td>\n    </tr>\n    <tr>\n      <th>Location</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Policy Type</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Previous Claims</th>\n      <td>364029</td>\n      <td>0.3034</td>\n    </tr>\n    <tr>\n      <th>Vehicle Age</th>\n      <td>6</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Credit Score</th>\n      <td>137882</td>\n      <td>0.1149</td>\n    </tr>\n    <tr>\n      <th>Insurance Duration</th>\n      
<td>1</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Policy Start Date</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Customer Feedback</th>\n      <td>77824</td>\n      <td>0.0649</td>\n    </tr>\n    <tr>\n      <th>Smoking Status</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Exercise Frequency</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Property Type</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Premium Amount</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}],"execution_count":6},{"cell_type":"code","source":"train_df.columns","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.739322Z","iopub.execute_input":"2024-12-08T17:48:46.739623Z","iopub.status.idle":"2024-12-08T17:48:46.745549Z","shell.execute_reply.started":"2024-12-08T17:48:46.739596Z","shell.execute_reply":"2024-12-08T17:48:46.744540Z"}},"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',\n       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',\n       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',\n       'Credit Score', 'Insurance Duration', 'Policy Start Date',\n       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',\n       'Property Type', 'Premium Amount'],\n      dtype='object')"},"metadata":{}}],"execution_count":7},{"cell_type":"code","source":"print(train_df[\"Number of 
Dependents\"].describe().astype(int))\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.746919Z","iopub.execute_input":"2024-12-08T17:48:46.747738Z","iopub.status.idle":"2024-12-08T17:48:46.816989Z","shell.execute_reply.started":"2024-12-08T17:48:46.747694Z","shell.execute_reply":"2024-12-08T17:48:46.815981Z"}},"outputs":[{"name":"stdout","text":"count    1090328\nmean           2\nstd            1\nmin            0\n25%            1\n50%            2\n75%            3\nmax            4\nName: Number of Dependents, dtype: int64\n","output_type":"stream"}],"execution_count":8},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df.shape","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.817979Z","iopub.execute_input":"2024-12-08T17:48:46.818278Z","iopub.status.idle":"2024-12-08T17:48:46.824171Z","shell.execute_reply.started":"2024-12-08T17:48:46.818249Z","shell.execute_reply":"2024-12-08T17:48:46.823272Z"}},"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(1200000, 21)"},"metadata":{}}],"execution_count":9},{"cell_type":"code","source":"categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns\ncategorical_cols","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.825313Z","iopub.execute_input":"2024-12-08T17:48:46.825652Z","iopub.status.idle":"2024-12-08T17:48:46.982087Z","shell.execute_reply.started":"2024-12-08T17:48:46.825624Z","shell.execute_reply":"2024-12-08T17:48:46.981140Z"}},"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',\n       'Policy Type', 'Policy Start Date', 'Customer Feedback',\n       'Smoking Status', 'Exercise Frequency', 'Property Type'],\n      
dtype='object')"},"metadata":{}}],"execution_count":10},{"cell_type":"code","source":"# Same missing-value summary, restricted to the categorical columns.\ncategorical_null_data = (\n    train_df[categorical_cols].isnull().sum()\n    .to_frame('null_count')\n    .assign(null_ratio=lambda counts: (counts['null_count'] / len(train_df)).round(4))\n)\ncategorical_null_data","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.983148Z","iopub.execute_input":"2024-12-08T17:48:46.983486Z","iopub.status.idle":"2024-12-08T17:48:47.710754Z","shell.execute_reply.started":"2024-12-08T17:48:46.983460Z","shell.execute_reply":"2024-12-08T17:48:47.709885Z"}},"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"                    null_count  null_ratio\nGender                       0      0.0000\nMarital Status           18529      0.0154\nEducation Level              0      0.0000\nOccupation              358075      0.2984\nLocation                     0      0.0000\nPolicy Type                  0      0.0000\nPolicy Start Date            0      0.0000\nCustomer Feedback        77824      0.0649\nSmoking Status               0      0.0000\nExercise Frequency           0      0.0000\nProperty Type                0      0.0000","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>null_count</th>\n      <th>null_ratio</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Gender</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Marital Status</th>\n      <td>18529</td>\n      <td>0.0154</td>\n    </tr>\n    <tr>\n      <th>Education Level</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n    
  <th>Occupation</th>\n      <td>358075</td>\n      <td>0.2984</td>\n    </tr>\n    <tr>\n      <th>Location</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Policy Type</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Policy Start Date</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Customer Feedback</th>\n      <td>77824</td>\n      <td>0.0649</td>\n    </tr>\n    <tr>\n      <th>Smoking Status</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Exercise Frequency</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n    <tr>\n      <th>Property Type</th>\n      <td>0</td>\n      <td>0.0000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}],"execution_count":11},{"cell_type":"code","source":"# for c in categorical_cols:\n#     print(f'Category: {c}, Count: {len(train_df[c].unique())} Unique values: {train_df[c].unique()}')\n\n# for c in categorical_cols:\n#     print(f'Category: {c}')\n\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:47.714360Z","iopub.execute_input":"2024-12-08T17:48:47.714644Z","iopub.status.idle":"2024-12-08T17:48:47.718805Z","shell.execute_reply.started":"2024-12-08T17:48:47.714601Z","shell.execute_reply":"2024-12-08T17:48:47.717672Z"}},"outputs":[],"execution_count":12},{"cell_type":"code","source":"# Category: Gender ////\n# Category: Marital Status ////\n# Category: Education Level /\n# Category: Occupation ////\n# Category: Location //\n# Category: Policy Type ///\n# Category: Policy Start Date ///\n# Category: Customer Feedback /\n# Category: Smoking Status /\n# Category: Exercise Frequency /\n# Category: Property Type //\n\n# import pandas as pd\n\n# df = train_df.copy()\n\n# binary_mappings = {\n#     'Smoking Status': {'No': 0, 'Yes': 1}\n# }\n# for col, mapping in binary_mappings.items():\n#     if col in df.columns:\n#         df[col] = df[col].map(mapping)\n\n# 
onehot_encode_cols = ['Gender', 'Location', 'Policy Type', 'Property Type']\n# df = pd.get_dummies(df, columns=onehot_encode_cols, drop_first=False)\n\n# ordinal_mappings = {\n#     'Exercise Frequency': {'Rarely': 0, 'Monthly': 1, 'Weekly': 2, 'Daily': 3},\n#     'Customer Feedback': {'Poor': 0, 'Average': 1, 'Good': 2},\n#     'Education Level': {'High School': 0, \"Bachelor's\": 1, \"Master's\": 2, 'PhD': 3},\n#     'Policy Type': {'Premium': 2, 'Comprehensive': 1, 'Basic': 0}\n# }\n# for col, mapping in ordinal_mappings.items():\n#     if col in df.columns:\n#         df[col] = df[col].map(mapping)\n\n# if 'Policy Start Date' in df.columns:\n#     df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date']).astype(int) / 10**9  \n\n# categorical_cols = ['Customer Feedback', 'Marital Status', 'Occupation']\n# for col in categorical_cols:\n#     df[col] = df[col].fillna('Unknown')\n\n# label_encode_cols = ['Customer Feedback', 'Marital Status', 'Occupation']\n# for col in label_encode_cols:\n#     df[col] = df[col].astype('category').cat.codes\n\n# print(df.head())\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:47.719948Z","iopub.execute_input":"2024-12-08T17:48:47.720190Z","iopub.status.idle":"2024-12-08T17:48:47.730914Z","shell.execute_reply.started":"2024-12-08T17:48:47.720168Z","shell.execute_reply":"2024-12-08T17:48:47.730265Z"}},"outputs":[],"execution_count":13},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"categorical_cols = train_df.select_dtypes(include=['object', 
'category']).columns\ncategorical_cols","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:28.802194Z","iopub.execute_input":"2024-12-08T17:52:28.802627Z","iopub.status.idle":"2024-12-08T17:52:28.962929Z","shell.execute_reply.started":"2024-12-08T17:52:28.802594Z","shell.execute_reply":"2024-12-08T17:52:28.961797Z"}},"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',\n       'Policy Type', 'Policy Start Date', 'Customer Feedback',\n       'Smoking Status', 'Exercise Frequency', 'Property Type'],\n      dtype='object')"},"metadata":{}}],"execution_count":15},{"cell_type":"code","source":"# Keep only the numeric columns for the feature engineering further down.\n# errors='ignore' makes this cell safe to re-run: once the columns are gone,\n# a second drop of the same labels would otherwise raise KeyError.\ntrain_df = train_df.drop(columns=categorical_cols, errors='ignore')\ntrain_df","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:28.964602Z","iopub.execute_input":"2024-12-08T17:52:28.964944Z","iopub.status.idle":"2024-12-08T17:52:29.022026Z","shell.execute_reply.started":"2024-12-08T17:52:28.964893Z","shell.execute_reply":"2024-12-08T17:52:29.021056Z"}},"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"              id   Age  Annual Income  Number of Dependents  Health Score  \\\n0              0  19.0        10049.0                   1.0     22.598761   \n1              1  39.0        31678.0                   3.0     15.569731   \n2              2  23.0        25602.0                   3.0     47.177549   \n3              3  21.0       141855.0                   2.0     10.938144   \n4              4  21.0        39651.0                   1.0     20.376094   \n...          ...   ...            ...                   ...           ...   
\n1199995  1199995  36.0        27316.0                   0.0     13.772907   \n1199996  1199996  54.0        35786.0                   NaN     11.483482   \n1199997  1199997  19.0        51884.0                   0.0     14.724469   \n1199998  1199998  55.0            NaN                   1.0     18.547381   \n1199999  1199999  21.0            NaN                   0.0     10.125323   \n\n         Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \\\n0                    2.0         17.0         372.0                 5.0   \n1                    1.0         12.0         694.0                 2.0   \n2                    1.0         14.0           NaN                 3.0   \n3                    1.0          0.0         367.0                 1.0   \n4                    0.0          8.0         598.0                 4.0   \n...                  ...          ...           ...                 ...   \n1199995              NaN          5.0         372.0                 3.0   \n1199996              NaN         10.0         597.0                 4.0   \n1199997              0.0         19.0           NaN                 6.0   \n1199998              1.0          7.0         407.0                 4.0   \n1199999              0.0         18.0         502.0                 6.0   \n\n         Premium Amount  \n0                2869.0  \n1                1483.0  \n2                 567.0  \n3                 765.0  \n4                2022.0  \n...                 ...  
\n1199995          1303.0  \n1199996           821.0  \n1199997           371.0  \n1199998           596.0  \n1199999          2480.0  \n\n[1200000 rows x 10 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>Age</th>\n      <th>Annual Income</th>\n      <th>Number of Dependents</th>\n      <th>Health Score</th>\n      <th>Previous Claims</th>\n      <th>Vehicle Age</th>\n      <th>Credit Score</th>\n      <th>Insurance Duration</th>\n      <th>Premium Amount</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>19.0</td>\n      <td>10049.0</td>\n      <td>1.0</td>\n      <td>22.598761</td>\n      <td>2.0</td>\n      <td>17.0</td>\n      <td>372.0</td>\n      <td>5.0</td>\n      <td>2869.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>39.0</td>\n      <td>31678.0</td>\n      <td>3.0</td>\n      <td>15.569731</td>\n      <td>1.0</td>\n      <td>12.0</td>\n      <td>694.0</td>\n      <td>2.0</td>\n      <td>1483.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>23.0</td>\n      <td>25602.0</td>\n      <td>3.0</td>\n      <td>47.177549</td>\n      <td>1.0</td>\n      <td>14.0</td>\n      <td>NaN</td>\n      <td>3.0</td>\n      <td>567.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>21.0</td>\n      <td>141855.0</td>\n      <td>2.0</td>\n      <td>10.938144</td>\n      <td>1.0</td>\n      <td>0.0</td>\n      <td>367.0</td>\n      <td>1.0</td>\n      <td>765.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>21.0</td>\n      <td>39651.0</td>\n      <td>1.0</td>\n      
<td>20.376094</td>\n      <td>0.0</td>\n      <td>8.0</td>\n      <td>598.0</td>\n      <td>4.0</td>\n      <td>2022.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1199995</th>\n      <td>1199995</td>\n      <td>36.0</td>\n      <td>27316.0</td>\n      <td>0.0</td>\n      <td>13.772907</td>\n      <td>NaN</td>\n      <td>5.0</td>\n      <td>372.0</td>\n      <td>3.0</td>\n      <td>1303.0</td>\n    </tr>\n    <tr>\n      <th>1199996</th>\n      <td>1199996</td>\n      <td>54.0</td>\n      <td>35786.0</td>\n      <td>NaN</td>\n      <td>11.483482</td>\n      <td>NaN</td>\n      <td>10.0</td>\n      <td>597.0</td>\n      <td>4.0</td>\n      <td>821.0</td>\n    </tr>\n    <tr>\n      <th>1199997</th>\n      <td>1199997</td>\n      <td>19.0</td>\n      <td>51884.0</td>\n      <td>0.0</td>\n      <td>14.724469</td>\n      <td>0.0</td>\n      <td>19.0</td>\n      <td>NaN</td>\n      <td>6.0</td>\n      <td>371.0</td>\n    </tr>\n    <tr>\n      <th>1199998</th>\n      <td>1199998</td>\n      <td>55.0</td>\n      <td>NaN</td>\n      <td>1.0</td>\n      <td>18.547381</td>\n      <td>1.0</td>\n      <td>7.0</td>\n      <td>407.0</td>\n      <td>4.0</td>\n      <td>596.0</td>\n    </tr>\n    <tr>\n      <th>1199999</th>\n      <td>1199999</td>\n      <td>21.0</td>\n      <td>NaN</td>\n      <td>0.0</td>\n      <td>10.125323</td>\n      <td>0.0</td>\n      <td>18.0</td>\n      <td>502.0</td>\n      <td>6.0</td>\n      <td>2480.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>1200000 rows × 10 columns</p>\n</div>"},"metadata":{}}],"execution_count":16},{"cell_type":"code","source":"import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\n# fig, axes = plt.subplots(len(categorical_cols) + 1, 1, figsize=(10, 5 * 
(len(categorical_cols) + 1)))\n# fig.tight_layout(pad=6.0)\n\n# for i, col in enumerate(categorical_cols):\n#     sns.boxplot(x=train_df[col], y=train_df['Premium Amount'], ax=axes[i])\n#     axes[i].set_title(f'Boxplot of {col} vs Premium Amount')\n#     axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)\n\n# corr_matrix = train_df.select_dtypes(include=['number']).corr()\n# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=axes[len(categorical_cols)])\n# axes[len(categorical_cols)].set_title('Correlation Matrix Heatmap')\n\n# plt.show()\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:29.023474Z","iopub.execute_input":"2024-12-08T17:52:29.023924Z","iopub.status.idle":"2024-12-08T17:52:29.741584Z","shell.execute_reply.started":"2024-12-08T17:52:29.023882Z","shell.execute_reply":"2024-12-08T17:52:29.740653Z"}},"outputs":[],"execution_count":17},{"cell_type":"code","source":"# train_df = train_df.drop(columns=categorical_cols)\n# train_df","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:29.743353Z","iopub.execute_input":"2024-12-08T17:52:29.743694Z","iopub.status.idle":"2024-12-08T17:52:29.747412Z","shell.execute_reply.started":"2024-12-08T17:52:29.743668Z","shell.execute_reply":"2024-12-08T17:52:29.746561Z"}},"outputs":[],"execution_count":18},{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nfrom scipy.stats import skew, kurtosis\n\n# Feature engineering on the numeric-only training frame.\n# Work on a copy so re-running this cell never changes train_df.\ndf = train_df.copy()\n\n# Mean-impute missing feature values only. The row key and the target are left\n# out so the label can never be overwritten with an imputed value.\nfeature_means = df.drop(columns=['id', 'Premium Amount'], errors='ignore').mean(numeric_only=True)\ndf = df.fillna(feature_means)\n\ndf['Mean_Income_Duration'] = (df['Annual Income'] + df['Insurance Duration']) / 2\n\n# NOTE(review): skew() and kurtosis() each return one scalar per column, so the\n# six columns below hold the same value on every row. They are kept so the\n# column set stays unchanged; consider dropping them before modelling.\n# No nan_policy is needed: the inputs were mean-imputed above.\nmoment_sources = {'Income': 'Annual Income', 'Health': 'Health Score', 'Claims': 'Previous Claims'}\nfor prefix, source_col in moment_sources.items():\n    df[f'{prefix}_Skewness'] = skew(df[source_col])\nfor prefix, source_col in moment_sources.items():\n    df[f'{prefix}_Kurtosis'] = kurtosis(df[source_col])\n\n# Ratio features. The +1 keeps the denominator positive for zero dependents.\ndf['Income_Per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)\n# Insurance Duration has a minimum of 1 in the describe() output earlier in the\n# notebook, so this division does not hit zero on the training data.\ndf['Claims_Per_Year'] = df['Previous Claims'] / df['Insurance Duration']\n\ndf['Age_to_VehicleAge_Diff'] = df['Age'] - df['Vehicle Age']\ndf['Health_to_Credit_Score_Ratio'] = df['Health Score'] / (df['Credit Score'] + 1)\n\ndf['Total_Assets'] = df['Annual Income'] + df['Credit Score']\n# Health Score has a minimum of about 2.01 in the same describe() output.\ndf['Overall_Risk'] = (df['Vehicle Age'] + df['Previous Claims'] + df['Insurance Duration']) / df['Health Score']\n\n# log1p(x) = log(1 + x), which stays defined at zero.\ndf['Log_Income'] = np.log1p(df['Annual Income'])\ndf['Log_Credit'] = np.log1p(df['Credit Score'])\n\n# Quartile buckets, labelled 1 (lowest quarter) to 4 (highest quarter).\ndf['Income_Quartile'] = pd.qcut(df['Annual Income'], 4, labels=[1, 2, 3, 4]).astype(int)\ndf['Credit_Quartile'] = pd.qcut(df['Credit Score'], 4, labels=[1, 2, 3, 4]).astype(int)\n\n# Binary flags.\ndf['High_Income'] = (df['Annual Income'] > df['Annual Income'].median()).astype(int)\ndf['Old_Vehicle'] = (df['Vehicle Age'] > 10).astype(int)\n\n# Display the enriched dataset\ndf.head()\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:29.748330Z","iopub.execute_input":"2024-12-08T17:52:29.748570Z","iopub.status.idle":"2024-12-08T17:52:30.332481Z","shell.execute_reply.started":"2024-12-08T17:52:29.748546Z","shell.execute_reply":"2024-12-08T17:52:30.331638Z"}},"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"   id   Age  Annual Income  Number of Dependents  Health Score  \\\n0   0  19.0        10049.0                   1.0     22.598761   \n1   1  39.0        31678.0                   3.0     15.569731   \n2   2  23.0        25602.0                   3.0     47.177549   \n3   3  21.0       141855.0                   2.0     10.938144   \n4   4  21.0        39651.0                   1.0     20.376094   \n\n   Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \\\n0              2.0         17.0     372.00000                 5.0   \n1              1.0         12.0     694.00000                 
2.0   \n2              1.0         14.0     592.92435                 3.0   \n3              1.0          0.0     367.00000                 1.0   \n4              0.0          8.0     598.00000                 4.0   \n\n   Premium Amount  ...  Age_to_VehicleAge_Diff  Health_to_Credit_Score_Ratio  \\\n0          2869.0  ...                     2.0                      0.060586   \n1          1483.0  ...                    27.0                      0.022402   \n2           567.0  ...                     9.0                      0.079434   \n3           765.0  ...                    21.0                      0.029723   \n4          2022.0  ...                    13.0                      0.034017   \n\n   Total_Assets  Overall_Risk  Log_Income  Log_Credit  Income_Quartile  \\\n0   10421.00000      1.062005    9.215328    5.921578                2   \n1   32372.00000      0.963408   10.363409    6.543912                3   \n2   26194.92435      0.381537   10.150465    6.386752                3   \n3  142222.00000      0.182846   11.862568    5.908083                4   \n4   40249.00000      0.588925   10.587897    6.395262                3   \n\n   Credit_Quartile  High_Income  Old_Vehicle  \n0                1            0            1  \n1                3            1            1  \n2                2            1            1  \n3                1            1            0  \n4                3            1            0  \n\n[5 rows x 29 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>Age</th>\n      <th>Annual Income</th>\n      <th>Number of Dependents</th>\n      <th>Health Score</th>\n      <th>Previous 
Claims</th>\n      <th>Vehicle Age</th>\n      <th>Credit Score</th>\n      <th>Insurance Duration</th>\n      <th>Premium Amount</th>\n      <th>...</th>\n      <th>Age_to_VehicleAge_Diff</th>\n      <th>Health_to_Credit_Score_Ratio</th>\n      <th>Total_Assets</th>\n      <th>Overall_Risk</th>\n      <th>Log_Income</th>\n      <th>Log_Credit</th>\n      <th>Income_Quartile</th>\n      <th>Credit_Quartile</th>\n      <th>High_Income</th>\n      <th>Old_Vehicle</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>19.0</td>\n      <td>10049.0</td>\n      <td>1.0</td>\n      <td>22.598761</td>\n      <td>2.0</td>\n      <td>17.0</td>\n      <td>372.00000</td>\n      <td>5.0</td>\n      <td>2869.0</td>\n      <td>...</td>\n      <td>2.0</td>\n      <td>0.060586</td>\n      <td>10421.00000</td>\n      <td>1.062005</td>\n      <td>9.215328</td>\n      <td>5.921578</td>\n      <td>2</td>\n      <td>1</td>\n      <td>0</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>39.0</td>\n      <td>31678.0</td>\n      <td>3.0</td>\n      <td>15.569731</td>\n      <td>1.0</td>\n      <td>12.0</td>\n      <td>694.00000</td>\n      <td>2.0</td>\n      <td>1483.0</td>\n      <td>...</td>\n      <td>27.0</td>\n      <td>0.022402</td>\n      <td>32372.00000</td>\n      <td>0.963408</td>\n      <td>10.363409</td>\n      <td>6.543912</td>\n      <td>3</td>\n      <td>3</td>\n      <td>1</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>23.0</td>\n      <td>25602.0</td>\n      <td>3.0</td>\n      <td>47.177549</td>\n      <td>1.0</td>\n      <td>14.0</td>\n      <td>592.92435</td>\n      <td>3.0</td>\n      <td>567.0</td>\n      <td>...</td>\n      <td>9.0</td>\n      <td>0.079434</td>\n      <td>26194.92435</td>\n      <td>0.381537</td>\n      <td>10.150465</td>\n      <td>6.386752</td>\n      <td>3</td>\n      <td>2</td>\n      <td>1</td>\n      <td>1</td>\n    
# %%
# Dimensions of the engineered frame.
df.shape

# %%
# Per-column missing-value summary: absolute count and share of rows
# (rounded to 4 decimals).
null_data = df.isnull().sum().to_frame('null_count')
null_data['null_ratio'] = (null_data['null_count'] / len(df)).round(4)
null_data

# %%
# Dtypes, non-null counts and memory footprint of the engineered frame.
df.info()
# %%
# Train a small feed-forward network on log1p(Premium Amount) and report RMSLE
# on a 10% hold-out split.
# NOTE(review): the saved training log shows loss/MAE in the hundreds of
# thousands / hundreds, which does not look like a log-scale target. Re-run the
# cell to confirm the stored output matches this code.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Separate features and target
X = df.drop(columns=['Premium Amount'])
y = df['Premium Amount']

# Train-test split. The unscaled frames are kept under *_raw names so that
# callers of predict_premium() can be given genuinely raw features.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Standardize features (important for NN models). The scaler is fitted on the
# training split only and reused for every later transform.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Log-transform the training target to stabilize variance. y_test stays in
# original units because that is what the metric below needs.
y_train = np.log1p(y_train_raw)

# Build the Neural Network model. An explicit Input layer replaces the
# `input_shape=` argument, which Keras warns about (see the saved stderr output).
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.05),
    Dense(16, activation='relu'),
    Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=25, batch_size=32, verbose=1)

# Predict and evaluate on the hold-out split (X_test is already standardised).
y_pred = np.expm1(model.predict(X_test)).ravel()  # reverse the log transformation, 1-D like y_test
# expm1 of a negative log-prediction lies in (-1, 0); mean_squared_log_error
# rejects negative values, so clamp to 0 (premiums are assumed non-negative).
y_pred = np.clip(y_pred, 0, None)

# Calculate RMSLE
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))

print(f"Root Mean Squared Logarithmic Error (RMSLE): {rmsle}")


# Predict method
def predict_premium(X_input):
    """Predict premium amounts for raw (unscaled) feature rows.

    Parameters
    ----------
    X_input : pandas.DataFrame or array-like
        Engineered features in original units. A DataFrame is re-ordered to the
        training columns (``X.columns``); extra columns are ignored and a
        missing one raises ``KeyError``. Any other array-like is interpreted
        positionally in ``X.columns`` order.

    Returns
    -------
    numpy.ndarray
        Predictions in original premium units, shape ``(n_rows, 1)``.

    Notes
    -----
    The function applies the fitted ``scaler`` itself, so do NOT pass data that
    has already been standardised; it would be scaled twice.
    """
    if isinstance(X_input, pd.DataFrame):
        features = X_input[X.columns]
    else:
        features = pd.DataFrame(X_input, columns=X.columns)
    X_input_scaled = scaler.transform(features)
    predictions = model.predict(X_input_scaled)
    return np.expm1(predictions)  # Reverse the log transformation


# Example usage of predict: take RAW hold-out rows. The previous version passed
# rows of the already-standardised X_test, which predict_premium() then scaled
# a second time. Kept as a NumPy array so positional access such as
# example_data[0] keeps working.
example_data = X_test_raw.iloc[:5].to_numpy()  # Replace with new input data
predictions = predict_premium(example_data)
print("Predicted Premium Amounts:", predictions)
# %%
# add_compound_data(convert_categorical_to_numeric(test_df), scaler = scaler, fit_scaler=False).shape

# %%
test_df = pd.read_csv("/kaggle/input/playground-series-s4e12/test.csv")

test_df.head()

# %%
def convert_categorical_to_numeric(df):
    """Return a copy of ``df`` with its categorical columns encoded as numbers.

    Encoding applied to the columns that are present (absent ones are skipped):

    * 'Smoking Status'              -> 0/1
    * 'Gender', 'Location', 'Policy Type', 'Property Type' -> one-hot columns
    * 'Exercise Frequency', 'Education Level' -> ordinal rank, NaN kept
    * 'Customer Feedback'           -> ordinal rank (Poor=0 .. Good=2), missing = -1
    * 'Policy Start Date'           -> seconds since 1970-01-01 (float, NaN if missing)
    * 'Marital Status', 'Occupation' -> integer category codes, missing -> 'Unknown'

    The input frame is never modified.

    NOTE(review): the category codes for 'Marital Status' / 'Occupation' are
    derived from the values present in the frame passed in, so train and test
    only get the same codes if they contain the same set of values.
    """
    # Work on a copy. The earlier version mapped 'Smoking Status' directly on
    # the caller's frame, so a second run turned the whole column into NaN.
    df = df.copy()

    binary_mappings = {
        'Smoking Status': {'No': 0, 'Yes': 1}
    }
    for col, mapping in binary_mappings.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)

    onehot_encode_cols = [
        col for col in ['Gender', 'Location', 'Policy Type', 'Property Type']
        if col in df.columns
    ]
    df = pd.get_dummies(df, columns=onehot_encode_cols, drop_first=False)

    # 'Policy Type' used to be listed here as well, but it is consumed by the
    # one-hot step above, so that mapping could never run.
    ordinal_mappings = {
        'Exercise Frequency': {'Rarely': 0, 'Monthly': 1, 'Weekly': 2, 'Daily': 3},
        'Customer Feedback': {'Poor': 0, 'Average': 1, 'Good': 2},
        'Education Level': {'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3},
    }
    for col, mapping in ordinal_mappings.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)

    # Keep the ordinal rank for 'Customer Feedback' and mark missing values
    # with -1. Previously the ranks were filled with the string 'Unknown' and
    # label-encoded again, which made the final codes depend on row order.
    if 'Customer Feedback' in df.columns:
        df['Customer Feedback'] = df['Customer Feedback'].fillna(-1).astype(int)

    if 'Policy Start Date' in df.columns:
        start = pd.to_datetime(df['Policy Start Date'])
        # Timedelta arithmetic is independent of the datetime storage unit and
        # yields NaN (instead of failing or a sentinel integer) for missing dates.
        epoch = pd.Timestamp(0, tz=start.dt.tz)
        df['Policy Start Date'] = (start - epoch) / pd.Timedelta(seconds=1)

    for col in ['Marital Status', 'Occupation']:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown').astype('category').cat.codes

    return df


convert_categorical_to_numeric(test_df).shape

# %%
def add_compound_data(df, scaler=None, fit_scaler=False):
    """Drop the categorical columns and append the engineered numeric features.

    Mirrors the feature-engineering cell used for the training frame. Relies on
    notebook-level names: ``categorical_cols`` (columns to drop), ``skew`` /
    ``kurtosis`` (scipy.stats), ``np`` and ``pd``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the base columns ('Annual Income', 'Insurance Duration',
        'Health Score', 'Previous Claims', 'Number of Dependents', 'Age',
        'Vehicle Age', 'Credit Score'). It is not modified.
    scaler : object with ``fit`` / ``transform``, optional
        When given, the feature columns are standardised with it. Leave it as
        ``None`` if the result is passed to ``predict_premium``, which applies
        the scaler itself.
    fit_scaler : bool, default False
        Fit ``scaler`` on this frame's numeric columns before transforming.

    Returns
    -------
    pandas.DataFrame
        New frame with the added features (optionally standardised).

    Notes
    -----
    Fill means, skewness/kurtosis, quartile edges and the income median are all
    computed from the frame passed in, so they differ between train and test.
    """
    df = df.drop(columns=categorical_cols, errors='ignore')
    # numeric_only=True: DataFrame.mean() raises on leftover non-numeric columns
    # with pandas >= 2.0; such columns are simply left unfilled.
    df = df.fillna(df.mean(numeric_only=True))
    # df["Smoking Status"] = 0.5

    # Add new features
    df['Mean_Income_Duration'] = (df['Annual Income'] + df['Insurance Duration']) / 2
    # Each skew()/kurtosis() call yields one statistic for the whole column, so
    # these six columns repeat a single value on every row.
    df['Income_Skewness'] = skew(df['Annual Income'], nan_policy='omit')
    df['Health_Skewness'] = skew(df['Health Score'], nan_policy='omit')
    df['Claims_Skewness'] = skew(df['Previous Claims'], nan_policy='omit')

    df['Income_Kurtosis'] = kurtosis(df['Annual Income'], nan_policy='omit')
    df['Health_Kurtosis'] = kurtosis(df['Health Score'], nan_policy='omit')
    df['Claims_Kurtosis'] = kurtosis(df['Previous Claims'], nan_policy='omit')

    df['Income_Per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)
    df['Claims_Per_Year'] = df['Previous Claims'] / df['Insurance Duration']
    df['Age_to_VehicleAge_Diff'] = df['Age'] - df['Vehicle Age']
    df['Health_to_Credit_Score_Ratio'] = df['Health Score'] / (df['Credit Score'] + 1)

    df['Total_Assets'] = df['Annual Income'] + df['Credit Score']
    df['Overall_Risk'] = (df['Vehicle Age'] + df['Previous Claims'] + df['Insurance Duration']) / df['Health Score']

    # log1p avoids log(0) for zero-valued inputs
    df['Log_Income'] = np.log1p(df['Annual Income'])
    df['Log_Credit'] = np.log1p(df['Credit Score'])

    df['Income_Quartile'] = pd.qcut(df['Annual Income'], 4, labels=[1, 2, 3, 4]).astype(int)
    df['Credit_Quartile'] = pd.qcut(df['Credit Score'], 4, labels=[1, 2, 3, 4]).astype(int)

    df['High_Income'] = (df['Annual Income'] > df['Annual Income'].median()).astype(int)
    df['Old_Vehicle'] = (df['Vehicle Age'] > 10).astype(int)

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if scaler is not None:
        if fit_scaler:
            scaler.fit(df[numeric_cols])
        # Transform exactly the columns the scaler was fitted on, in that order,
        # when it recorded them; otherwise fall back to all numeric columns.
        scaled_cols = list(getattr(scaler, 'feature_names_in_', numeric_cols))
        df[scaled_cols] = scaler.transform(df[scaled_cols])

    return df


# transformed_test_df = convert_categorical_to_numeric(test_df)
# The scaler is deliberately NOT passed here: predict_premium() standardises its
# input itself, so scaling in both places would standardise the test set twice.
transformed_test_df = add_compound_data(test_df)
transformed_test_df

# %%
# isna() works for every dtype; the Inf check is limited to numeric columns so
# a leftover non-numeric column cannot make np.isinf raise.
print("Any NaN values:", transformed_test_df.isna().to_numpy().any())
print("Any Inf values:", np.isinf(transformed_test_df.select_dtypes(include='number')).to_numpy().any())

# %%
# Per-column missing-value summary for the transformed test frame.
null_data = pd.DataFrame()
null_data['null_count'] = transformed_test_df.isnull().sum()
null_data['null_ratio'] = round(null_data['null_count'] / len(transformed_test_df), 4)
null_data

# %%
transformed_test_df.shape
# %%
# Compare the test feature columns with the ones the model was trained on.
print("Columns in transformed_test_df:", transformed_test_df.columns)
print("Columns in X_train:", X.columns)
missing_columns = set(X.columns) - set(transformed_test_df.columns)
extra_columns = set(transformed_test_df.columns) - set(X.columns)
print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# %%
transformed_test_df

# %%
# Per-column range and mean of the test features.
print("Min value in transformed_test_df:", transformed_test_df.min())
print("Max value in transformed_test_df:", transformed_test_df.max())
print("Mean value in transformed_test_df:", transformed_test_df.mean())

# %%
# Number of features in one example row.
len(example_data[0])
# predict_premium(example_data)

# %%
test_df.shape

# %%
predictions = predict_premium(transformed_test_df)

# %%
# .copy() makes `submission` an independent frame. Adding a column to the bare
# test_df[['id']] selection triggers pandas' SettingWithCopyWarning.
submission = test_df[['id']].copy()
submission

# %%
# transformed_test_df = transformed_test_df[X.columns]

# %%
# One prediction per test row; flatten the model output to 1-D so the
# assignment is an unambiguous column of scalars.
assert len(predictions) == len(submission), "prediction count does not match the number of test rows"
submission["Premium Amount"] = np.ravel(predictions)

# %%
print("Shape of training data:", X.shape)
print("Shape of test data:", transformed_test_df.shape)

# %%
submission

# %%
# Write the submission file to the working directory.
submission.to_csv('submission.csv', index=False)