-
Notifications
You must be signed in to change notification settings - Fork 0
/
Regression_with_an_Insurance_dataset
1 lines (1 loc) · 59.6 KB
/
Regression_with_an_Insurance_dataset
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":84896,"databundleVersionId":10305135,"sourceType":"competition"}],"dockerImageVersionId":30804,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\n# Walk the input tree and print the full path of every file found.\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current 
session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:39.509798Z","iopub.execute_input":"2024-12-08T17:48:39.510142Z","iopub.status.idle":"2024-12-08T17:48:39.834528Z","shell.execute_reply.started":"2024-12-08T17:48:39.510112Z","shell.execute_reply":"2024-12-08T17:48:39.833634Z"}},"outputs":[{"name":"stdout","text":"/kaggle/input/playground-series-s4e12/sample_submission.csv\n/kaggle/input/playground-series-s4e12/train.csv\n/kaggle/input/playground-series-s4e12/test.csv\n","output_type":"stream"}],"execution_count":3},{"cell_type":"code","source":"train_df = pd.read_csv(\"/kaggle/input/playground-series-s4e12/train.csv\")\ntrain_df.head()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:39.835960Z","iopub.execute_input":"2024-12-08T17:48:39.836313Z","iopub.status.idle":"2024-12-08T17:48:44.967812Z","shell.execute_reply.started":"2024-12-08T17:48:39.836286Z","shell.execute_reply":"2024-12-08T17:48:44.966816Z"}},"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" id Age Gender Annual Income Marital Status Number of Dependents \\\n0 0 19.0 Female 10049.0 Married 1.0 \n1 1 39.0 Female 31678.0 Divorced 3.0 \n2 2 23.0 Male 25602.0 Divorced 3.0 \n3 3 21.0 Male 141855.0 Married 2.0 \n4 4 21.0 Male 39651.0 Single 1.0 \n\n Education Level Occupation Health Score Location ... Previous Claims \\\n0 Bachelor's Self-Employed 22.598761 Urban ... 2.0 \n1 Master's NaN 15.569731 Rural ... 1.0 \n2 High School Self-Employed 47.177549 Suburban ... 1.0 \n3 Bachelor's NaN 10.938144 Rural ... 1.0 \n4 Bachelor's Self-Employed 20.376094 Rural ... 
0.0 \n\n Vehicle Age Credit Score Insurance Duration Policy Start Date \\\n0 17.0 372.0 5.0 2023-12-23 15:21:39.134960 \n1 12.0 694.0 2.0 2023-06-12 15:21:39.111551 \n2 14.0 NaN 3.0 2023-09-30 15:21:39.221386 \n3 0.0 367.0 1.0 2024-06-12 15:21:39.226954 \n4 8.0 598.0 4.0 2021-12-01 15:21:39.252145 \n\n Customer Feedback Smoking Status Exercise Frequency Property Type \\\n0 Poor No Weekly House \n1 Average Yes Monthly House \n2 Good Yes Weekly House \n3 Poor Yes Daily Apartment \n4 Poor Yes Weekly House \n\n Premium Amount \n0 2869.0 \n1 1483.0 \n2 567.0 \n3 765.0 \n4 2022.0 \n\n[5 rows x 21 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>Age</th>\n <th>Gender</th>\n <th>Annual Income</th>\n <th>Marital Status</th>\n <th>Number of Dependents</th>\n <th>Education Level</th>\n <th>Occupation</th>\n <th>Health Score</th>\n <th>Location</th>\n <th>...</th>\n <th>Previous Claims</th>\n <th>Vehicle Age</th>\n <th>Credit Score</th>\n <th>Insurance Duration</th>\n <th>Policy Start Date</th>\n <th>Customer Feedback</th>\n <th>Smoking Status</th>\n <th>Exercise Frequency</th>\n <th>Property Type</th>\n <th>Premium Amount</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>19.0</td>\n <td>Female</td>\n <td>10049.0</td>\n <td>Married</td>\n <td>1.0</td>\n <td>Bachelor's</td>\n <td>Self-Employed</td>\n <td>22.598761</td>\n <td>Urban</td>\n <td>...</td>\n <td>2.0</td>\n <td>17.0</td>\n <td>372.0</td>\n <td>5.0</td>\n <td>2023-12-23 15:21:39.134960</td>\n <td>Poor</td>\n <td>No</td>\n <td>Weekly</td>\n <td>House</td>\n <td>2869.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>39.0</td>\n <td>Female</td>\n <td>31678.0</td>\n <td>Divorced</td>\n 
<td>3.0</td>\n <td>Master's</td>\n <td>NaN</td>\n <td>15.569731</td>\n <td>Rural</td>\n <td>...</td>\n <td>1.0</td>\n <td>12.0</td>\n <td>694.0</td>\n <td>2.0</td>\n <td>2023-06-12 15:21:39.111551</td>\n <td>Average</td>\n <td>Yes</td>\n <td>Monthly</td>\n <td>House</td>\n <td>1483.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>23.0</td>\n <td>Male</td>\n <td>25602.0</td>\n <td>Divorced</td>\n <td>3.0</td>\n <td>High School</td>\n <td>Self-Employed</td>\n <td>47.177549</td>\n <td>Suburban</td>\n <td>...</td>\n <td>1.0</td>\n <td>14.0</td>\n <td>NaN</td>\n <td>3.0</td>\n <td>2023-09-30 15:21:39.221386</td>\n <td>Good</td>\n <td>Yes</td>\n <td>Weekly</td>\n <td>House</td>\n <td>567.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>21.0</td>\n <td>Male</td>\n <td>141855.0</td>\n <td>Married</td>\n <td>2.0</td>\n <td>Bachelor's</td>\n <td>NaN</td>\n <td>10.938144</td>\n <td>Rural</td>\n <td>...</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>367.0</td>\n <td>1.0</td>\n <td>2024-06-12 15:21:39.226954</td>\n <td>Poor</td>\n <td>Yes</td>\n <td>Daily</td>\n <td>Apartment</td>\n <td>765.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>21.0</td>\n <td>Male</td>\n <td>39651.0</td>\n <td>Single</td>\n <td>1.0</td>\n <td>Bachelor's</td>\n <td>Self-Employed</td>\n <td>20.376094</td>\n <td>Rural</td>\n <td>...</td>\n <td>0.0</td>\n <td>8.0</td>\n <td>598.0</td>\n <td>4.0</td>\n <td>2021-12-01 15:21:39.252145</td>\n <td>Poor</td>\n <td>Yes</td>\n <td>Weekly</td>\n <td>House</td>\n <td>2022.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 21 
columns</p>\n</div>"},"metadata":{}}],"execution_count":4},{"cell_type":"code","source":"print(train_df.describe())\nprint(train_df.info())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:44.969121Z","iopub.execute_input":"2024-12-08T17:48:44.969406Z","iopub.status.idle":"2024-12-08T17:48:46.154613Z","shell.execute_reply.started":"2024-12-08T17:48:44.969378Z","shell.execute_reply":"2024-12-08T17:48:46.153602Z"}},"outputs":[{"name":"stdout","text":" id Age Annual Income Number of Dependents \\\ncount 1.200000e+06 1.181295e+06 1.155051e+06 1.090328e+06 \nmean 5.999995e+05 4.114556e+01 3.274522e+04 2.009934e+00 \nstd 3.464103e+05 1.353995e+01 3.217951e+04 1.417338e+00 \nmin 0.000000e+00 1.800000e+01 1.000000e+00 0.000000e+00 \n25% 2.999998e+05 3.000000e+01 8.001000e+03 1.000000e+00 \n50% 5.999995e+05 4.100000e+01 2.391100e+04 2.000000e+00 \n75% 8.999992e+05 5.300000e+01 4.463400e+04 3.000000e+00 \nmax 1.199999e+06 6.400000e+01 1.499970e+05 4.000000e+00 \n\n Health Score Previous Claims Vehicle Age Credit Score \\\ncount 1.125924e+06 835971.000000 1.199994e+06 1.062118e+06 \nmean 2.561391e+01 1.002689 9.569889e+00 5.929244e+02 \nstd 1.220346e+01 0.982840 5.776189e+00 1.499819e+02 \nmin 2.012237e+00 0.000000 0.000000e+00 3.000000e+02 \n25% 1.591896e+01 0.000000 5.000000e+00 4.680000e+02 \n50% 2.457865e+01 1.000000 1.000000e+01 5.950000e+02 \n75% 3.452721e+01 2.000000 1.500000e+01 7.210000e+02 \nmax 5.897591e+01 9.000000 1.900000e+01 8.490000e+02 \n\n Insurance Duration Premium Amount \ncount 1.199999e+06 1.200000e+06 \nmean 5.018219e+00 1.102545e+03 \nstd 2.594331e+00 8.649989e+02 \nmin 1.000000e+00 2.000000e+01 \n25% 3.000000e+00 5.140000e+02 \n50% 5.000000e+00 8.720000e+02 \n75% 7.000000e+00 1.509000e+03 \nmax 9.000000e+00 4.999000e+03 \n<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 1200000 entries, 0 to 1199999\nData columns (total 21 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 id 1200000 
non-null int64 \n 1 Age 1181295 non-null float64\n 2 Gender 1200000 non-null object \n 3 Annual Income 1155051 non-null float64\n 4 Marital Status 1181471 non-null object \n 5 Number of Dependents 1090328 non-null float64\n 6 Education Level 1200000 non-null object \n 7 Occupation 841925 non-null object \n 8 Health Score 1125924 non-null float64\n 9 Location 1200000 non-null object \n 10 Policy Type 1200000 non-null object \n 11 Previous Claims 835971 non-null float64\n 12 Vehicle Age 1199994 non-null float64\n 13 Credit Score 1062118 non-null float64\n 14 Insurance Duration 1199999 non-null float64\n 15 Policy Start Date 1200000 non-null object \n 16 Customer Feedback 1122176 non-null object \n 17 Smoking Status 1200000 non-null object \n 18 Exercise Frequency 1200000 non-null object \n 19 Property Type 1200000 non-null object \n 20 Premium Amount 1200000 non-null float64\ndtypes: float64(9), int64(1), object(11)\nmemory usage: 192.3+ MB\nNone\n","output_type":"stream"}],"execution_count":5},{"cell_type":"code","source":"null_data = pd.DataFrame()\nnull_data['null_count'] = train_df.isnull().sum()\nnull_data['null_ratio'] = round(null_data['null_count'] / len(train_df), 4)\nnull_data","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.156860Z","iopub.execute_input":"2024-12-08T17:48:46.157206Z","iopub.status.idle":"2024-12-08T17:48:46.738189Z","shell.execute_reply.started":"2024-12-08T17:48:46.157168Z","shell.execute_reply":"2024-12-08T17:48:46.737207Z"}},"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" null_count null_ratio\nid 0 0.0000\nAge 18705 0.0156\nGender 0 0.0000\nAnnual Income 44949 0.0375\nMarital Status 18529 0.0154\nNumber of Dependents 109672 0.0914\nEducation Level 0 0.0000\nOccupation 358075 0.2984\nHealth Score 74076 0.0617\nLocation 0 0.0000\nPolicy Type 0 0.0000\nPrevious Claims 364029 0.3034\nVehicle Age 6 0.0000\nCredit Score 137882 0.1149\nInsurance Duration 1 0.0000\nPolicy 
Start Date 0 0.0000\nCustomer Feedback 77824 0.0649\nSmoking Status 0 0.0000\nExercise Frequency 0 0.0000\nProperty Type 0 0.0000\nPremium Amount 0 0.0000","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>null_count</th>\n <th>null_ratio</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>id</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Age</th>\n <td>18705</td>\n <td>0.0156</td>\n </tr>\n <tr>\n <th>Gender</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Annual Income</th>\n <td>44949</td>\n <td>0.0375</td>\n </tr>\n <tr>\n <th>Marital Status</th>\n <td>18529</td>\n <td>0.0154</td>\n </tr>\n <tr>\n <th>Number of Dependents</th>\n <td>109672</td>\n <td>0.0914</td>\n </tr>\n <tr>\n <th>Education Level</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Occupation</th>\n <td>358075</td>\n <td>0.2984</td>\n </tr>\n <tr>\n <th>Health Score</th>\n <td>74076</td>\n <td>0.0617</td>\n </tr>\n <tr>\n <th>Location</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Policy Type</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Previous Claims</th>\n <td>364029</td>\n <td>0.3034</td>\n </tr>\n <tr>\n <th>Vehicle Age</th>\n <td>6</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Credit Score</th>\n <td>137882</td>\n <td>0.1149</td>\n </tr>\n <tr>\n <th>Insurance Duration</th>\n <td>1</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Policy Start Date</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Customer Feedback</th>\n <td>77824</td>\n <td>0.0649</td>\n </tr>\n <tr>\n <th>Smoking Status</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Exercise Frequency</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Property Type</th>\n <td>0</td>\n <td>0.0000</td>\n 
</tr>\n <tr>\n <th>Premium Amount</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}],"execution_count":6},{"cell_type":"code","source":"train_df.columns","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.739322Z","iopub.execute_input":"2024-12-08T17:48:46.739623Z","iopub.status.idle":"2024-12-08T17:48:46.745549Z","shell.execute_reply.started":"2024-12-08T17:48:46.739596Z","shell.execute_reply":"2024-12-08T17:48:46.744540Z"}},"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',\n 'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',\n 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',\n 'Credit Score', 'Insurance Duration', 'Policy Start Date',\n 'Customer Feedback', 'Smoking Status', 'Exercise Frequency',\n 'Property Type', 'Premium Amount'],\n dtype='object')"},"metadata":{}}],"execution_count":7},{"cell_type":"code","source":"print(train_df[\"Number of Dependents\"].describe().astype(int))\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.746919Z","iopub.execute_input":"2024-12-08T17:48:46.747738Z","iopub.status.idle":"2024-12-08T17:48:46.816989Z","shell.execute_reply.started":"2024-12-08T17:48:46.747694Z","shell.execute_reply":"2024-12-08T17:48:46.815981Z"}},"outputs":[{"name":"stdout","text":"count 1090328\nmean 2\nstd 1\nmin 0\n25% 1\n50% 2\n75% 3\nmax 4\nName: Number of Dependents, dtype: 
int64\n","output_type":"stream"}],"execution_count":8},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df.shape","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.817979Z","iopub.execute_input":"2024-12-08T17:48:46.818278Z","iopub.status.idle":"2024-12-08T17:48:46.824171Z","shell.execute_reply.started":"2024-12-08T17:48:46.818249Z","shell.execute_reply":"2024-12-08T17:48:46.823272Z"}},"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(1200000, 21)"},"metadata":{}}],"execution_count":9},{"cell_type":"code","source":"categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns\ncategorical_cols","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.825313Z","iopub.execute_input":"2024-12-08T17:48:46.825652Z","iopub.status.idle":"2024-12-08T17:48:46.982087Z","shell.execute_reply.started":"2024-12-08T17:48:46.825624Z","shell.execute_reply":"2024-12-08T17:48:46.981140Z"}},"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',\n 'Policy Type', 'Policy Start Date', 'Customer Feedback',\n 'Smoking Status', 'Exercise Frequency', 'Property Type'],\n dtype='object')"},"metadata":{}}],"execution_count":10},{"cell_type":"code","source":"categorical_null_data = pd.DataFrame()\ncategorical_null_data['null_count'] = train_df[categorical_cols].isnull().sum()\ncategorical_null_data['null_ratio'] = round(categorical_null_data['null_count'] / len(train_df), 
4)\ncategorical_null_data","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:46.983148Z","iopub.execute_input":"2024-12-08T17:48:46.983486Z","iopub.status.idle":"2024-12-08T17:48:47.710754Z","shell.execute_reply.started":"2024-12-08T17:48:46.983460Z","shell.execute_reply":"2024-12-08T17:48:47.709885Z"}},"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":" null_count null_ratio\nGender 0 0.0000\nMarital Status 18529 0.0154\nEducation Level 0 0.0000\nOccupation 358075 0.2984\nLocation 0 0.0000\nPolicy Type 0 0.0000\nPolicy Start Date 0 0.0000\nCustomer Feedback 77824 0.0649\nSmoking Status 0 0.0000\nExercise Frequency 0 0.0000\nProperty Type 0 0.0000","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>null_count</th>\n <th>null_ratio</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>Gender</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Marital Status</th>\n <td>18529</td>\n <td>0.0154</td>\n </tr>\n <tr>\n <th>Education Level</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Occupation</th>\n <td>358075</td>\n <td>0.2984</td>\n </tr>\n <tr>\n <th>Location</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Policy Type</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Policy Start Date</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Customer Feedback</th>\n <td>77824</td>\n <td>0.0649</td>\n </tr>\n <tr>\n <th>Smoking Status</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Exercise Frequency</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n <tr>\n <th>Property Type</th>\n <td>0</td>\n <td>0.0000</td>\n </tr>\n 
</tbody>\n</table>\n</div>"},"metadata":{}}],"execution_count":11},{"cell_type":"code","source":"# for c in categorical_cols:\n# print(f'Category: {c}, Count: {len(train_df[c].unique())} Unique values: {train_df[c].unique()}')\n\n# for c in categorical_cols:\n# print(f'Category: {c}')\n\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:47.714360Z","iopub.execute_input":"2024-12-08T17:48:47.714644Z","iopub.status.idle":"2024-12-08T17:48:47.718805Z","shell.execute_reply.started":"2024-12-08T17:48:47.714601Z","shell.execute_reply":"2024-12-08T17:48:47.717672Z"}},"outputs":[],"execution_count":12},{"cell_type":"code","source":"# Category: Gender ////\n# Category: Marital Status ////\n# Category: Education Level /\n# Category: Occupation ////\n# Category: Location //\n# Category: Policy Type ///\n# Category: Policy Start Date ///\n# Category: Customer Feedback /\n# Category: Smoking Status /\n# Category: Exercise Frequency /\n# Category: Property Type //\n\n# import pandas as pd\n\n# df = train_df.copy()\n\n# binary_mappings = {\n# 'Smoking Status': {'No': 0, 'Yes': 1}\n# }\n# for col, mapping in binary_mappings.items():\n# if col in df.columns:\n# df[col] = df[col].map(mapping)\n\n# onehot_encode_cols = ['Gender', 'Location', 'Policy Type', 'Property Type']\n# df = pd.get_dummies(df, columns=onehot_encode_cols, drop_first=False)\n\n# ordinal_mappings = {\n# 'Exercise Frequency': {'Rarely': 0, 'Monthly': 1, 'Weekly': 2, 'Daily': 3},\n# 'Customer Feedback': {'Poor': 0, 'Average': 1, 'Good': 2},\n# 'Education Level': {'High School': 0, \"Bachelor's\": 1, \"Master's\": 2, 'PhD': 3},\n# 'Policy Type': {'Premium': 2, 'Comprehensive': 1, 'Basic': 0}\n# }\n# for col, mapping in ordinal_mappings.items():\n# if col in df.columns:\n# df[col] = df[col].map(mapping)\n\n# if 'Policy Start Date' in df.columns:\n# df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date']).astype(int) / 10**9 \n\n# categorical_cols = ['Customer Feedback', 'Marital 
Status', 'Occupation']\n# for col in categorical_cols:\n# df[col] = df[col].fillna('Unknown')\n\n# label_encode_cols = ['Customer Feedback', 'Marital Status', 'Occupation']\n# for col in label_encode_cols:\n# df[col] = df[col].astype('category').cat.codes\n\n# print(df.head())\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:48:47.719948Z","iopub.execute_input":"2024-12-08T17:48:47.720190Z","iopub.status.idle":"2024-12-08T17:48:47.730914Z","shell.execute_reply.started":"2024-12-08T17:48:47.720168Z","shell.execute_reply":"2024-12-08T17:48:47.730265Z"}},"outputs":[],"execution_count":13},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns\ncategorical_cols","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:28.802194Z","iopub.execute_input":"2024-12-08T17:52:28.802627Z","iopub.status.idle":"2024-12-08T17:52:28.962929Z","shell.execute_reply.started":"2024-12-08T17:52:28.802594Z","shell.execute_reply":"2024-12-08T17:52:28.961797Z"}},"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',\n 'Policy Type', 'Policy Start Date', 'Customer Feedback',\n 'Smoking Status', 'Exercise Frequency', 'Property Type'],\n dtype='object')"},"metadata":{}}],"execution_count":15},{"cell_type":"code","source":"# Keep only the numeric columns. This cell overwrites train_df, so errors='ignore'\n# makes it safe to re-run after the categorical columns have already been dropped.\ntrain_df = train_df.drop(columns=categorical_cols, errors='ignore')\ntrain_df","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:28.964602Z","iopub.execute_input":"2024-12-08T17:52:28.964944Z","iopub.status.idle":"2024-12-08T17:52:29.022026Z","shell.execute_reply.started":"2024-12-08T17:52:28.964893Z","shell.execute_reply":"2024-12-08T17:52:29.021056Z"}},"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":" id Age Annual 
Income Number of Dependents Health Score \\\n0 0 19.0 10049.0 1.0 22.598761 \n1 1 39.0 31678.0 3.0 15.569731 \n2 2 23.0 25602.0 3.0 47.177549 \n3 3 21.0 141855.0 2.0 10.938144 \n4 4 21.0 39651.0 1.0 20.376094 \n... ... ... ... ... ... \n1199995 1199995 36.0 27316.0 0.0 13.772907 \n1199996 1199996 54.0 35786.0 NaN 11.483482 \n1199997 1199997 19.0 51884.0 0.0 14.724469 \n1199998 1199998 55.0 NaN 1.0 18.547381 \n1199999 1199999 21.0 NaN 0.0 10.125323 \n\n Previous Claims Vehicle Age Credit Score Insurance Duration \\\n0 2.0 17.0 372.0 5.0 \n1 1.0 12.0 694.0 2.0 \n2 1.0 14.0 NaN 3.0 \n3 1.0 0.0 367.0 1.0 \n4 0.0 8.0 598.0 4.0 \n... ... ... ... ... \n1199995 NaN 5.0 372.0 3.0 \n1199996 NaN 10.0 597.0 4.0 \n1199997 0.0 19.0 NaN 6.0 \n1199998 1.0 7.0 407.0 4.0 \n1199999 0.0 18.0 502.0 6.0 \n\n Premium Amount \n0 2869.0 \n1 1483.0 \n2 567.0 \n3 765.0 \n4 2022.0 \n... ... \n1199995 1303.0 \n1199996 821.0 \n1199997 371.0 \n1199998 596.0 \n1199999 2480.0 \n\n[1200000 rows x 10 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>Age</th>\n <th>Annual Income</th>\n <th>Number of Dependents</th>\n <th>Health Score</th>\n <th>Previous Claims</th>\n <th>Vehicle Age</th>\n <th>Credit Score</th>\n <th>Insurance Duration</th>\n <th>Premium Amount</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>19.0</td>\n <td>10049.0</td>\n <td>1.0</td>\n <td>22.598761</td>\n <td>2.0</td>\n <td>17.0</td>\n <td>372.0</td>\n <td>5.0</td>\n <td>2869.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>39.0</td>\n <td>31678.0</td>\n <td>3.0</td>\n <td>15.569731</td>\n <td>1.0</td>\n <td>12.0</td>\n <td>694.0</td>\n <td>2.0</td>\n <td>1483.0</td>\n </tr>\n <tr>\n <th>2</th>\n 
<td>2</td>\n <td>23.0</td>\n <td>25602.0</td>\n <td>3.0</td>\n <td>47.177549</td>\n <td>1.0</td>\n <td>14.0</td>\n <td>NaN</td>\n <td>3.0</td>\n <td>567.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>21.0</td>\n <td>141855.0</td>\n <td>2.0</td>\n <td>10.938144</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>367.0</td>\n <td>1.0</td>\n <td>765.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>21.0</td>\n <td>39651.0</td>\n <td>1.0</td>\n <td>20.376094</td>\n <td>0.0</td>\n <td>8.0</td>\n <td>598.0</td>\n <td>4.0</td>\n <td>2022.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1199995</th>\n <td>1199995</td>\n <td>36.0</td>\n <td>27316.0</td>\n <td>0.0</td>\n <td>13.772907</td>\n <td>NaN</td>\n <td>5.0</td>\n <td>372.0</td>\n <td>3.0</td>\n <td>1303.0</td>\n </tr>\n <tr>\n <th>1199996</th>\n <td>1199996</td>\n <td>54.0</td>\n <td>35786.0</td>\n <td>NaN</td>\n <td>11.483482</td>\n <td>NaN</td>\n <td>10.0</td>\n <td>597.0</td>\n <td>4.0</td>\n <td>821.0</td>\n </tr>\n <tr>\n <th>1199997</th>\n <td>1199997</td>\n <td>19.0</td>\n <td>51884.0</td>\n <td>0.0</td>\n <td>14.724469</td>\n <td>0.0</td>\n <td>19.0</td>\n <td>NaN</td>\n <td>6.0</td>\n <td>371.0</td>\n </tr>\n <tr>\n <th>1199998</th>\n <td>1199998</td>\n <td>55.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>18.547381</td>\n <td>1.0</td>\n <td>7.0</td>\n <td>407.0</td>\n <td>4.0</td>\n <td>596.0</td>\n </tr>\n <tr>\n <th>1199999</th>\n <td>1199999</td>\n <td>21.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>10.125323</td>\n <td>0.0</td>\n <td>18.0</td>\n <td>502.0</td>\n <td>6.0</td>\n <td>2480.0</td>\n </tr>\n </tbody>\n</table>\n<p>1200000 rows × 10 columns</p>\n</div>"},"metadata":{}}],"execution_count":16},{"cell_type":"code","source":"import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\n# fig, axes = 
plt.subplots(len(categorical_cols) + 1, 1, figsize=(10, 5 * (len(categorical_cols) + 1)))\n# fig.tight_layout(pad=6.0)\n\n# for i, col in enumerate(categorical_cols):\n# sns.boxplot(x=train_df[col], y=train_df['Premium Amount'], ax=axes[i])\n# axes[i].set_title(f'Boxplot of {col} vs Premium Amount')\n# axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)\n\n# corr_matrix = train_df.select_dtypes(include=['number']).corr()\n# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=axes[len(categorical_cols)])\n# axes[len(categorical_cols)].set_title('Correlation Matrix Heatmap')\n\n# plt.show()\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:29.023474Z","iopub.execute_input":"2024-12-08T17:52:29.023924Z","iopub.status.idle":"2024-12-08T17:52:29.741584Z","shell.execute_reply.started":"2024-12-08T17:52:29.023882Z","shell.execute_reply":"2024-12-08T17:52:29.740653Z"}},"outputs":[],"execution_count":17},{"cell_type":"code","source":"# train_df = train_df.drop(columns=categorical_cols)\n# train_df","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:29.743353Z","iopub.execute_input":"2024-12-08T17:52:29.743694Z","iopub.status.idle":"2024-12-08T17:52:29.747412Z","shell.execute_reply.started":"2024-12-08T17:52:29.743668Z","shell.execute_reply":"2024-12-08T17:52:29.746561Z"}},"outputs":[],"execution_count":18},{"cell_type":"code","source":"# Feature engineering: build df from train_df with mean-imputed numeric columns plus derived features.\nimport pandas as pd\nimport numpy as np\nfrom scipy.stats import skew, kurtosis\n\ndf = train_df.copy()\n# numeric_only=True keeps the imputation from raising TypeError if a non-numeric\n# column is still present; on an all-numeric frame the result is unchanged.\ndf = df.fillna(df.mean(numeric_only=True))\n\ndf['Mean_Income_Duration'] = (df['Annual Income'] + df['Insurance Duration']) / 2\n# NOTE: skew()/kurtosis() reduce a whole column to one scalar, so each *_Skewness /\n# *_Kurtosis column below holds the same value on every row. nan_policy='omit'\n# has nothing to omit here because NaNs were already mean-filled above.\ndf['Income_Skewness'] = skew(df['Annual Income'], nan_policy='omit')\ndf['Health_Skewness'] = skew(df['Health Score'], nan_policy='omit')\ndf['Claims_Skewness'] = skew(df['Previous Claims'], nan_policy='omit')\n\ndf['Income_Kurtosis'] = kurtosis(df['Annual Income'], nan_policy='omit')\ndf['Health_Kurtosis'] = kurtosis(df['Health Score'], 
nan_policy='omit')\ndf['Claims_Kurtosis'] = kurtosis(df['Previous Claims'], nan_policy='omit')\n\n# The +1 in the denominator avoids dividing by zero when there are no dependents.\ndf['Income_Per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1) \ndf['Claims_Per_Year'] = df['Previous Claims'] / df['Insurance Duration']\n\ndf['Age_to_VehicleAge_Diff'] = df['Age'] - df['Vehicle Age']\ndf['Health_to_Credit_Score_Ratio'] = df['Health Score'] / (df['Credit Score'] + 1)\n\ndf['Total_Assets'] = df['Annual Income'] + df['Credit Score']\ndf['Overall_Risk'] = (df['Vehicle Age'] + df['Previous Claims'] + df['Insurance Duration']) / df['Health Score']\n\ndf['Log_Income'] = np.log1p(df['Annual Income'])\ndf['Log_Credit'] = np.log1p(df['Credit Score'])\n\n\n# Quartile bins (labels 1-4) computed on the imputed values.\ndf['Income_Quartile'] = pd.qcut(df['Annual Income'], 4, labels=[1, 2, 3, 4]).astype(int)\ndf['Credit_Quartile'] = pd.qcut(df['Credit Score'], 4, labels=[1, 2, 3, 4]).astype(int)\n\ndf['High_Income'] = (df['Annual Income'] > df['Annual Income'].median()).astype(int)\ndf['Old_Vehicle'] = (df['Vehicle Age'] > 10).astype(int)\n\n# Display the enriched dataset\ndf.head()\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:29.748330Z","iopub.execute_input":"2024-12-08T17:52:29.748570Z","iopub.status.idle":"2024-12-08T17:52:30.332481Z","shell.execute_reply.started":"2024-12-08T17:52:29.748546Z","shell.execute_reply":"2024-12-08T17:52:30.331638Z"}},"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":" id Age Annual Income Number of Dependents Health Score \\\n0 0 19.0 10049.0 1.0 22.598761 \n1 1 39.0 31678.0 3.0 15.569731 \n2 2 23.0 25602.0 3.0 47.177549 \n3 3 21.0 141855.0 2.0 10.938144 \n4 4 21.0 39651.0 1.0 20.376094 \n\n Previous Claims Vehicle Age Credit Score Insurance Duration \\\n0 2.0 17.0 372.00000 5.0 \n1 1.0 12.0 694.00000 2.0 \n2 1.0 14.0 592.92435 3.0 \n3 1.0 0.0 367.00000 1.0 \n4 0.0 8.0 598.00000 4.0 \n\n Premium Amount ... Age_to_VehicleAge_Diff Health_to_Credit_Score_Ratio \\\n0 2869.0 ... 2.0 0.060586 \n1 1483.0 ... 
27.0 0.022402 \n2 567.0 ... 9.0 0.079434 \n3 765.0 ... 21.0 0.029723 \n4 2022.0 ... 13.0 0.034017 \n\n Total_Assets Overall_Risk Log_Income Log_Credit Income_Quartile \\\n0 10421.00000 1.062005 9.215328 5.921578 2 \n1 32372.00000 0.963408 10.363409 6.543912 3 \n2 26194.92435 0.381537 10.150465 6.386752 3 \n3 142222.00000 0.182846 11.862568 5.908083 4 \n4 40249.00000 0.588925 10.587897 6.395262 3 \n\n Credit_Quartile High_Income Old_Vehicle \n0 1 0 1 \n1 3 1 1 \n2 2 1 1 \n3 1 1 0 \n4 3 1 0 \n\n[5 rows x 29 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>Age</th>\n <th>Annual Income</th>\n <th>Number of Dependents</th>\n <th>Health Score</th>\n <th>Previous Claims</th>\n <th>Vehicle Age</th>\n <th>Credit Score</th>\n <th>Insurance Duration</th>\n <th>Premium Amount</th>\n <th>...</th>\n <th>Age_to_VehicleAge_Diff</th>\n <th>Health_to_Credit_Score_Ratio</th>\n <th>Total_Assets</th>\n <th>Overall_Risk</th>\n <th>Log_Income</th>\n <th>Log_Credit</th>\n <th>Income_Quartile</th>\n <th>Credit_Quartile</th>\n <th>High_Income</th>\n <th>Old_Vehicle</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>19.0</td>\n <td>10049.0</td>\n <td>1.0</td>\n <td>22.598761</td>\n <td>2.0</td>\n <td>17.0</td>\n <td>372.00000</td>\n <td>5.0</td>\n <td>2869.0</td>\n <td>...</td>\n <td>2.0</td>\n <td>0.060586</td>\n <td>10421.00000</td>\n <td>1.062005</td>\n <td>9.215328</td>\n <td>5.921578</td>\n <td>2</td>\n <td>1</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>39.0</td>\n <td>31678.0</td>\n <td>3.0</td>\n <td>15.569731</td>\n <td>1.0</td>\n <td>12.0</td>\n <td>694.00000</td>\n <td>2.0</td>\n <td>1483.0</td>\n <td>...</td>\n <td>27.0</td>\n 
<td>0.022402</td>\n <td>32372.00000</td>\n <td>0.963408</td>\n <td>10.363409</td>\n <td>6.543912</td>\n <td>3</td>\n <td>3</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>23.0</td>\n <td>25602.0</td>\n <td>3.0</td>\n <td>47.177549</td>\n <td>1.0</td>\n <td>14.0</td>\n <td>592.92435</td>\n <td>3.0</td>\n <td>567.0</td>\n <td>...</td>\n <td>9.0</td>\n <td>0.079434</td>\n <td>26194.92435</td>\n <td>0.381537</td>\n <td>10.150465</td>\n <td>6.386752</td>\n <td>3</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>21.0</td>\n <td>141855.0</td>\n <td>2.0</td>\n <td>10.938144</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>367.00000</td>\n <td>1.0</td>\n <td>765.0</td>\n <td>...</td>\n <td>21.0</td>\n <td>0.029723</td>\n <td>142222.00000</td>\n <td>0.182846</td>\n <td>11.862568</td>\n <td>5.908083</td>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>21.0</td>\n <td>39651.0</td>\n <td>1.0</td>\n <td>20.376094</td>\n <td>0.0</td>\n <td>8.0</td>\n <td>598.00000</td>\n <td>4.0</td>\n <td>2022.0</td>\n <td>...</td>\n <td>13.0</td>\n <td>0.034017</td>\n <td>40249.00000</td>\n <td>0.588925</td>\n <td>10.587897</td>\n <td>6.395262</td>\n <td>3</td>\n <td>3</td>\n <td>1</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 29 columns</p>\n</div>"},"metadata":{}}],"execution_count":19},{"cell_type":"code","source":"df.shape","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:30.334013Z","iopub.execute_input":"2024-12-08T17:52:30.334402Z","iopub.status.idle":"2024-12-08T17:52:30.340309Z","shell.execute_reply.started":"2024-12-08T17:52:30.334362Z","shell.execute_reply":"2024-12-08T17:52:30.339311Z"}},"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"(1200000, 29)"},"metadata":{}}],"execution_count":20},{"cell_type":"code","source":"null_data = pd.DataFrame()\nnull_data['null_count'] = 
# Missing-value report for the engineered training frame: one row per column,
# holding the absolute null count and its share of all rows (4 decimals).
null_counts = df.isnull().sum()
null_data = pd.DataFrame({
    'null_count': null_counts,
    'null_ratio': (null_counts / len(df)).round(4),
})
null_data
<td>0.0</td>\n </tr>\n <tr>\n <th>Credit Score</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Insurance Duration</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Premium Amount</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Mean_Income_Duration</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Income_Skewness</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Health_Skewness</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Claims_Skewness</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Income_Kurtosis</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Health_Kurtosis</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Claims_Kurtosis</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Income_Per_Dependent</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Claims_Per_Year</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Age_to_VehicleAge_Diff</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Health_to_Credit_Score_Ratio</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Total_Assets</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Overall_Risk</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Log_Income</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Log_Credit</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Income_Quartile</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Credit_Quartile</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>High_Income</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>Old_Vehicle</th>\n <td>0</td>\n <td>0.0</td>\n </tr>\n 
</tbody>\n</table>\n</div>"},"metadata":{}}],"execution_count":21},{"cell_type":"code","source":"df.info()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-08T17:52:30.419596Z","iopub.execute_input":"2024-12-08T17:52:30.419878Z","iopub.status.idle":"2024-12-08T17:52:30.508497Z","shell.execute_reply.started":"2024-12-08T17:52:30.419853Z","shell.execute_reply":"2024-12-08T17:52:30.507652Z"}},"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 1200000 entries, 0 to 1199999\nData columns (total 29 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 id 1200000 non-null int64 \n 1 Age 1200000 non-null float64\n 2 Annual Income 1200000 non-null float64\n 3 Number of Dependents 1200000 non-null float64\n 4 Health Score 1200000 non-null float64\n 5 Previous Claims 1200000 non-null float64\n 6 Vehicle Age 1200000 non-null float64\n 7 Credit Score 1200000 non-null float64\n 8 Insurance Duration 1200000 non-null float64\n 9 Premium Amount 1200000 non-null float64\n 10 Mean_Income_Duration 1200000 non-null float64\n 11 Income_Skewness 1200000 non-null float64\n 12 Health_Skewness 1200000 non-null float64\n 13 Claims_Skewness 1200000 non-null float64\n 14 Income_Kurtosis 1200000 non-null float64\n 15 Health_Kurtosis 1200000 non-null float64\n 16 Claims_Kurtosis 1200000 non-null float64\n 17 Income_Per_Dependent 1200000 non-null float64\n 18 Claims_Per_Year 1200000 non-null float64\n 19 Age_to_VehicleAge_Diff 1200000 non-null float64\n 20 Health_to_Credit_Score_Ratio 1200000 non-null float64\n 21 Total_Assets 1200000 non-null float64\n 22 Overall_Risk 1200000 non-null float64\n 23 Log_Income 1200000 non-null float64\n 24 Log_Credit 1200000 non-null float64\n 25 Income_Quartile 1200000 non-null int64 \n 26 Credit_Quartile 1200000 non-null int64 \n 27 High_Income 1200000 non-null int64 \n 28 Old_Vehicle 1200000 non-null int64 \ndtypes: float64(24), int64(5)\nmemory usage: 265.5 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow together so weight initialisation, dropout
# masks and batch shuffling repeat on "Restart & Run All".
SEED = 42
set_random_seed(SEED)

# Separate features and target.
# NOTE(review): X still contains the `id` column, so the row identifier is fed
# to the network as a feature. Presumably unintended, but the scaler and the
# test-set pipeline are built around the same column set -- confirm before
# dropping it.
X = df.drop(columns=['Premium Amount'])
y = df['Premium Amount']

# Hold out 10% for evaluation. The *unscaled* splits keep their own names so
# that raw rows remain available for predict_premium(), which scales its input.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SEED
)

# Standardize features (important for NN models); statistics come from the
# training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train on log1p(target) to stabilize variance. y_test is left on the original
# scale because it is only compared against back-transformed predictions.
y_train = np.log1p(y_train)

# Build the Neural Network model. An explicit Input layer replaces the
# `input_shape=` argument on the first Dense layer, which Keras warns about.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.05),
    Dense(16, activation='relu'),
    Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=25, batch_size=32, verbose=1)

# Predict on the hold-out split and reverse the log transformation.
y_pred = np.expm1(model.predict(X_test))
# expm1 of a negative network output is a (slightly) negative premium, which
# mean_squared_log_error rejects with a ValueError; floor predictions at zero.
y_pred = np.clip(y_pred, 0, None)

# Calculate RMSLE
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))

print(f"Root Mean Squared Logarithmic Error (RMSLE): {rmsle}")


def predict_premium(X_input):
    """Predict premium amounts for raw (unscaled) feature rows.

    Parameters
    ----------
    X_input : array-like of shape (n_samples, n_features)
        Features laid out like the columns of ``X``. They must NOT already be
        standardised: this function applies ``scaler.transform`` itself.

    Returns
    -------
    numpy.ndarray
        ``expm1`` of the model output, i.e. predictions on the original
        premium scale, in the ``(n_samples, 1)`` shape returned by
        ``model.predict``.
    """
    X_input_scaled = scaler.transform(X_input)
    predictions = model.predict(X_input_scaled)
    return np.expm1(predictions)  # Reverse the log transformation


# Example usage of predict. The rows are taken from the *unscaled* hold-out
# split: passing X_test here would standardise the data a second time inside
# predict_premium(). Kept as a NumPy array so positional indexing such as
# example_data[0] keeps working.
example_data = X_test_raw.iloc[:5].to_numpy()
predictions = predict_premium(example_data)
print("Predicted Premium Amounts:", predictions)
def convert_categorical_to_numeric(df, category_levels=None):
    """Return a copy of ``df`` with its categorical columns encoded as numbers.

    Every step is skipped when its column is absent, and the caller's frame is
    never modified, so the cell can be re-run safely.

    Encodings applied:

    * ``Smoking Status``: ``No``/``Yes`` -> 0/1.
    * ``Exercise Frequency``, ``Education Level``: fixed ordinal codes; values
      outside the mapping (including missing ones) become NaN.
    * ``Customer Feedback``: ``Poor``/``Average``/``Good`` -> 0/1/2 and every
      other value (including missing) -> 3, stored as int8. The codes are
      pinned; they no longer shift when a level is absent from the frame.
    * ``Gender``, ``Location``, ``Policy Type``, ``Property Type``: one-hot
      encoded with ``pd.get_dummies`` (all levels kept).
    * ``Policy Start Date``: parsed with ``pd.to_datetime`` and converted to
      float seconds since 1970-01-01 (unparseable/missing -> NaN).
    * ``Marital Status``, ``Occupation``: missing -> ``'Unknown'``, then
      integer category codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding some or all of the columns listed above.
    category_levels : dict[str, list] or None, optional
        Optional ``{column: ordered levels}`` for ``Marital Status`` /
        ``Occupation``. When given, codes index into that list (values not in
        the list get -1), which keeps train and test encodings identical.
        When omitted, codes follow the sorted levels present in ``df``.

    Returns
    -------
    pandas.DataFrame
        New frame with the encodings applied.
    """
    # Work on a copy: the original mapped 'Smoking Status' in place on the
    # caller's frame, so a second call turned the whole column into NaN.
    df = df.copy()

    binary_mappings = {
        'Smoking Status': {'No': 0, 'Yes': 1}
    }
    # 'Policy Type' is intentionally absent: it is one-hot encoded below, so
    # an ordinal mapping for it could never take effect.
    ordinal_mappings = {
        'Exercise Frequency': {'Rarely': 0, 'Monthly': 1, 'Weekly': 2, 'Daily': 3},
        'Customer Feedback': {'Poor': 0, 'Average': 1, 'Good': 2},
        'Education Level': {'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3},
    }
    for col, mapping in {**binary_mappings, **ordinal_mappings}.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)

    if 'Customer Feedback' in df.columns:
        # One dedicated code for "no usable feedback" instead of re-running a
        # label encoder over a mix of floats and the string 'Unknown'.
        unknown_code = len(ordinal_mappings['Customer Feedback'])
        df['Customer Feedback'] = df['Customer Feedback'].fillna(unknown_code).astype('int8')

    onehot_encode_cols = [
        col for col in ['Gender', 'Location', 'Policy Type', 'Property Type']
        if col in df.columns
    ]
    if onehot_encode_cols:
        df = pd.get_dummies(df, columns=onehot_encode_cols, drop_first=False)

    if 'Policy Start Date' in df.columns:
        start = pd.to_datetime(df['Policy Start Date'])
        if start.dt.tz is not None:
            # Express tz-aware values in UTC and drop the zone so they can be
            # compared with the (naive) epoch below.
            start = start.dt.tz_convert(None)
        # Timedelta division gives epoch seconds regardless of the platform's
        # default integer width or the datetime resolution pandas inferred.
        df['Policy Start Date'] = (start - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

    category_levels = category_levels or {}
    for col in ['Marital Status', 'Occupation']:
        if col not in df.columns:
            continue
        filled = df[col].fillna('Unknown')
        levels = category_levels.get(col)
        if levels is None:
            df[col] = filled.astype('category').cat.codes
        else:
            df[col] = pd.Categorical(filled, categories=list(levels)).codes

    return df
def add_compound_data(df, scaler=None, fit_scaler=False, drop_cols=None):
    """Drop categorical columns, impute missing numbers and add derived features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``Age``, ``Annual Income``,
        ``Number of Dependents``, ``Health Score``, ``Previous Claims``,
        ``Vehicle Age``, ``Credit Score`` and ``Insurance Duration``.
        It is not modified.
    scaler : object or None, optional
        Object with ``fit``/``transform`` (e.g. a fitted ``StandardScaler``).
        When given, every float64/int64 column of the result is replaced by
        its transformed value.
    fit_scaler : bool, optional
        Fit ``scaler`` on this frame before transforming. Leave ``False`` to
        reuse statistics learned elsewhere.
    drop_cols : list[str] or None, optional
        Columns to remove before any computation (names not present are
        ignored). ``None`` falls back to the notebook-level
        ``categorical_cols`` list, which must then already be defined.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered features appended.

    Notes
    -----
    * When ``scaler`` is passed the returned frame is ALREADY scaled. Feed it
      straight to ``model.predict``; ``predict_premium`` would apply
      ``scaler.transform`` a second time.
    * The ``*_Skewness`` / ``*_Kurtosis`` columns hold one scalar repeated on
      every row, and the quartile / median features are computed from the
      frame passed in, so their values differ between train and test frames.
      NOTE(review): confirm this is intended.
    * ``Claims_Per_Year`` and ``Overall_Risk`` divide by
      ``Insurance Duration`` and ``Health Score``; a zero there yields
      inf/NaN, which this function does not clean up.
    """
    if drop_cols is None:
        # Historical behaviour: rely on the list defined in an earlier cell.
        drop_cols = categorical_cols
    df = df.drop(columns=drop_cols, errors='ignore')

    # numeric_only=True: on pandas >= 2, DataFrame.mean() raises a TypeError
    # as soon as one non-numeric column (e.g. a date string) survives the drop.
    df = df.fillna(df.mean(numeric_only=True))

    # Add new features
    df['Mean_Income_Duration'] = (df['Annual Income'] + df['Insurance Duration']) / 2

    # Distribution-shape statistics of the whole column, broadcast to every row.
    df['Income_Skewness'] = skew(df['Annual Income'], nan_policy='omit')
    df['Health_Skewness'] = skew(df['Health Score'], nan_policy='omit')
    df['Claims_Skewness'] = skew(df['Previous Claims'], nan_policy='omit')

    df['Income_Kurtosis'] = kurtosis(df['Annual Income'], nan_policy='omit')
    df['Health_Kurtosis'] = kurtosis(df['Health Score'], nan_policy='omit')
    df['Claims_Kurtosis'] = kurtosis(df['Previous Claims'], nan_policy='omit')

    # Ratios and differences between raw columns. The +1 keeps the
    # denominators non-zero for 0 dependents / 0 credit score.
    df['Income_Per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)
    df['Claims_Per_Year'] = df['Previous Claims'] / df['Insurance Duration']
    df['Age_to_VehicleAge_Diff'] = df['Age'] - df['Vehicle Age']
    df['Health_to_Credit_Score_Ratio'] = df['Health Score'] / (df['Credit Score'] + 1)

    df['Total_Assets'] = df['Annual Income'] + df['Credit Score']
    df['Overall_Risk'] = (df['Vehicle Age'] + df['Previous Claims'] + df['Insurance Duration']) / df['Health Score']

    # log1p rather than log so that a value of 0 maps to 0 instead of -inf.
    df['Log_Income'] = np.log1p(df['Annual Income'])
    df['Log_Credit'] = np.log1p(df['Credit Score'])

    # Quartile membership (1 = lowest quarter, 4 = highest).
    df['Income_Quartile'] = pd.qcut(df['Annual Income'], 4, labels=[1, 2, 3, 4]).astype(int)
    df['Credit_Quartile'] = pd.qcut(df['Credit Score'], 4, labels=[1, 2, 3, 4]).astype(int)

    # Binary flags.
    df['High_Income'] = (df['Annual Income'] > df['Annual Income'].median()).astype(int)
    df['Old_Vehicle'] = (df['Vehicle Age'] > 10).astype(int)

    if scaler is not None:
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
        if fit_scaler:
            scaler.fit(df[numeric_cols])
        df[numeric_cols] = scaler.transform(df[numeric_cols])

    return df
# --- Check the engineered test features against the training layout ---------
missing_columns = set(X.columns) - set(transformed_test_df.columns)
extra_columns = set(transformed_test_df.columns) - set(X.columns)
print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)
print("Shape of training data:", X.shape)
print("Shape of test data:", transformed_test_df.shape)
assert not missing_columns, f"test features lack training columns: {sorted(missing_columns)}"

# Feed the network exactly the training columns, in training order.
test_features = transformed_test_df[X.columns]
assert not test_features.isnull().any().any(), "NaN values left in test features"

# Compact range check instead of printing min/max/mean of the full frame.
test_features.describe().T

# --- Predict -----------------------------------------------------------------
# transformed_test_df was already standardised inside
# add_compound_data(..., scaler=scaler). predict_premium() would call
# scaler.transform a second time, so the model is called directly here and the
# log1p target transform is undone by hand.
predictions = np.expm1(model.predict(test_features.to_numpy(dtype=float)))

# --- Build and save the submission --------------------------------------------
# Built as a fresh frame: assigning a new column onto the slice
# test_df[['id']] is chained assignment on a possible copy. ravel() flattens
# the (n, 1) model output into a plain column.
submission = pd.DataFrame({
    'id': test_df['id'].to_numpy(),
    'Premium Amount': predictions.ravel(),
})
assert len(submission) == len(test_df)
assert np.isfinite(submission['Premium Amount']).all(), "non-finite predictions"

submission.to_csv('submission.csv', index=False)
submission.head()