# Data handling
import pandas as pd
import numpy as np
# EDA
import seaborn as sns
# Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# Model training and evaluation
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Model explanation
import shap
import matplotlib.pyplot as plt
df = pd.read_csv("dataset.csv", encoding="utf-8")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   RowNumber        10000 non-null  int64
 1   CustomerId       10000 non-null  int64
 2   Surname          10000 non-null  object
 3   CreditScore      10000 non-null  int64
 4   Geography        10000 non-null  object
 5   Gender           10000 non-null  object
 6   Age              10000 non-null  int64
 7   Tenure           10000 non-null  int64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64
 10  HasCrCard        10000 non-null  int64
 11  IsActiveMember   10000 non-null  int64
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
# Dropping unnecessary columns
df.drop(columns=["RowNumber", "CustomerId", "Surname"], inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   CreditScore      10000 non-null  int64
 1   Geography        10000 non-null  object
 2   Gender           10000 non-null  object
 3   Age              10000 non-null  int64
 4   Tenure           10000 non-null  int64
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64
 7   HasCrCard        10000 non-null  int64
 8   IsActiveMember   10000 non-null  int64
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB
For a description of the variables see OpenML. There are no missing values in the data set.
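A quick check confirms this:
# Confirm that there are no missing values in any column
assert df.isna().sum().sum() == 0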
Let's look at the share of customers who exited the bank in the data set.
df['Exited'].mean()
0.2037
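About 20% of the customers churned. This also gives us a baseline for the modeling part: a trivial model that always predicts "no churn" is already about 80% accurate, so the classifier will have to beat that.
# Accuracy of always predicting the majority class ("no churn")
baseline_accuracy = 1 - df['Exited'].mean()
print(f"Baseline accuracy: {baseline_accuracy:.2f}")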
We will look at the effect of age in the model later on. For this purpose, we bin the numerical age variable into five-year groups.
df["Age"].describe()
count    10000.000000
mean        38.921800
std         10.487806
min         18.000000
25%         32.000000
50%         37.000000
75%         44.000000
max         92.000000
Name: Age, dtype: float64
# Create 5-year bins from age 15 up to the maximum age (intervals closed on the left)
bins = list(range(15, df['Age'].max() + 5, 5))
labels = [f"{b} - {b+4}" for b in bins[:-1]]
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
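To sanity-check the binning, we can count how many customers fall into each group (the oldest groups turn out to be sparsely populated):
# Number of customers per age group
df['AgeGroup'].value_counts().sort_index()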
Next, we calculate the percentage of customers who churned per age group. The variable "Exited" contains the information about churn. As it is coded as a dummy variable, we can use the mean for an age group as the proportion of customers who churned.
age_group_exited_ratio = df.groupby('AgeGroup')['Exited'].mean().reset_index()
age_group_exited_ratio['Exited'] = age_group_exited_ratio['Exited'] * 100
age_group_exited_ratio
| | AgeGroup | Exited (%) |
|---|---|---|
| 0 | 15 - 19 | 6.122449 |
| 1 | 20 - 24 | 9.068627 |
| 2 | 25 - 29 | 7.094595 |
| 3 | 30 - 34 | 8.145240 |
| 4 | 35 - 39 | 13.301560 |
| 5 | 40 - 44 | 23.670054 |
| 6 | 45 - 49 | 43.386243 |
| 7 | 50 - 54 | 56.920078 |
| 8 | 55 - 59 | 54.775281 |
| 9 | 60 - 64 | 42.622951 |
| 10 | 65 - 69 | 21.374046 |
| 11 | 70 - 74 | 14.432990 |
| 12 | 75 - 79 | 0.000000 |
| 13 | 80 - 84 | 9.090909 |
| 14 | 85 - 89 | 0.000000 |
| 15 | 90 - 94 | 0.000000 |
Churn is highest among middle-aged customers, peaking in the 45 - 59 groups; note that the oldest groups contain only a few customers, so their estimates are noisy. Let us visualize the relationship between churn and age group using the seaborn package.
sns.barplot(x='AgeGroup', y='Exited', data=age_group_exited_ratio)
plt.xlabel('Age Group')
plt.ylabel('Churn (%)')
plt.title('Churn (%) by Age Group')
plt.xticks(rotation=45)
plt.show()
Finally, let us look at the distribution of the estimated salary and credit score.
df["EstimatedSalary"].describe()
count     10000.000000
mean     100090.239881
std       57510.492818
min          11.580000
25%       51002.110000
50%      100193.915000
75%      149388.247500
max      199992.480000
Name: EstimatedSalary, dtype: float64
df["CreditScore"].describe()
count    10000.000000
mean       650.528800
std         96.653299
min        350.000000
25%        584.000000
50%        652.000000
75%        718.000000
max        850.000000
Name: CreditScore, dtype: float64
To prepare for the machine learning part, we split the data into predictor variables (X) and target variable / label (y).
X_raw = df.loc[:, df.columns != "Exited"]
y = df["Exited"]  # target as a Series
Next, we will use one hot encoding for the categorical variables.
# One hot encoding for all object columns
## Split X by data type
X_raw_numerical = X_raw.select_dtypes(exclude=['object'])
X_raw_categorical = X_raw.select_dtypes(include=['object'])
X_raw_categorical_ohe = pd.get_dummies(X_raw_categorical)
# Combine numerical columns and one hot encoded categorical columns
X = pd.concat([X_raw_numerical, X_raw_categorical_ohe], axis=1)
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   CreditScore        10000 non-null  int64
 1   Age                10000 non-null  int64
 2   Tenure             10000 non-null  int64
 3   Balance            10000 non-null  float64
 4   NumOfProducts      10000 non-null  int64
 5   HasCrCard          10000 non-null  int64
 6   IsActiveMember     10000 non-null  int64
 7   EstimatedSalary    10000 non-null  float64
 8   AgeGroup           10000 non-null  category
 9   Geography_France   10000 non-null  uint8
 10  Geography_Germany  10000 non-null  uint8
 11  Geography_Spain    10000 non-null  uint8
 12  Gender_Female      10000 non-null  uint8
 13  Gender_Male        10000 non-null  uint8
dtypes: category(1), float64(2), int64(6), uint8(5)
memory usage: 684.4 KB
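As an aside, the same result can be obtained in a single call: pd.get_dummies encodes only the columns you name and leaves the rest untouched. A minimal equivalent sketch (X_alt is an illustrative name):
# One-liner alternative: encode only the named columns
X_alt = pd.get_dummies(X_raw, columns=["Geography", "Gender"])
assert list(X_alt.columns) == list(X.columns)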
cols_not_for_training = ["AgeGroup"] # define unwanted columns
# Build the feature list from the encoded frame X (not df), so that the one hot
# encoded columns are included and the label "Exited" cannot sneak in
cols_for_training = [col for col in X.columns if col not in cols_not_for_training]
print(cols_for_training)
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_France', 'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male']
# Split into training and test data sets (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Converting to XGB data format
dtrain = xgb.DMatrix(X_train[cols_for_training], label=y_train)
dtest = xgb.DMatrix(X_test[cols_for_training], label=y_test)
# Setting parameters
params = {
'objective': 'binary:logistic',
'max_depth': 10,
'eta': 0.3,
'nthread': 4
}
num_round = 1000
xgb_classifier = xgb.train(params, dtrain, num_round)
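With an eta of 0.3, a max_depth of 10, and 1000 boosting rounds, the model can easily overfit the training data. A common remedy is early stopping on a held-out set. Here is a minimal sketch; for illustration it monitors the test set, which in a real project should be a separate validation split:
# Sketch: stop adding trees once the evaluation error stops improving
evals = [(dtrain, 'train'), (dtest, 'eval')]
xgb_early = xgb.train(params, dtrain, num_boost_round=1000,
                      evals=evals, early_stopping_rounds=50,
                      verbose_eval=False)
print(f"Best iteration: {xgb_early.best_iteration}")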
## Model evaluation
## Generate predictions for test data
preds = xgb_classifier.predict(dtest)
preds_binary = (preds > 0.5).astype(int)
## Calculate metrics
positive_ratio = y_test.mean()
print(f"Proportion of positive cases in the test set: {positive_ratio:.2f}")
accuracy = accuracy_score(y_test, preds_binary)
print(f"Accuracy: {accuracy:.2f}")
precision = precision_score(y_test, preds_binary)
recall = recall_score(y_test, preds_binary)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
Proportion of positive cases in the test set: 0.20
Accuracy: 0.83
Precision: 0.61
Recall: 0.46
The model performs reasonably well: an accuracy of 0.83 beats the 0.80 majority-class baseline, and a precision of 0.61 means most churn alerts are correct. The recall of 0.46, however, means the model misses about half of the actual churners.
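To see where the errors come from, a confusion matrix and the threshold-independent ROC AUC are useful complements:
from sklearn.metrics import confusion_matrix, roc_auc_score
# Rows: actual class, columns: predicted class
print(confusion_matrix(y_test, preds_binary))
# ROC AUC is computed from the raw scores, independent of the 0.5 threshold
print(f"ROC AUC: {roc_auc_score(y_test, preds):.2f}")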
results = X_test.copy()
results['Actual'] = y_test
results['Score'] = preds
results['Predicted'] = preds_binary
results.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 6417 to 9704
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   CreditScore        3000 non-null   int64
 1   Age                3000 non-null   int64
 2   Tenure             3000 non-null   int64
 3   Balance            3000 non-null   float64
 4   NumOfProducts      3000 non-null   int64
 5   HasCrCard          3000 non-null   int64
 6   IsActiveMember     3000 non-null   int64
 7   EstimatedSalary    3000 non-null   float64
 8   AgeGroup           3000 non-null   category
 9   Geography_France   3000 non-null   uint8
 10  Geography_Germany  3000 non-null   uint8
 11  Geography_Spain    3000 non-null   uint8
 12  Gender_Female      3000 non-null   uint8
 13  Gender_Male        3000 non-null   uint8
 14  Actual             3000 non-null   int64
 15  Score              3000 non-null   float32
 16  Predicted          3000 non-null   int32
dtypes: category(1), float32(1), float64(2), int32(1), int64(7), uint8(5)
memory usage: 276.1 KB
Earlier we saw the relationship between age group and churn in the actual data. Let us now compare this to the predictions generated by the model.
# Plot scores by age group
# Note: DataFrame.boxplot with `by` opens its own figure, so we pass figsize
# directly instead of calling plt.figure() first
results.boxplot(column='Score', by='AgeGroup', grid=False, figsize=(10, 6))
plt.xlabel('Age Group')
plt.ylabel('Score')
plt.title('Scores by Age Group')
plt.xticks(rotation=45)
plt.suptitle('')
plt.show()
We observe the same pattern: the model's predictions track the churn pattern in the actual data.
X_shap = X_test[cols_for_training]
We now initialize SHAP. Since we are using a tree-based model, we can use the TreeExplainer.
explainer = shap.TreeExplainer(xgb_classifier)
We can now calculate SHAP values for the test data set; we explain the test set because we already generated predictions for it. A note on background data: a background data set provides SHAP with the marginal distribution of the feature values, and usually the training data is used for this. When TreeExplainer is created without one, as above, it uses the tree-path-dependent algorithm, which derives this distributional information from the cover statistics stored in the trees.
shap_values = explainer.shap_values(X_shap)
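If we did want an explicit background data set, the interventional formulation, it can be passed to TreeExplainer. A sketch (the subsample size of 100 is an arbitrary choice to keep the computation fast):
# Sketch: interventional SHAP values with an explicit background data set
background = X_shap.sample(100, random_state=42)
explainer_int = shap.TreeExplainer(xgb_classifier, data=background,
                                   feature_perturbation="interventional")
shap_values_int = explainer_int.shap_values(X_shap)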
The SHAP summary plot shows the most important features.
shap.summary_plot(shap_values, X_shap, plot_type="bar")
Another version of the summary plot shows the most important features (in descending order from top to bottom) and visualizes the relationship between feature values and their impact on the model output. Age has a high impact on the model's predictions. Older customers (red dots) tend to have positive SHAP values, driving the predicted churn score upwards. In other words, higher age is associated with more churn in the model.
shap.summary_plot(shap_values, X_shap)
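Since age dominates the summary plot, a dependence plot gives a more detailed view of how the SHAP values for age change with age itself:
# SHAP values for Age plotted against the age values
shap.dependence_plot("Age", shap_values, X_shap)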
Let us look at some individual predictions. We isolate the most likely and the least likely customer to churn.
high_pred_index = np.argmax(preds)
high_pred_instance = X_shap.iloc[high_pred_index:high_pred_index+1]
shap_values_high = explainer.shap_values(high_pred_instance)
The force plot shows how the individual feature values contribute to the predicted value relative to the base value (the model's average output). The length of an arrow corresponds to the magnitude of the SHAP value and thus to the feature's importance; its direction and color visualize the direction of the impact. In this example, the relatively high age and the number of products are the main drivers of the high churn score.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values_high, high_pred_instance)
low_pred_index = np.argmin(preds)
low_pred_instance = X_shap.iloc[low_pred_index:low_pred_index+1]
shap_values_low = explainer.shap_values(low_pred_instance)
For the low-risk customer, the relatively low age, the estimated salary, and the credit score are the main determinants of the model's prediction.
shap.force_plot(explainer.expected_value, shap_values_low, low_pred_instance)
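As an alternative to the force plot, a decision plot draws the same decomposition as a cumulative path from the base value to the final score, which can be easier to read when many features contribute:
# Same explanation as the force plot, drawn as a path from base value to score
shap.decision_plot(explainer.expected_value, shap_values_low, low_pred_instance)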