Credit Card segmentation with Machine Learning

Import Library

In [1]:
#Basic python library which need to import
import pandas as pd
import numpy as np

#Date stuff
from datetime import datetime
from datetime import timedelta

#Library for Nice graphing
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sn
%matplotlib inline

#Library for statistics operation
import scipy.stats as stats

# Date Time library
from datetime import datetime

#Machine learning Library
import statsmodels.api as sm
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.nan)
np.set_printoptions(precision=3)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

Load data

In [2]:
# reading data into dataframe
credit= pd.read_csv("CC_GENERAL.csv")

Information about data set

In [3]:
credit.head()
Out[3]:
CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 C10001 40.900749 0.818182 95.40 0.00 95.4 0.000000 0.166667 0.000000 0.083333 0.000000 0 2 1000.0 201.802084 139.509787 0.000000 12
1 C10002 3202.467416 0.909091 0.00 0.00 0.0 6442.945483 0.000000 0.000000 0.000000 0.250000 4 0 7000.0 4103.032597 1072.340217 0.222222 12
2 C10003 2495.148862 1.000000 773.17 773.17 0.0 0.000000 1.000000 1.000000 0.000000 0.000000 0 12 7500.0 622.066742 627.284787 0.000000 12
3 C10004 1666.670542 0.636364 1499.00 1499.00 0.0 205.788017 0.083333 0.083333 0.000000 0.083333 1 1 7500.0 0.000000 NaN 0.000000 12
4 C10005 817.714335 1.000000 16.00 16.00 0.0 0.000000 0.083333 0.083333 0.000000 0.000000 0 1 1200.0 678.334763 244.791237 0.000000 12
In [4]:
credit.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 18 columns):
CUST_ID                             8950 non-null object
BALANCE                             8950 non-null float64
BALANCE_FREQUENCY                   8950 non-null float64
PURCHASES                           8950 non-null float64
ONEOFF_PURCHASES                    8950 non-null float64
INSTALLMENTS_PURCHASES              8950 non-null float64
CASH_ADVANCE                        8950 non-null float64
PURCHASES_FREQUENCY                 8950 non-null float64
ONEOFF_PURCHASES_FREQUENCY          8950 non-null float64
PURCHASES_INSTALLMENTS_FREQUENCY    8950 non-null float64
CASH_ADVANCE_FREQUENCY              8950 non-null float64
CASH_ADVANCE_TRX                    8950 non-null int64
PURCHASES_TRX                       8950 non-null int64
CREDIT_LIMIT                        8949 non-null float64
PAYMENTS                            8950 non-null float64
MINIMUM_PAYMENTS                    8637 non-null float64
PRC_FULL_PAYMENT                    8950 non-null float64
TENURE                              8950 non-null int64
dtypes: float64(14), int64(3), object(1)
memory usage: 1.2+ MB
In [5]:
# Find the total number of missing values in the dataframe
print ("\nMissing values :  ", credit.isnull().sum().values.sum())

# printing total numbers of Unique value in the dataframe. 
print ("\nUnique values :  \n",credit.nunique())
Missing values :   314

Unique values :  
 CUST_ID                             8950
BALANCE                             8871
BALANCE_FREQUENCY                     43
PURCHASES                           6203
ONEOFF_PURCHASES                    4014
INSTALLMENTS_PURCHASES              4452
CASH_ADVANCE                        4323
PURCHASES_FREQUENCY                   47
ONEOFF_PURCHASES_FREQUENCY            47
PURCHASES_INSTALLMENTS_FREQUENCY      47
CASH_ADVANCE_FREQUENCY                54
CASH_ADVANCE_TRX                      65
PURCHASES_TRX                        173
CREDIT_LIMIT                         205
PAYMENTS                            8711
MINIMUM_PAYMENTS                    8636
PRC_FULL_PAYMENT                      47
TENURE                                 7
dtype: int64
In [6]:
credit.shape
Out[6]:
(8950, 18)
In [7]:
# Intital descriptive analysis of data.
credit.describe()
Out[7]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
count 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8949.000000 8950.000000 8637.000000 8950.000000 8950.000000
mean 1564.474828 0.877271 1003.204834 592.437371 411.067645 978.871112 0.490351 0.202458 0.364437 0.135144 3.248827 14.709832 4494.449450 1733.143852 864.206542 0.153715 11.517318
std 2081.531879 0.236904 2136.634782 1659.887917 904.338115 2097.163877 0.401371 0.298336 0.397448 0.200121 6.824647 24.857649 3638.815725 2895.063757 2372.446607 0.292499 1.338331
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 50.000000 0.000000 0.019163 0.000000 6.000000
25% 128.281915 0.888889 39.635000 0.000000 0.000000 0.000000 0.083333 0.000000 0.000000 0.000000 0.000000 1.000000 1600.000000 383.276166 169.123707 0.000000 12.000000
50% 873.385231 1.000000 361.280000 38.000000 89.000000 0.000000 0.500000 0.083333 0.166667 0.000000 0.000000 7.000000 3000.000000 856.901546 312.343947 0.000000 12.000000
75% 2054.140036 1.000000 1110.130000 577.405000 468.637500 1113.821139 0.916667 0.300000 0.750000 0.222222 4.000000 17.000000 6500.000000 1901.134317 825.485459 0.142857 12.000000
max 19043.138560 1.000000 49039.570000 40761.250000 22500.000000 47137.211760 1.000000 1.000000 1.000000 1.500000 123.000000 358.000000 30000.000000 50721.483360 76406.207520 1.000000 12.000000

a) Missing Value Treatment

   - Since there are missing values in the data so we are imputing them with median.


In [8]:
credit.isnull().any()
Out[8]:
CUST_ID                             False
BALANCE                             False
BALANCE_FREQUENCY                   False
PURCHASES                           False
ONEOFF_PURCHASES                    False
INSTALLMENTS_PURCHASES              False
CASH_ADVANCE                        False
PURCHASES_FREQUENCY                 False
ONEOFF_PURCHASES_FREQUENCY          False
PURCHASES_INSTALLMENTS_FREQUENCY    False
CASH_ADVANCE_FREQUENCY              False
CASH_ADVANCE_TRX                    False
PURCHASES_TRX                       False
CREDIT_LIMIT                         True
PAYMENTS                            False
MINIMUM_PAYMENTS                     True
PRC_FULL_PAYMENT                    False
TENURE                              False
dtype: bool
In [9]:
# CREDIT_LIMIT  and MINIMUM_PAYMENTS has missing values so we need to remove with median.

credit['CREDIT_LIMIT'].fillna(credit['CREDIT_LIMIT'].median(),inplace=True)

credit['CREDIT_LIMIT'].count()


credit['MINIMUM_PAYMENTS'].median()
credit['MINIMUM_PAYMENTS'].fillna(credit['MINIMUM_PAYMENTS'].median(),inplace=True)
In [13]:
# Now again check the missing values.

credit.isnull().any()
Out[13]:
CUST_ID                             False
BALANCE                             False
BALANCE_FREQUENCY                   False
PURCHASES                           False
ONEOFF_PURCHASES                    False
INSTALLMENTS_PURCHASES              False
CASH_ADVANCE                        False
PURCHASES_FREQUENCY                 False
ONEOFF_PURCHASES_FREQUENCY          False
PURCHASES_INSTALLMENTS_FREQUENCY    False
CASH_ADVANCE_FREQUENCY              False
CASH_ADVANCE_TRX                    False
PURCHASES_TRX                       False
CREDIT_LIMIT                        False
PAYMENTS                            False
MINIMUM_PAYMENTS                    False
PRC_FULL_PAYMENT                    False
TENURE                              False
dtype: bool

Deriving New KPI

1. Monthly average purchase and cash advance amount

Monthly_avg_purchase

In [10]:
credit['Monthly_avg_purchase']=credit['PURCHASES']/credit['TENURE']
In [12]:
print(credit['Monthly_avg_purchase'].head(),'\n ',
credit['TENURE'].head(),'\n', credit['PURCHASES'].head())
0      7.950000
1      0.000000
2     64.430833
3    124.916667
4      1.333333
Name: Monthly_avg_purchase, dtype: float64 
  0    12
1    12
2    12
3    12
4    12
Name: TENURE, dtype: int64 
 0      95.40
1       0.00
2     773.17
3    1499.00
4      16.00
Name: PURCHASES, dtype: float64

Monthly_cash_advance Amount

In [13]:
credit['Monthly_cash_advance']=credit['CASH_ADVANCE']/credit['TENURE']
In [14]:
credit[credit['ONEOFF_PURCHASES']==0]['ONEOFF_PURCHASES'].count()
Out[14]:
4302

2- Purchases by type (one-off, installments)

  • To find what type of purchases customers are making on credit card
In [45]:
credit.loc[:,['ONEOFF_PURCHASES','INSTALLMENTS_PURCHASES']]
Out[45]:
ONEOFF_PURCHASES INSTALLMENTS_PURCHASES
0 0.00 95.40
1 0.00 0.00
2 773.17 0.00
3 1499.00 0.00
4 16.00 0.00
5 0.00 1333.28
6 6402.63 688.38
7 0.00 436.20
8 661.49 200.00
9 1281.60 0.00
10 0.00 920.12
11 1492.18 0.00
12 2500.23 717.76
13 419.96 1717.97
14 0.00 0.00
15 0.00 1611.70
16 0.00 0.00
17 0.00 519.00
18 166.00 338.35
19 0.00 398.64
20 0.00 176.68
21 5910.04 449.91
22 0.00 815.90
23 3454.56 793.79
24 0.00 0.00
25 0.00 399.60
26 102.00 0.00
27 0.00 233.28
28 204.55 182.50
29 0.00 100.00
... ... ...
8920 0.00 0.00
8921 0.00 57.42
8922 0.00 145.98
8923 939.09 959.79
8924 74.00 0.00
8925 0.00 418.59
8926 0.00 580.00
8927 147.80 167.40
8928 0.00 500.00
8929 0.00 0.00
8930 0.00 84.00
8931 0.00 235.80
8932 0.00 180.00
8933 255.62 363.98
8934 0.00 110.50
8935 0.00 465.90
8936 0.00 712.50
8937 0.00 0.00
8938 0.00 0.00
8939 734.40 0.00
8940 0.00 591.24
8941 0.00 214.55
8942 0.00 113.28
8943 20.90 0.00
8944 1012.73 0.00
8945 0.00 291.12
8946 0.00 300.00
8947 0.00 144.40
8948 0.00 0.00
8949 1093.25 0.00

8950 rows × 2 columns

Find customers ONEOFF_PURCHASES and INSTALLMENTS_PURCHASES details

In [16]:
credit[(credit['ONEOFF_PURCHASES']==0) & (credit['INSTALLMENTS_PURCHASES']==0)].shape
Out[16]:
(2042, 20)
In [17]:
credit[(credit['ONEOFF_PURCHASES']>0) & (credit['INSTALLMENTS_PURCHASES']>0)].shape
Out[17]:
(2774, 20)
In [19]:
credit[(credit['ONEOFF_PURCHASES']>0) & (credit['INSTALLMENTS_PURCHASES']==0)].shape
Out[19]:
(1874, 20)
In [20]:
credit[(credit['ONEOFF_PURCHASES']==0) & (credit['INSTALLMENTS_PURCHASES']>0)].shape
Out[20]:
(2260, 20)

As per above detail we found out that there are 4 types of purchase behaviour in the data set. So we need to derive a categorical variable based on their behaviour

In [21]:
def purchase(credit):
    if (credit['ONEOFF_PURCHASES']==0) & (credit['INSTALLMENTS_PURCHASES']==0):
        return 'none'
    if (credit['ONEOFF_PURCHASES']>0) & (credit['INSTALLMENTS_PURCHASES']>0):
         return 'both_oneoff_installment'
    if (credit['ONEOFF_PURCHASES']>0) & (credit['INSTALLMENTS_PURCHASES']==0):
        return 'one_off'
    if (credit['ONEOFF_PURCHASES']==0) & (credit['INSTALLMENTS_PURCHASES']>0):
        return 'istallment'
In [22]:
credit['purchase_type']=credit.apply(purchase,axis=1)
In [23]:
credit['purchase_type'].value_counts()
Out[23]:
both_oneoff_installment    2774
istallment                 2260
none                       2042
one_off                    1874
Name: purchase_type, dtype: int64

4. Limit_usage (balance to credit limit ratio ) credit card utilization

  • Lower value implies cutomers are maintaing thier balance properly. Lower value means good credit score
In [52]:
credit['limit_usage']=credit.apply(lambda x: x['BALANCE']/x['CREDIT_LIMIT'], axis=1)
In [53]:
credit['limit_usage'].head()
Out[53]:
0    0.040901
1    0.457495
2    0.332687
3    0.222223
4    0.681429
Name: limit_usage, dtype: float64

5- Payments to minimum payments ratio etc.

In [54]:
credit['PAYMENTS'].isnull().any()
credit['MINIMUM_PAYMENTS'].isnull().value_counts()
Out[54]:
False    8950
Name: MINIMUM_PAYMENTS, dtype: int64
In [55]:
credit['MINIMUM_PAYMENTS'].describe()
Out[55]:
count     8950.000000
mean       844.906767
std       2332.792322
min          0.019163
25%        170.857654
50%        312.343947
75%        788.713501
max      76406.207520
Name: MINIMUM_PAYMENTS, dtype: float64
In [56]:
credit['payment_minpay']=credit.apply(lambda x:x['PAYMENTS']/x['MINIMUM_PAYMENTS'],axis=1)
In [57]:
credit['payment_minpay']
Out[57]:
0        1.446508
1        3.826241
2        0.991682
3        0.000000
4        2.771075
5        0.581601
6       32.081820
7        1.276357
8        2.206280
9       11.612605
10       0.498597
11       4.536309
12       1.240830
13       6.593552
14       0.813816
15       0.944800
16       1.040881
17       3.477861
18       2.311048
19       0.084551
20       0.016454
21       1.251952
22      27.332818
23       6.649970
24       0.858615
25       1.320852
26       0.916592
27       1.261155
28       0.971251
29       1.006784
          ...    
8920     0.234940
8921     3.304997
8922     0.659354
8923     1.985987
8924     4.856447
8925     5.191574
8926    12.226247
8927     2.678503
8928     6.184871
8929     0.000000
8930     1.385733
8931     2.112821
8932     1.596541
8933     0.314676
8934     2.448965
8935     0.000000
8936     6.972789
8937     1.936734
8938    64.055621
8939     0.653714
8940     5.745025
8941     1.120950
8942     1.095102
8943     1.348973
8944     0.000000
8945     6.660231
8946     0.883197
8947     0.986076
8948     0.942505
8949     0.715439
Name: payment_minpay, Length: 8950, dtype: float64

Extreme value Treatment

  • Since there are variables having extreme values so I am doing log-transformation on the dataset to remove outlier effect
In [58]:
# log tranformation
cr_log=credit.drop(['CUST_ID','purchase_type'],axis=1).applymap(lambda x: np.log(x+1))
In [59]:
cr_log.describe()
Out[59]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE Monthly_avg_purchase Monthly_cash_advance limit_usage payment_minpay
count 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000
mean 6.161637 0.619940 4.899647 3.204274 3.352403 3.319086 0.361268 0.158699 0.270072 0.113512 0.817570 1.894731 8.094825 6.624540 5.916079 0.117730 2.519680 3.050877 2.163970 0.296081 1.357600
std 2.013303 0.148590 2.916872 3.246365 3.082973 3.566298 0.277317 0.216672 0.281852 0.156716 1.009316 1.373856 0.819629 1.591763 1.169929 0.211617 0.130367 2.002823 2.429741 0.250303 0.940149
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.931826 0.000000 0.018982 0.000000 1.945910 0.000000 0.000000 0.000000 0.000000
25% 4.861995 0.635989 3.704627 0.000000 0.000000 0.000000 0.080042 0.000000 0.000000 0.000000 0.000000 0.693147 7.378384 5.951361 5.146667 0.000000 2.564949 1.481458 0.000000 0.040656 0.648817
50% 6.773521 0.693147 5.892417 3.663562 4.499810 0.000000 0.405465 0.080042 0.154151 0.000000 0.000000 2.079442 8.006701 6.754489 5.747301 0.000000 2.564949 3.494587 0.000000 0.264455 1.109459
75% 7.628099 0.693147 7.013133 6.360274 6.151961 7.016449 0.650588 0.262364 0.559616 0.200671 1.609438 2.890372 8.779711 7.550732 6.671670 0.133531 2.564949 4.587295 4.606022 0.540911 1.953415
max 9.854515 0.693147 10.800403 10.615512 10.021315 10.760839 0.693147 0.693147 0.693147 0.916291 4.820282 5.883322 10.308986 10.834125 11.243832 0.693147 2.564949 8.315721 8.276166 2.827902 8.830767
In [60]:
col=['BALANCE','PURCHASES','CASH_ADVANCE','TENURE','PAYMENTS','MINIMUM_PAYMENTS','PRC_FULL_PAYMENT','CREDIT_LIMIT']
cr_pre=cr_log[[x for x in cr_log.columns if x not in col ]]
In [61]:
cr_pre.columns
Out[61]:
Index(['BALANCE_FREQUENCY', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
       'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
       'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'Monthly_avg_purchase',
       'Monthly_cash_advance', 'limit_usage', 'payment_minpay'],
      dtype='object')
In [62]:
cr_log.columns
Out[62]:
Index(['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
       'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
       'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
       'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
       'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
       'TENURE', 'Monthly_avg_purchase', 'Monthly_cash_advance', 'limit_usage',
       'payment_minpay'],
      dtype='object')

Insights from KPIs

Average payment_minpayment ratio for each purchse type.

In [63]:
x=credit.groupby('purchase_type').apply(lambda x: np.mean(x['payment_minpay']))
type(x)
x.values
Out[63]:
array([ 7.237, 13.259, 10.087,  5.571])
In [71]:
ax.barh?
In [72]:
fig,ax=plt.subplots()
ax.barh(y=range(len(x)), width=x.values,align='center')
ax.set(yticks= np.arange(len(x)),yticklabels = x.index);
plt.title('Mean payment_minpayment ratio for each purchse type')
Out[72]:
Text(0.5, 1.0, 'Mean payment_minpayment ratio for each purchse type')
In [73]:
credit.describe()
Out[73]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE Monthly_avg_purchase Monthly_cash_advance limit_usage payment_minpay
count 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000
mean 1564.474828 0.877271 1003.204834 592.437371 411.067645 978.871112 0.490351 0.202458 0.364437 0.135144 3.248827 14.709832 4494.282473 1733.143852 844.906767 0.153715 11.517318 86.175173 88.977984 0.388884 9.059164
std 2081.531879 0.236904 2136.634782 1659.887917 904.338115 2097.163877 0.401371 0.298336 0.397448 0.200121 6.824647 24.857649 3638.646702 2895.063757 2332.792322 0.292499 1.338331 180.508787 193.136115 0.389722 118.180526
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 50.000000 0.000000 0.019163 0.000000 6.000000 0.000000 0.000000 0.000000 0.000000
25% 128.281915 0.888889 39.635000 0.000000 0.000000 0.000000 0.083333 0.000000 0.000000 0.000000 0.000000 1.000000 1600.000000 383.276166 170.857654 0.000000 12.000000 3.399375 0.000000 0.041494 0.913275
50% 873.385231 1.000000 361.280000 38.000000 89.000000 0.000000 0.500000 0.083333 0.166667 0.000000 0.000000 7.000000 3000.000000 856.901546 312.343947 0.000000 12.000000 31.936667 0.000000 0.302720 2.032717
75% 2054.140036 1.000000 1110.130000 577.405000 468.637500 1113.821139 0.916667 0.300000 0.750000 0.222222 4.000000 17.000000 6500.000000 1901.134317 788.713501 0.142857 12.000000 97.228333 99.085196 0.717571 6.052729
max 19043.138560 1.000000 49039.570000 40761.250000 22500.000000 47137.211760 1.000000 1.000000 1.000000 1.500000 123.000000 358.000000 30000.000000 50721.483360 76406.207520 1.000000 12.000000 4086.630833 3928.100980 15.909951 6840.528861

customers with installment purchases are paying dues

In [77]:
credit[credit['purchase_type']=='n']
Out[77]:
CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE Monthly_avg_purchase Monthly_cash_advance purchase_type limit_usage payment_minpay
In [78]:
credit.groupby('purchase_type').apply(lambda x: np.mean(x['Monthly_cash_advance'])).plot.barh()

plt.title('Average cash advance taken by customers of different Purchase type : Both, None,Installment,One_Off')
Out[78]:
Text(0.5, 1.0, 'Average cash advance taken by customers of different Purchase type : Both, None,Installment,One_Off')

Customers who don't do either one-off or installment purchases take more cash on advance

In [79]:
credit.groupby('purchase_type').apply(lambda x: np.mean(x['limit_usage'])).plot.barh()
Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0xc881a20>

Original dataset with categorical column converted to number type.

In [139]:
cre_original=pd.concat([credit,pd.get_dummies(credit['purchase_type'])],axis=1)

Preparing Machine learning algorithm

We do have some categorical data which need to convert with the help of dummy creation

In [80]:
# creating Dummies for categorical variable
cr_pre['purchase_type']=credit.loc[:,'purchase_type']
pd.get_dummies(cr_pre['purchase_type'])
Out[80]:
both_oneoff_installment istallment none one_off
0 0 1 0 0
1 0 0 1 0
2 0 0 0 1
3 0 0 0 1
4 0 0 0 1
5 0 1 0 0
6 1 0 0 0
7 0 1 0 0
8 1 0 0 0
9 0 0 0 1
10 0 1 0 0
11 0 0 0 1
12 1 0 0 0
13 1 0 0 0
14 0 0 1 0
15 0 1 0 0
16 0 0 1 0
17 0 1 0 0
18 1 0 0 0
19 0 1 0 0
20 0 1 0 0
21 1 0 0 0
22 0 1 0 0
23 1 0 0 0
24 0 0 1 0
25 0 1 0 0
26 0 0 0 1
27 0 1 0 0
28 1 0 0 0
29 0 1 0 0
... ... ... ... ...
8920 0 0 1 0
8921 0 1 0 0
8922 0 1 0 0
8923 1 0 0 0
8924 0 0 0 1
8925 0 1 0 0
8926 0 1 0 0
8927 1 0 0 0
8928 0 1 0 0
8929 0 0 1 0
8930 0 1 0 0
8931 0 1 0 0
8932 0 1 0 0
8933 1 0 0 0
8934 0 1 0 0
8935 0 1 0 0
8936 0 1 0 0
8937 0 0 1 0
8938 0 0 1 0
8939 0 0 0 1
8940 0 1 0 0
8941 0 1 0 0
8942 0 1 0 0
8943 0 0 0 1
8944 0 0 0 1
8945 0 1 0 0
8946 0 1 0 0
8947 0 1 0 0
8948 0 0 1 0
8949 0 0 0 1

8950 rows × 4 columns

Now merge the created dummy with the original data frame

In [81]:
cr_dummy=pd.concat([cr_pre,pd.get_dummies(cr_pre['purchase_type'])],axis=1)
In [82]:
l=['purchase_type']
In [83]:
cr_dummy=cr_dummy.drop(l,axis=1)
cr_dummy.isnull().any()
Out[83]:
BALANCE_FREQUENCY                   False
ONEOFF_PURCHASES                    False
INSTALLMENTS_PURCHASES              False
PURCHASES_FREQUENCY                 False
ONEOFF_PURCHASES_FREQUENCY          False
PURCHASES_INSTALLMENTS_FREQUENCY    False
CASH_ADVANCE_FREQUENCY              False
CASH_ADVANCE_TRX                    False
PURCHASES_TRX                       False
Monthly_avg_purchase                False
Monthly_cash_advance                False
limit_usage                         False
payment_minpay                      False
both_oneoff_installment             False
istallment                          False
none                                False
one_off                             False
dtype: bool
In [84]:
cr_dummy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 17 columns):
BALANCE_FREQUENCY                   8950 non-null float64
ONEOFF_PURCHASES                    8950 non-null float64
INSTALLMENTS_PURCHASES              8950 non-null float64
PURCHASES_FREQUENCY                 8950 non-null float64
ONEOFF_PURCHASES_FREQUENCY          8950 non-null float64
PURCHASES_INSTALLMENTS_FREQUENCY    8950 non-null float64
CASH_ADVANCE_FREQUENCY              8950 non-null float64
CASH_ADVANCE_TRX                    8950 non-null float64
PURCHASES_TRX                       8950 non-null float64
Monthly_avg_purchase                8950 non-null float64
Monthly_cash_advance                8950 non-null float64
limit_usage                         8950 non-null float64
payment_minpay                      8950 non-null float64
both_oneoff_installment             8950 non-null uint8
istallment                          8950 non-null uint8
none                                8950 non-null uint8
one_off                             8950 non-null uint8
dtypes: float64(13), uint8(4)
memory usage: 944.0 KB
In [85]:
cr_dummy.head(3)
Out[85]:
BALANCE_FREQUENCY ONEOFF_PURCHASES INSTALLMENTS_PURCHASES PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX Monthly_avg_purchase Monthly_cash_advance limit_usage payment_minpay both_oneoff_installment istallment none one_off
0 0.597837 0.000000 4.568506 0.154151 0.000000 0.080042 0.000000 0.000000 1.098612 2.191654 0.000000 0.040086 0.894662 0 1 0 0
1 0.646627 0.000000 0.000000 0.000000 0.000000 0.000000 0.223144 1.609438 0.000000 0.000000 6.287695 0.376719 1.574068 0 0 1 0
2 0.693147 6.651791 0.000000 0.693147 0.693147 0.000000 0.000000 0.000000 2.564949 4.180994 0.000000 0.287197 0.688979 0 0 0 1
In [86]:
sns.heatmap(cr_dummy.corr())
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0xcaf1cc0>
- Heat map shows that many features are co-related so applying dimensionality reduction will help negating multi-colinearity in data
  • Before applying PCA we will standardize data to avoid effect of scale on our result. Centering and Scaling will make all features with equal weight.

Standardrizing data

  • To put data on the same scale
In [88]:
from sklearn.preprocessing import  StandardScaler
In [89]:
sc=StandardScaler()
In [90]:
cr_dummy.shape
Out[90]:
(8950, 17)
In [91]:
cr_scaled=sc.fit_transform(cr_dummy)
In [ ]:
cr_scaled

Applying PCA

With the help of principal component analysis we will reduce features

In [94]:
from sklearn.decomposition import PCA
In [95]:
cr_dummy.shape
Out[95]:
(8950, 17)
In [96]:
#We have 17 features so our n_component will be 17.
pc=PCA(n_components=17)
cr_pca=pc.fit(cr_scaled)
In [97]:
#Lets check if we will take 17 component then how much varience it explain. Ideally it should be 1 i.e 100%
sum(cr_pca.explained_variance_ratio_)
Out[97]:
1.0
In [98]:
var_ratio={}
for n in range(2,18):
    pc=PCA(n_components=n)
    cr_pca=pc.fit(cr_scaled)
    var_ratio[n]=sum(cr_pca.explained_variance_ratio_)
In [99]:
var_ratio
Out[99]:
{2: 0.5826439793960285,
 3: 0.7299379309512699,
 4: 0.8115442762351249,
 5: 0.8770555795291439,
 6: 0.9186492443512616,
 7: 0.9410925256030134,
 8: 0.961611405368306,
 9: 0.9739787081990642,
 10: 0.9835896584630712,
 11: 0.9897248107341952,
 12: 0.9927550009135226,
 13: 0.9953907562385428,
 14: 0.9979616898169594,
 15: 0.9996360473172955,
 16: 1.0,
 17: 1.0}

Since 6 components are explaining about 90% variance so we select 5 components

In [100]:
pc=PCA(n_components=6)
In [101]:
p=pc.fit(cr_scaled)
In [102]:
cr_scaled.shape
Out[102]:
(8950, 17)
In [103]:
p.explained_variance_
Out[103]:
array([6.836, 3.07 , 2.504, 1.387, 1.114, 0.707])
In [104]:
np.sum(p.explained_variance_)
Out[104]:
15.61878226930881
In [105]:
np.sum(p.explained_variance_)
Out[105]:
15.61878226930881
In [106]:
var_ratio
Out[106]:
{2: 0.5826439793960285,
 3: 0.7299379309512699,
 4: 0.8115442762351249,
 5: 0.8770555795291439,
 6: 0.9186492443512616,
 7: 0.9410925256030134,
 8: 0.961611405368306,
 9: 0.9739787081990642,
 10: 0.9835896584630712,
 11: 0.9897248107341952,
 12: 0.9927550009135226,
 13: 0.9953907562385428,
 14: 0.9979616898169594,
 15: 0.9996360473172955,
 16: 1.0,
 17: 1.0}
In [107]:
pd.Series(var_ratio).plot()
Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0xc9842e8>

Since 5 components are explaining about 87% variance so we select 5 components

In [108]:
cr_scaled.shape
Out[108]:
(8950, 17)
In [109]:
pc_final=PCA(n_components=6).fit(cr_scaled)

reduced_cr=pc_final.fit_transform(cr_scaled)
In [110]:
dd=pd.DataFrame(reduced_cr)
In [111]:
dd.head()
Out[111]:
0 1 2 3 4 5
0 -0.242841 -2.759668 0.343061 -0.417359 -0.007100 0.019755
1 -3.975652 0.144625 -0.542989 1.023832 -0.428929 -0.572463
2 1.287396 1.508938 2.709966 -1.892252 0.010809 -0.599932
3 -1.047613 0.673103 2.501794 -1.306784 0.761348 1.408986
4 -1.451586 -0.176336 2.286074 -1.624896 -0.561969 -0.675214

So initially we had 17 variables now its 5 so our variable go reduced

In [112]:
dd.shape
Out[112]:
(8950, 6)
In [113]:
col_list=cr_dummy.columns
In [114]:
col_list
Out[114]:
Index(['BALANCE_FREQUENCY', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
       'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
       'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'Monthly_avg_purchase',
       'Monthly_cash_advance', 'limit_usage', 'payment_minpay',
       'both_oneoff_installment', 'istallment', 'none', 'one_off'],
      dtype='object')
In [115]:
pd.DataFrame(pc_final.components_.T, columns=['PC_' +str(i) for i in range(6)],index=col_list)
Out[115]:
PC_0 PC_1 PC_2 PC_3 PC_4 PC_5
BALANCE_FREQUENCY 0.029707 0.240072 -0.263140 -0.353549 -0.228681 -0.693816
ONEOFF_PURCHASES 0.214107 0.406078 0.239165 0.001520 -0.023197 0.129094
INSTALLMENTS_PURCHASES 0.312051 -0.098404 -0.315625 0.087983 -0.002181 0.115223
PURCHASES_FREQUENCY 0.345823 0.015813 -0.162843 -0.074617 0.115948 -0.081879
ONEOFF_PURCHASES_FREQUENCY 0.214702 0.362208 0.163222 0.036303 -0.051279 -0.097299
PURCHASES_INSTALLMENTS_FREQUENCY 0.295451 -0.112002 -0.330029 0.023502 0.025871 0.006731
CASH_ADVANCE_FREQUENCY -0.214336 0.286074 -0.278586 0.096353 0.360132 0.066589
CASH_ADVANCE_TRX -0.229393 0.291556 -0.285089 0.103484 0.332753 0.082307
PURCHASES_TRX 0.355503 0.106625 -0.102743 -0.054296 0.104971 -0.009402
Monthly_avg_purchase 0.345992 0.141635 0.023986 -0.079373 0.194147 0.015878
Monthly_cash_advance -0.243861 0.264318 -0.257427 0.135292 0.268026 0.058258
limit_usage -0.146302 0.235710 -0.251278 -0.431682 -0.181885 0.024298
payment_minpay 0.119632 0.021328 0.136357 0.591561 0.215446 -0.572467
both_oneoff_installment 0.241392 0.273676 -0.131935 0.254710 -0.340849 0.294708
istallment 0.082209 -0.443375 -0.208683 -0.190829 0.353821 -0.086087
none -0.310283 -0.005214 -0.096911 0.245104 -0.342222 -0.176809
one_off -0.042138 0.167737 0.472749 -0.338549 0.362585 -0.060698

So above data gave us eigen vector for each component we had all eigen vector value very small we can remove those variable bur in our case its not.

In [117]:
# Factor Analysis : variance explained by each component- 
pd.Series(pc_final.explained_variance_ratio_,index=['PC_'+ str(i) for i in range(6)])
Out[117]:
PC_0    0.402058
PC_1    0.180586
PC_2    0.147294
PC_3    0.081606
PC_4    0.065511
PC_5    0.041594
dtype: float64

Clustering

Based on the intuition on type of purchases made by customers and their distinctive behavior exhibited based on the purchase_type (as visualized above in Insights from KPI) , I am starting with 4 clusters.

In [119]:
from sklearn.cluster import KMeans
In [120]:
km_4=KMeans(n_clusters=4,random_state=123)
In [121]:
km_4.fit(reduced_cr)
Out[121]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=123, tol=0.0001, verbose=0)
In [ ]:
km_4.labels_
In [123]:
pd.Series(km_4.labels_).value_counts()
Out[123]:
3    2769
2    2224
1    2088
0    1869
dtype: int64

Here we donot have known k value so we will find the K. To do that we need to take a cluster range between 1 and 21.

Identify cluster Error.

In [124]:
cluster_range = range( 1, 21 )
cluster_errors = []

for num_clusters in cluster_range:
    clusters = KMeans( num_clusters )
    clusters.fit( reduced_cr )
    cluster_errors.append( clusters.inertia_ )# clusters.inertia_ is basically cluster error here.
In [125]:
clusters_df = pd.DataFrame( { "num_clusters":cluster_range, "cluster_errors": cluster_errors } )

clusters_df[0:21]
Out[125]:
num_clusters cluster_errors
0 1 139772.482528
1 2 93308.123825
2 3 70745.193400
3 4 49446.066485
4 5 42548.525149
5 6 37713.045270
6 7 34124.456465
7 8 31502.720956
8 9 28865.971034
9 10 26302.837007
10 11 24020.031165
11 12 22364.382649
12 13 21006.533766
13 14 19857.234322
14 15 19214.601923
15 16 18057.825042
16 17 17469.334973
17 18 17040.494574
18 19 16356.843904
19 20 15937.614441
In [126]:
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )
Out[126]:
[<matplotlib.lines.Line2D at 0xc8812e8>]

From above graph we will find elbow range. here it is 4,5,6

Silhouette Coefficient

In [127]:
from sklearn import metrics
In [128]:
# calculate SC for K=3 through K=12
k_range = range(2, 21)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(reduced_cr)
    scores.append(metrics.silhouette_score(reduced_cr, km.labels_))
In [129]:
scores
Out[129]:
[0.33194521792342696,
 0.3540311511892251,
 0.43708577439659485,
 0.43121145209717765,
 0.4228144914653746,
 0.4022440826179354,
 0.4144537298622617,
 0.3889240713086914,
 0.392999135547462,
 0.3678798374268567,
 0.3669766371659527,
 0.3403473055008083,
 0.3526966868656406,
 0.3439233788957459,
 0.3336956707851422,
 0.32231668200851954,
 0.3322902581427395,
 0.3274102528436649,
 0.3239792346891927]
In [130]:
# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)
In [131]:
color_map={0:'r',1:'b',2:'g',3:'y'}
label_color=[color_map[l] for l in km_4.labels_]
plt.figure(figsize=(7,7))
plt.scatter(reduced_cr[:,0],reduced_cr[:,1],c=label_color,cmap='Spectral',alpha=0.1)
Out[131]:
<matplotlib.collections.PathCollection at 0xe33b2b0>

It is very difficult to draw iddividual plot for cluster, so we will use pair plot which will provide us all graph in one shot. To do that we need to take following steps

In [132]:
df_pair_plot=pd.DataFrame(reduced_cr,columns=['PC_' +str(i) for i in range(6)])
In [133]:
df_pair_plot['Cluster']=km_4.labels_ #Add cluster column in the data frame
In [134]:
df_pair_plot.head()
Out[134]:
PC_0 PC_1 PC_2 PC_3 PC_4 PC_5 Cluster
0 -0.242841 -2.759668 0.343061 -0.417359 -0.007100 0.019755 2
1 -3.975652 0.144625 -0.542989 1.023832 -0.428929 -0.572463 1
2 1.287396 1.508938 2.709966 -1.892252 0.010809 -0.599932 0
3 -1.047613 0.673103 2.501794 -1.306784 0.761348 1.408986 0
4 -1.451586 -0.176336 2.286074 -1.624896 -0.561969 -0.675214 0
In [135]:
#pairwise relationship of components on the data
sns.pairplot(df_pair_plot,hue='Cluster', palette= 'Dark2', diag_kind='kde',size=1.85)
Out[135]:
<seaborn.axisgrid.PairGrid at 0x13534b00>