# Convert a categorical/text column to numeric; values that cannot be
# parsed become NaN (errors='coerce').
df['col'] = pd.to_numeric(df['col'], errors='coerce')
# Count the null values in every column
df.isnull().sum()
# Missing values imputation.
# Assign the filled column back instead of calling fillna(inplace=True) on
# the df['col'] selection: that chained in-place form triggers a
# FutureWarning in pandas 2.x and does not update df under Copy-on-Write.
df['col'] = df['col'].fillna(x)
# Outlier treatment
def outlier_capping(x):
    """Cap the values of a numeric Series at its 1st and 99th percentiles.

    Args:
        x: pandas Series of numeric values.

    Returns:
        A new Series in which values above the 99th percentile are replaced
        by that percentile and values below the 1st percentile are replaced
        by that percentile. The input Series is not modified.
    """
    # Series.clip_upper / clip_lower were removed in pandas 1.0; Series.clip
    # with the `upper` / `lower` keywords is the supported replacement.
    capped = x.clip(upper=x.quantile(0.99))
    # The lower threshold is taken from the already upper-capped values,
    # which keeps the same order of operations as the two-step original.
    return capped.clip(lower=capped.quantile(0.01))
# Cap outliers column by column across the numeric frame
df_num = df_num.apply(outlier_capping)
# A utility function to create dummy variables
def create_dummies(df, colname):
    """Replace one column of a DataFrame with its dummy (indicator) columns.

    Args:
        df: DataFrame that contains ``colname``.
        colname: Name of the column to encode. It is also used as the prefix
            of the generated dummy column names.

    Returns:
        A new DataFrame with ``colname`` removed and the dummy columns
        appended on the right. The last dummy level is left out, so a column
        with k levels contributes k - 1 indicator columns. The input frame
        is not modified.
    """
    # Drop the last indicator column so the dummies are not perfectly collinear.
    col_dummies = pd.get_dummies(df[colname], prefix=colname).iloc[:, :-1]
    return pd.concat([df, col_dummies], axis=1).drop(columns=colname)
# Encode each listed feature: cast it to a categorical dtype, then swap the
# raw column for its dummy columns.
for c_feature in features:
    df[c_feature] = df[c_feature].astype('category')
    df = create_dummies(df, c_feature)
# Deciling
# pd.qcut splits on quantiles, so each of the 10 groups holds roughly the
# same number of rows; pd.cut(bins=10) would instead build 10 equal-width
# intervals, which are not deciles.
# NOTE(review): qcut raises ValueError when heavy ties make bin edges
# coincide -- confirm the column has enough distinct values.
df['col'] = pd.qcut(df['col'], q=10, labels=range(1, 11))
# Train/test split: every column except the target 'Y' is a predictor;
# 30% of the rows are held out, with a fixed seed for a repeatable split.
from sklearn.model_selection import train_test_split

features = df.columns.difference(['Y'])
train_x, test_x, train_y, test_y = train_test_split(
    df[features],
    df['Y'],
    random_state=42,
    test_size=0.3,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search the number of trees over 10..99 with 2-fold cross-validation,
# using all available cores.
param_grid = dict(n_estimators=np.arange(10, 100))
tree = GridSearchCV(
    estimator=RandomForestClassifier(warm_start=True, oob_score=False),
    param_grid=param_grid,
    n_jobs=-1,
    cv=2,
)
tree.fit(train_x, train_y)
# Best parameter combination found by the search
tree.best_params_
# Fit the forest used for feature ranking: 86 trees, out-of-bag scoring
# enabled, fixed seed, all cores.
# NOTE(review): 86 is presumably the value reported by the grid search -- confirm.
radm_clf = RandomForestClassifier(
    n_estimators=86,
    oob_score=True,
    random_state=12,
    n_jobs=-1,
)
radm_clf.fit(train_x, train_y)
# Rank the training columns by the forest's feature importances, highest first.
# feature_importances_ is read once and reused for both sorting and lookup.
importances = radm_clf.feature_importances_
indices = np.argsort(importances)[::-1]
# Build the table in one step instead of appending rows through .loc in a
# loop: it is faster and gives 'rank' and 'importance' numeric dtypes.
# Row labels are 0..n-1 in rank order.
feature_rank = pd.DataFrame({
    'rank': np.arange(1, len(indices) + 1),
    'feature': train_x.columns[indices],
    'importance': importances[indices],
})
feature_rank.head(15)
# Keep the 15 top-ranked features plus the target column.
# head(15) is positional and returns exactly 15 rows; the label-based slice
# .loc[0:15] includes both endpoints and would return 16.
rf_features = feature_rank['feature'].head(15).tolist()
rf_features.append('Y')
df_new = df_old[rf_features]
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Balance the training data by random over-sampling.
# Column labels are captured first so the resampled output can be wrapped
# back into DataFrames with the same labels.
col_x = pd.DataFrame(train_x).columns.tolist()
col_y = pd.DataFrame(train_y).columns.tolist()
ros = RandomOverSampler(random_state=42)
# fit_resample replaces fit_sample, which was deprecated and later removed
# from imbalanced-learn.
balanced_x, balanced_y = ros.fit_resample(train_x, train_y)
train_x = pd.DataFrame(balanced_x, columns=col_x)
train_y = pd.DataFrame(balanced_y, columns=col_y)