XGBoost关键建模代码(Python)

前面对XGBoost的原理进行了介绍,这一篇放上相应的Python建模关键代码。还是以Jupyter为例,代码的目录如下:

图片

1. 基础配置
代码中用到的样本集以及两个自定义类可在文末点击阅读原文获取。

# Basic configuration: imports, notebook display options and the working
# directory in which all artefacts of this modelling run are stored.
import os
import sys
import warnings

import scorecardpy as sc
from IPython.core.interactiveshell import InteractiveShell

# Make the two custom helper modules (dataprocess, xgboost_m) importable.
# NOTE(review): the path separators appear to have been stripped from this
# literal when the article was published (likely E:\...\xgboost) - confirm
# and restore the real directory before running.
sys.path.append(r'E:自动建模代码xgboost')
# NOTE(review): later cells use DataProcess, xgb_model, hp, arange and joblib
# without importing them explicitly; presumably they come from these star
# imports - confirm against the helper modules.
from dataprocess import *
from xgboost_m import *

# Display the result of every expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings("ignore")

# Directory where the files of this modelling run are stored.
# NOTE(review): separators look stripped here as well (as written this is a
# drive-relative path on Windows) - confirm the intended directory.
model_path = r'D:testmodel'
os.makedirs(model_path, exist_ok=True)
os.chdir(model_path)

2. 数据读取

# Data loading: read the sample set and rename the target column to 'label'.
import pandas as pd

# The first CSV column is used as the row index; the file is GB2312-encoded.
data = pd.read_csv('D:/data.csv', encoding='gb2312', index_col=0)
data = data.rename(columns={'SeriousDlqin2yrs': 'label'})

3. 数据预处理

# Data preprocessing: describe the variables, drop unwanted / non-numeric
# columns, then screen variables by missing rate, concentration, IV and
# pairwise correlation.
dep = 'label'
print(data.shape)
# NOTE(review): the class itself (not an instance) is used, so its methods
# are presumably static - confirm against dataprocess.py.
dp = DataProcess

# Basic descriptive information per variable (e.g. fill rate, mean, IV).
var_desc = dp.var_eda(df=data, need_target=1, target=dep)
var_desc.to_csv(os.path.join(model_path, 'var_desc.csv'), index=False)

# Manually remove the specified variables.
x_remove = ['ostype', 'gender']
df_selected = data[[x for x in data.columns if x not in x_remove]]
print(df_selected.shape)

# Remove non-numeric variables (only float64 / int64 / int8 columns are kept).
numeric_dtypes = ('float64', 'int64', 'int8')
save_col = [
    col for col, dtype in df_selected.dtypes.items()
    if str(dtype) in numeric_dtypes
]
# Record the dropped columns *before* filtering; computing this afterwards
# (as the original did) always yields an empty list.
rm_col = [x for x in df_selected.columns if x not in save_col]
df_selected = df_selected[save_col]
print(df_selected.shape)

# Variable screening by missing rate, concentration rate and IV thresholds.
keep_mask = (
    (var_desc['missRate'] < 0.1)
    & (var_desc['concentricRate'] <= 0.95)
    & (var_desc['IV'] >= 0.01)
)
# var_desc was built on the full data set, so restrict the candidates to the
# columns that survived the two removal steps above (otherwise the column
# selection below raises KeyError), and never list the target twice.
available = set(df_selected.columns)
var_selected = [
    v for v in var_desc.loc[keep_mask, 'varname']
    if v in available and v != dep
]
var_selected.insert(0, dep)
df_selected = df_selected[var_selected]
print(df_selected.shape)

# Variable screening by correlation: for pairs above the threshold, the
# variable with the larger IV is kept (per the original author's comment).
df_selected = dp.var_corr_delete(df_selected, var_desc, dep, 0.95)
print(df_selected.shape)
图片

4. 拆分样本集

# Train / test split (65% / 35%) and persistence of both samples.
from sklearn.model_selection import train_test_split

dep = 'label'
model_data = df_selected.copy()
# `columns=` replaces the positional axis argument (`drop(dep, 1)`), which
# pandas 2.0 no longer accepts.
train_x, test_x, train_y, test_y = train_test_split(
    model_data.drop(columns=dep),
    model_data[dep],
    test_size=0.35,
    random_state=27,
)
# Share of positive labels in the train and test samples (shown as cell output).
train_y.sum() / len(train_y), test_y.sum() / len(test_y)

# Re-attach the target without mutating the feature frames in place.
df_train = train_x.assign(**{dep: train_y})
df_train.to_pickle(os.path.join(model_path, 'df_train.pkl'))
df_test = test_x.assign(**{dep: test_y})
df_test.to_pickle(os.path.join(model_path, 'df_test.pkl'))

5. 模型构建

# Model construction: choose the features, tune the hyper-parameters, fit the
# model and persist both the feature list and the fitted model.

# Model settings.
model_name = 'model1'
x_remove = []
x_update = [x for x in df_train.columns if x not in x_remove]  # manual variable adjustment
train = df_train[x_update]
select_feature = list(train.columns)
select_feature.remove(dep)

# Hyper-parameter search space and model fitting.
# NOTE(review): `hp`, `arange` and `joblib` are not imported in the visible
# code; presumably they are provided by `from xgboost_m import *` - confirm.
parameter_space = {
    'max_depth': hp.choice("max_depth", [3, 5]),
    'min_child_weight': hp.choice("min_child_weight", [3, 10]),
    'gamma': hp.choice("gamma", [0.03, 0.6]),
    'subsample': hp.choice("subsample", [0.5, 0.8]),
    'colsample_bytree': hp.choice("colsample_bytree", [0.5, 0.8]),
    'reg_alpha': hp.choice("reg_alpha", [0, 0.1, 100]),
    'learning_rate': hp.choice("learning_rate", arange(0.1, 0.2, 0.1)),
    'n_estimators': hp.choice("n_estimators", arange(500, 1000, 500)),
}
max_evals = 2
train = train[select_feature + [dep]]
xgb = xgb_model(indata=train, dep=dep, parameter_space=parameter_space, max_evals=max_evals)
xgb_m = xgb.xgb_model_fiting()

# Persist the feature list and the fitted model under the model name.
pd.DataFrame(select_feature).to_csv(os.path.join(model_path, model_name + '_select_feature.csv'))
joblib.dump(xgb_m, os.path.join(model_path, model_name + '_xgb_m.m'))
图片

6. 模型效果

# Model performance: score the train and test samples and evaluate each with
# scorecardpy's perf_eva.

# Performance on the training sample.
prob_train = xgb.xgb_model_proba(xgb_m, train)
# Reset both indexes so labels and scores are joined by row position.
prob_train = pd.merge(
    pd.DataFrame(train[dep]).reset_index(drop=True),
    pd.DataFrame(prob_train).reset_index(drop=True),
    left_index=True,
    right_index=True,
)
prob_train.columns = [dep, 'prob']
# Optionally pass plot_type=['ks', 'lift', 'roc', 'pr'] to choose the plots.
train_eva = sc.perf_eva(prob_train[dep], prob_train['prob'], title='train')

# Performance on the test sample.
df_test_model = df_test[select_feature + [dep]]
prob_test = xgb.xgb_model_proba(xgb_m, df_test_model)
prob_test = pd.merge(
    pd.DataFrame(df_test_model[dep]).reset_index(drop=True),
    pd.DataFrame(prob_test).reset_index(drop=True),
    left_index=True,
    right_index=True,
)
prob_test.columns = [dep, 'prob']
test_eva = sc.perf_eva(prob_test[dep], prob_test['prob'], title='test')
图片
图片
# Lift of the model in the tail score bins.

# Training sample: 20 equal-frequency score bins.
'''训练集'''
tmp = prob_train.copy()
tmp['prob_bin'] = pd.qcut(tmp['prob'], 20)
# False means "the larger the value, the worse" (per the original comment).
out = dp.cal_lift(tmp, 'prob_bin', dep, False).sort_index()
out.style.bar(subset=['badtimes'], align='mid')

# Test sample: reuse the training bin edges, widened to cover [0, 1].
'''测试集'''
cut = [0] + [i.right for i in out.index]
cut[-1] = 1
print(cut)
tmp = prob_test.copy()
tmp['prob_bin'] = pd.cut(tmp['prob'], cut)
# NOTE(review): the last argument is True here but False for the training
# sample above - confirm against cal_lift whether this difference is intended.
dp.cal_lift(tmp, 'prob_bin', dep, True).style.bar(subset=['badtimes'], align='mid')
图片

7. 模型迭代

# Model iteration: start from model1's features and keep only those whose
# importance in the fitted model1 exceeds 0.04.

# Model settings.
model_name = 'model2'
# The feature list was saved from a one-column DataFrame, hence column '0'.
select_feature = list(pd.read_csv(os.path.join(model_path, 'model1' + '_select_feature.csv'))['0'])
xgb_m = joblib.load(os.path.join(model_path, 'model1' + '_xgb_m.m'))
imp = list(xgb_m.feature_importances_)
# Pair each feature with its importance by position and filter directly.
select_feature = [name for name, weight in zip(select_feature, imp) if weight > 0.04]
print(len(select_feature))

# The remaining steps (model construction, performance display) are the same
# as before and can be adapted freely.

8. 选定模型表现

# Performance of the selected model: reload the persisted samples, feature
# list and model, then score the train and test samples.
# NOTE(review): `dep` and the `xgb` helper object are not restored here; they
# are assumed to still exist from the earlier cells - confirm.

# Selected model.
'''选定模型'''
model_name = 'model2'

# Restore the samples and the model, then score them.
'''样本打分还原'''
select_feature = list(pd.read_csv(os.path.join(model_path, model_name + '_select_feature.csv'))['0'])
df_train = pd.read_pickle(os.path.join(model_path, 'df_train.pkl'))
df_test = pd.read_pickle(os.path.join(model_path, 'df_test.pkl'))
xgb_final = joblib.load(os.path.join(model_path, model_name + '_xgb_m.m'))

df_train = df_train[select_feature + [dep]]
prob_train = xgb.xgb_model_proba(xgb_final, df_train)
# Reset both indexes so labels and scores are joined by row position.
prob_train = pd.merge(
    pd.DataFrame(df_train[dep]).reset_index(drop=True),
    pd.DataFrame(prob_train).reset_index(drop=True),
    left_index=True,
    right_index=True,
)
prob_train.columns = [dep, 'prob']

df_test = df_test[select_feature + [dep]]
# Score the freshly restored test frame. The original passed `df_test_model`,
# a leftover from the model1 evaluation built with model1's feature list.
prob_test = xgb.xgb_model_proba(xgb_final, df_test)
prob_test = pd.merge(
    pd.DataFrame(df_test[dep]).reset_index(drop=True),
    pd.DataFrame(prob_test).reset_index(drop=True),
    left_index=True,
    right_index=True,
)
prob_test.columns = [dep, 'prob']

# Model performance: same as before; adapt freely.
'''模型表现'''

在工作中的XGB建模项目中,这个是我常用的一种建模代码结构,给有兴趣的朋友作个参考。

题图来源:网站Pexels

阅读原文

简介:FRM持证人|传播分享反欺诈风控知识。欢迎关注微信公众号:反欺诈攻防战
(0)
打赏 喜欢就点个赞支持下吧

声明:本文来自“反欺诈攻防战”,分享链接:https://www.zyxiao.com/p/301454    侵权投诉

网站客服
网站客服
内容投稿 侵权处理
分享本页
返回顶部