データとの絆

もう一回もう一回行こうぜ 僕らの声

アイムアルーザー ずっと前から聞こえてた

准备工作

1
init_woody
Matplotlib env init complete.
Warnings off.
1
2
3
4
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
1
2
# Show up to 200 columns when displaying DataFrames instead of truncating.
pd.set_option('display.max_columns', 200)

数据源准备

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Load the training tables.
train_idv_td = pd.read_csv("../data/2/train/IDV_TD.csv", encoding='utf-8') # personal term-deposit accounts (IDV_TD)
train_idv_dpsa = pd.read_csv("../data/2/train/IDV_DPSA.csv", encoding='utf-8') # personal demand-deposit accounts (IDV_DPSA)
train_loan = pd.read_csv("../data/2/train/LOAN.csv", encoding='utf-8') # loan accounts (LOAN)
train_bond = pd.read_csv("../data/2/train/BOND.csv", encoding='utf-8') # treasury-bond accounts (BOND)
train_fund = pd.read_csv("../data/2/train/FUND.csv", encoding='utf-8') # fund accounts (FUND)
train_prec_metal = pd.read_csv("../data/2/train/PREC_METAL.csv", encoding='utf-8') # precious-metal accounts (PREC_METAL)
train_aget_insr = pd.read_csv("../data/2/train/AGET_INSR.csv", encoding='utf-8') # agent-insurance accounts (AGET_INSR)
train_thr_pty_cstd = pd.read_csv("../data/2/train/THR_PTY_CSTD.csv", encoding='utf-8') # third-party custody accounts (THR_PTY_CSTD)
train_idv_cust_basic = pd.read_csv("../data/2/train/IDV_CUST_BASIC.csv", encoding='utf-8') # personal customer base info (IDV_CUST_BASIC)
train_tr_dc = pd.read_csv("../data/2/train/TR_DC.csv", encoding='utf-8') # transactions (TR_DC)
train_base_excg = pd.read_csv("../data/2/train/BASE_EXCG.csv", encoding='utf-8') # exchange-rate table (BASE_EXCG)
train_cust_result = pd.read_csv("../data/2/train/CUST_RESULT.csv", encoding='utf-8') # customer labels (CUST_RESULT)
# A-board evaluation data.
# Note: the A set ships no BASE_EXCG.csv.
A_idv_td = pd.read_csv("../data/2/A/IDV_TD.csv", encoding='utf-8')
A_idv_dpsa = pd.read_csv("../data/2/A/IDV_DPSA.csv", encoding='utf-8')
A_loan = pd.read_csv("../data/2/A/LOAN.csv", encoding='utf-8')
A_bond = pd.read_csv("../data/2/A/BOND.csv", encoding='utf-8')
A_fund = pd.read_csv("../data/2/A/FUND.csv", encoding='utf-8')
A_prec_metal = pd.read_csv("../data/2/A/PREC_METAL.csv", encoding='utf-8')
A_aget_insr = pd.read_csv("../data/2/A/AGET_INSR.csv", encoding='utf-8')
A_thr_pty_cstd = pd.read_csv("../data/2/A/THR_PTY_CSTD.csv", encoding='utf-8')
A_idv_cust_basic = pd.read_csv("../data/2/A/IDV_CUST_BASIC.csv", encoding='utf-8')
A_tr_dc = pd.read_csv("../data/2/A/TR_DC.csv", encoding='utf-8')
A_customid = pd.read_csv("../data/2/A/CUSTOMID.csv", encoding='utf-8')
# B-board evaluation data.
B_idv_td = pd.read_csv("../data/2/B/IDV_TD.csv", encoding='utf-8')
B_idv_dpsa = pd.read_csv("../data/2/B/IDV_DPSA.csv", encoding='utf-8')
B_loan = pd.read_csv("../data/2/B/LOAN.csv", encoding='utf-8')
B_bond = pd.read_csv("../data/2/B/BOND.csv", encoding='utf-8')
B_fund = pd.read_csv("../data/2/B/FUND.csv", encoding='utf-8')
B_prec_metal = pd.read_csv("../data/2/B/PREC_METAL.csv", encoding='utf-8')
B_aget_insr = pd.read_csv("../data/2/B/AGET_INSR.csv", encoding='utf-8')
B_thr_pty_cstd = pd.read_csv("../data/2/B/THR_PTY_CSTD.csv", encoding='utf-8')
B_idv_cust_basic = pd.read_csv("../data/2/B/IDV_CUST_BASIC.csv", encoding='utf-8')
B_tr_dc = pd.read_csv("../data/2/B/TR_DC.csv", encoding='utf-8')
B_customid = pd.read_csv("../data/2/B/CUSTOMID.csv", encoding='utf-8')
/root/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2728: DtypeWarning: Columns (12) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
1
2
3
4
5
6
7
# Rename the exchange-rate table's CCY_LETE_CD column to CCY_CD so it can be
# joined against the account tables (which key currency by CCY_CD).
train_base_excg = pd.read_csv("../data/2/train/BASE_EXCG.csv", encoding='utf-8') # exchange-rate table (BASE_EXCG)
train_base_excg['CCY_CD'] = train_base_excg['CCY_LETE_CD']
train_base_excg = train_base_excg.drop(['CCY_LETE_CD'], axis=1)
# BUG FIX: the original `.apply(lambda x: float(x))` / `.apply(lambda x: int(x))`
# calls discarded their results, so neither column was actually converted.
# Assign the converted series back so RMB_MID_PRIC is numeric and TR_DAT integral.
train_base_excg['RMB_MID_PRIC'] = train_base_excg['RMB_MID_PRIC'].astype(float)
train_tr_dc['TR_DAT'] = train_tr_dc['TR_DAT'].astype(int)

公用函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Currency conversion helper used with DataFrame.apply(axis=1).
def compute_REAL_MONEY(train_IDV_DPSA_SAMPLE, A, B):
    """Convert one row's amount to RMB.

    Multiplies the value in column *A* of the row by the exchange rate in
    column *B* (the rate may be string-typed, hence the float() coercion).
    """
    amount = train_IDV_DPSA_SAMPLE[A]
    mid_price = train_IDV_DPSA_SAMPLE[B]
    return amount * float(mid_price)

# one-hot 处理函数

# Local (offline) evaluation of a binary classifier.
def evaluate(test_y, y_pred):
    """Print and return classification metrics for 0/1 labels.

    Parameters
    ----------
    test_y : array-like of ground-truth 0/1 labels (needs .sum() / .shape).
    y_pred : array-like of predicted 0/1 labels.

    Returns a dict with the positive rates, AUC, accuracy, recall, F1,
    precision and confusion matrix.  NOTE: the 'Confusuion' key keeps its
    historical misspelling so existing callers do not break.
    """
    from sklearn import metrics
    # Share of positive labels in the truth / in the predictions.
    test_pct_1 = test_y.sum() / test_y.shape[0]
    pred_pct_1 = y_pred.sum() / y_pred.shape[0]

    roc_auc = metrics.roc_auc_score(test_y, y_pred)
    acc = metrics.accuracy_score(test_y, y_pred)
    Recall = metrics.recall_score(test_y, y_pred)
    F1 = metrics.f1_score(test_y, y_pred)
    Precision = metrics.precision_score(test_y, y_pred)
    Confusion = metrics.confusion_matrix(test_y, y_pred)

    print('test_pct_1: %.4f' % test_pct_1)
    print('pred_pct_1: %.4f' % pred_pct_1)
    print('Precision: %.4f' % Precision)  # fixed 'Precesion' typo in the label
    print('Recall: %.4f' % Recall)
    print('F1-score: %.4f' % F1)

    print('confusion matrix:')
    # Reuse the matrix computed above instead of recomputing it.
    print(Confusion)
    return {'test_pct_1': test_pct_1, 'pred_pct_1': pred_pct_1, 'roc_auc': roc_auc, 'acc': acc,
            'recall': Recall, 'F1': F1, 'Precision': Precision, 'Confusuion': Confusion}

def feature_rank(model, num):
    """Return the *num* highest-scoring features of a fitted booster.

    *model* must expose ``get_fscore()`` (e.g. an xgboost Booster) returning
    a ``{feature_name: score}`` mapping.
    """
    score_map = model.get_fscore()
    ranking = pd.DataFrame({"feature_name": list(score_map.keys()),
                            "feature_score": list(score_map.values())})
    # Highest score first.
    ranking = ranking.sort_values(ascending=False, by=['feature_score'])
    return ranking.head(num)

def stat_df(df):
    """Summarise each column of *df*.

    Returns a DataFrame with one row per column: unique-value count, percent
    missing, percent taken by the most frequent value (NaN counted as a
    value), and dtype — ordered by percent missing, descending.
    """
    stats = []
    n_rows = df.shape[0]  # hoisted: constant across columns
    for col in df.columns:
        stats.append((col,
                      df[col].nunique(),
                      df[col].isnull().sum() * 100 / n_rows,
                      df[col].value_counts(normalize=True, dropna=False).values[0] * 100,
                      df[col].dtype))
    stats_df = pd.DataFrame(stats, columns=['特征', '唯一数数量', '缺失值占比', '最多数占比', '类型'])
    # BUG FIX: sort_values returns a new frame; the original discarded the
    # result, so the summary came back unsorted.  Assign it back.
    stats_df = stats_df.sort_values('缺失值占比', ascending=False)
    return stats_df

def plot_feature_distribution(df1,df2,label1,label2,features,width=6,height=6):
    """Overlay per-feature KDE curves of two datasets on a width*height grid
    of subplots so their distributions can be compared visually (e.g. the
    FLAG==0 vs FLAG==1 populations)."""
    i=0
    sns.set_style('whitegrid')
    plt.figure()
    fig,ax = plt.subplots(width,height,figsize=(20,12))

    for feature in features:
        i+=1
        plt.subplot(width,height,i)
        # One labelled curve per dataset.
        sns.kdeplot(df1[feature],bw=0.5,label=label1)
        sns.kdeplot(df2[feature],bw=0.5,label=label2)
        plt.xlabel(feature,fontsize=9)
        locs, labels = plt.xticks()
        # Shrink tick labels so the dense subplot grid stays readable.
        plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
        plt.tick_params(axis='y',which='major',labelsize=6)
    plt.show()

特征工程

个人客户基本信息表(IDV_CUST_BASIC) - 处理

1
2
3
4
5
6
7
# Drop demographic/occupation code columns that are not used downstream.
train_idv_cust_basic_SAMPLE = train_idv_cust_basic.drop(['DATA_DAT', 'PROV_CD', 'CUST_SEX_CD', 'RES_CD', 'RES_STA_CD', 'NATY_CD',
    'NATN_CD', 'CULT_DGR_CD', 'DGR_CD', 'PLC_STS_CD', 'PRFN', 'ADMI_POS_CD',
    'SPEC_TECH_PRFN_QUA_CD', 'TITLE_RANK_CD', 'WORK_TYP_CD', 'GC_BRTH',
    'UNIT_PROP_CD', 'WORK_YEAR', 'OCP_CD'], axis=1)
# Join the label table so each customer row carries its FLAG.
train_idv_cust_basic_SAMPLE = pd.merge(train_idv_cust_basic_SAMPLE, train_cust_result, on='CUST_NO')
1
2
3
4
5
# Candidate feature columns: everything except the customer id and the label.
features = [x for x in train_idv_cust_basic_SAMPLE.columns if x not in ['CUST_NO','FLAG']]
# Split by label to compare feature distributions between the two classes.
df0 = train_idv_cust_basic_SAMPLE[train_idv_cust_basic_SAMPLE.FLAG==0]
df1 = train_idv_cust_basic_SAMPLE[train_idv_cust_basic_SAMPLE.FLAG==1]

plot_feature_distribution(df0,df1,'0','1',features,2,2)
/root/anaconda3/lib/python3.6/site-packages/statsmodels/nonparametric/kde.py:454: RuntimeWarning: invalid value encountered in greater
  X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.
/root/anaconda3/lib/python3.6/site-packages/statsmodels/nonparametric/kde.py:454: RuntimeWarning: invalid value encountered in less
  X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.



<matplotlib.figure.Figure at 0x7f0167e93dd8>

FEATURE

1
2
3
4
5
6
7
8
# Feature builder for the personal-customer base table.
def IDV_CUST_BASIC_PARSE(dataset_IDV_CUST_BASIC):
    """Strip the personal-customer base table down to the modelling columns
    by removing the demographic / occupation code fields."""
    unused_columns = ['DATA_DAT', 'PROV_CD', 'CUST_SEX_CD', 'RES_CD', 'RES_STA_CD', 'NATY_CD',
                      'NATN_CD', 'CULT_DGR_CD', 'DGR_CD', 'PLC_STS_CD', 'PRFN', 'ADMI_POS_CD',
                      'SPEC_TECH_PRFN_QUA_CD', 'TITLE_RANK_CD', 'WORK_TYP_CD', 'GC_BRTH',
                      'UNIT_PROP_CD', 'WORK_YEAR', 'OCP_CD']
    return dataset_IDV_CUST_BASIC.drop(unused_columns, axis=1)

个人定期存款账户信息(IDV_TD)- 处理

1
2
# 查看数据比例
stat_df(train_idv_td)

客户表信息情况

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Feature builder for the personal term-deposit table.
def IDV_TD_PARSE(dataset_IDV_TD):
    """Aggregate IDV_TD to one row per customer, converting amounts to RMB
    via the module-level ``train_base_excg`` exchange table.

    NOTE(review): only the DATA_DAT == 3734035200 snapshot is used —
    presumably the newest of the three monthly snapshots; confirm against
    the data dictionary.
    """
    train_idv_td_3 = dataset_IDV_TD[dataset_IDV_TD['DATA_DAT']==3734035200]
    # Drop date/status/code columns not used as features.
    # NOTE(review): 'DATA_DAT' appears twice in this list; pandas tolerates it.
    train_idv_td_3 = train_idv_td_3.drop(['DATA_DAT', 'ARG_CRT_DAT', 'DATA_DAT', 'CLS_ACCT_DAT', 'MATU_DAT', 'LAC', 'ACCT_STS_CD','DP_DAY_CD', 'RDEP_IND_CD', 'RDEP_DP_DAY_CD', 'RAT_CTG','FXDI_SA_ACCM', 'MTH_ACT_DAYS_TOT'], axis=1)
    # Remove exact duplicate rows.
    train_idv_td_3.drop_duplicates(subset=None, keep='first', inplace=True)
    # Join the exchange-rate table (module-level train_base_excg) on currency.
    train_IDV_TD_SAMPLE_3 = pd.merge(train_idv_td_3, train_base_excg, on = 'CCY_CD')
    # Convert every money column to RMB with the mid price.
    train_IDV_TD_SAMPLE_3['CRBAL'] = train_IDV_TD_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('CRBAL', 'RMB_MID_PRIC'))
    train_IDV_TD_SAMPLE_3['REG_CAP'] = train_IDV_TD_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('REG_CAP', 'RMB_MID_PRIC'))
    train_IDV_TD_SAMPLE_3['FXDI_T_ACCM'] = train_IDV_TD_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('FXDI_T_ACCM', 'RMB_MID_PRIC'))
    train_IDV_TD_SAMPLE_3['TDOP_SHD_PAY_INTS'] = train_IDV_TD_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('TDOP_SHD_PAY_INTS', 'RMB_MID_PRIC'))
    train_IDV_TD_SAMPLE_3['MOTH_CR_ACCM'] = train_IDV_TD_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('MOTH_CR_ACCM', 'RMB_MID_PRIC'))
    # Deposit total = interest payable + registered principal.
    train_IDV_TD_SAMPLE_3['IDV_TD_SUM'] = train_IDV_TD_SAMPLE_3['TDOP_SHD_PAY_INTS'] + train_IDV_TD_SAMPLE_3['REG_CAP']
    # Currency/rate columns have served their purpose; drop them.
    train_IDV_TD_SAMPLE_3 = train_IDV_TD_SAMPLE_3.drop(['CCY_CD', 'RMB_MID_PRIC'], axis=1)
    # De-duplicate again after the merge.
    train_IDV_TD_SAMPLE_3.drop_duplicates(subset=None, keep='first', inplace=True)
    # Sum across all of a customer's accounts.
    train_IDV_TD_SAMPLE_SUM_3 = train_IDV_TD_SAMPLE_3[['CUST_NO', 'CRBAL', 'REG_CAP', 'FXDI_T_ACCM', 'TDOP_SHD_PAY_INTS', 'MOTH_CR_ACCM', 'IDV_TD_SUM']].groupby('CUST_NO').sum().reset_index()
    # Prefix names that also exist in other tables so later merges stay unique.
    train_IDV_TD_SAMPLE_SUM_3 = train_IDV_TD_SAMPLE_SUM_3.rename(columns={'CRBAL':'IDV_TD_CRBAL', 'MOTH_CR_ACCM':'IDV_TD_MOTH_CR_ACCM'})
    # Drop features strongly correlated with IDV_TD_SUM.
    train_IDV_TD_SAMPLE_SUM_3 = train_IDV_TD_SAMPLE_SUM_3.drop(['TDOP_SHD_PAY_INTS', 'REG_CAP'], axis=1)
    # Return the aggregated third-snapshot data.
    return train_IDV_TD_SAMPLE_SUM_3

个人活期存款账户信息(IDV_DPSA)- 处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Feature builder for the personal demand-deposit table.
def IDV_DPSA_PARSE(dataset_IDV_DPSA):
    """Aggregate IDV_DPSA to one row per customer for the last monthly
    snapshot, converting amounts to RMB via the module-level
    ``train_base_excg`` table.  The first two snapshots are split out but
    currently unused."""
    # Split into the three monthly snapshots (reset_index adds an 'index'
    # column that is dropped later).
    train_idv_dpsa_1 = dataset_IDV_DPSA[dataset_IDV_DPSA['DATA_DAT']==3728764800].reset_index()
    train_idv_dpsa_2 = dataset_IDV_DPSA[dataset_IDV_DPSA['DATA_DAT']==3731443200].reset_index()
    train_idv_dpsa_3 = dataset_IDV_DPSA[dataset_IDV_DPSA['DATA_DAT']==3734035200].reset_index()
    # Drop date/status/code columns not used as features.
    train_idv_dpsa_3 = train_idv_dpsa_3.drop(['DATA_DAT', 'ARG_CRT_DAT', 'CLS_ACCT_DAT', 'MATU_DAT', 'LAC', 'ACCT_STS_CD', 'RAT_CTG', 'CUST_RANK_CD', 'DAY_TFO_SUM', 'MTH_ACT_DAYS_TOT'], axis=1)
    # Join the exchange-rate table on currency code.
    train_IDV_DPSA_SAMPLE_3 = pd.merge(train_idv_dpsa_3, train_base_excg, on = 'CCY_CD')
    # Remove exact duplicate rows.
    train_IDV_DPSA_SAMPLE_3.drop_duplicates(subset=None, keep='first', inplace=True)
    # Convert every money column to RMB with the mid price.
    train_IDV_DPSA_SAMPLE_3['FRZ_TOT_AMT'] = train_IDV_DPSA_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('FRZ_TOT_AMT', 'RMB_MID_PRIC'))
    train_IDV_DPSA_SAMPLE_3['DAY_WD_ACT_AMT'] = train_IDV_DPSA_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('DAY_WD_ACT_AMT', 'RMB_MID_PRIC'))
    train_IDV_DPSA_SAMPLE_3['DAY_CSH_DP_SUM'] = train_IDV_DPSA_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('DAY_CSH_DP_SUM', 'RMB_MID_PRIC'))
    train_IDV_DPSA_SAMPLE_3['DAY_CSH_WD_SUM'] = train_IDV_DPSA_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('DAY_CSH_WD_SUM', 'RMB_MID_PRIC'))
    train_IDV_DPSA_SAMPLE_3['DAY_TFI_SUM'] = train_IDV_DPSA_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('DAY_TFI_SUM', 'RMB_MID_PRIC'))
    train_IDV_DPSA_SAMPLE_3['MOTH_CR_ACCM'] = train_IDV_DPSA_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('MOTH_CR_ACCM', 'RMB_MID_PRIC'))
    train_IDV_DPSA_SAMPLE_3['BEG_MOTH_CRBAL'] = train_IDV_DPSA_SAMPLE_3.apply(compute_REAL_MONEY, axis = 1, args = ('BEG_MOTH_CRBAL', 'RMB_MID_PRIC'))
    # Drop the currency/rate helper columns plus the reset_index artefact.
    train_IDV_DPSA_SAMPLE_3 = train_IDV_DPSA_SAMPLE_3.drop(['CCY_CD', 'RMB_MID_PRIC', 'index'], axis=1)
    # Prefix names that collide with other tables.
    train_IDV_DPSA_SAMPLE_3 = train_IDV_DPSA_SAMPLE_3.rename(columns={'CRBAL':'IDV_DPSA_CRBAL', 'MOTH_CR_ACCM':'IDV_DPSA_MOTH_CR_ACCM'})
    # Sum across all of a customer's accounts.
    train_IDV_DPSA_SAMPLE_3 = train_IDV_DPSA_SAMPLE_3[['CUST_NO', 'IDV_DPSA_CRBAL', 'ITST_BRNG_ACCM', 'FRZ_TOT_AMT', 'DAY_WD_ACT_AMT', 'DAY_CSH_DP_SUM',
        'DAY_CSH_WD_SUM', 'DAY_TFI_SUM', 'IDV_DPSA_MOTH_CR_ACCM', 'BEG_MOTH_CRBAL']].groupby('CUST_NO').sum().reset_index()
    # Drop features too strongly correlated with the kept balances.
    train_IDV_DPSA_SAMPLE_3 = train_IDV_DPSA_SAMPLE_3.drop(['BEG_MOTH_CRBAL', 'IDV_DPSA_MOTH_CR_ACCM'], axis=1)

    # Return the aggregated third-snapshot data.
    return train_IDV_DPSA_SAMPLE_3

交易信息(TR_DC) - 处理

1
2
3
4
# Per-customer transaction counts by type (one column per TR_TYPE value).
train_tr_dc_TR_TYPE = train_tr_dc.groupby(['CUST_NO','TR_TYPE'])['TR_TYPE'].count().unstack().reset_index()
# Total transaction count across all types.
train_tr_dc_TR_TYPE['TR_TYPE_TIMES'] = train_tr_dc_TR_TYPE.drop('CUST_NO',axis=1).sum(axis=1)
# Keep the total plus the EBMKO5RA type count.
train_tr_dc_BOE = train_tr_dc_TR_TYPE[['CUST_NO','TR_TYPE_TIMES','EBMKO5RA']]
train_tr_dc_BOE.columns = ['CUST_NO', 'TR_TYPE_TIMES','EBMKO5RA']
1
train_tr_dc_BOE.head()

BOEING

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Feature builder for the transaction table.
def TR_DC_PARSE(dataset_TR_DC):
    """Aggregate TR_DC per customer: total inflow/outflow sums plus mean
    inflow/outflow amount per transaction.

    NOTE(review): the BOE/SVRTO type-count frames are still computed below,
    but their merges are commented out, so they never reach the returned
    frame.  Also assumes 'EBMKO5RA', 'SVRTO061' and 'SVRTO161' occur as
    TR_TYPE values in the input — TODO confirm.
    """
    # Drop reversed transactions (CAN_IND==1) and RED_BLU_CD (all zeros here).
    dataset_TR_DC = dataset_TR_DC[dataset_TR_DC['CAN_IND']==0].drop(['RED_BLU_CD'],axis=1)
    # Split amounts into inflow (positive) and outflow (negative).
    dataset_TR_DC['INCOME'] = dataset_TR_DC['TR_AMT'] > 0
    myseries = dataset_TR_DC.groupby(['CUST_NO', 'INCOME'])['TR_AMT'].sum()
    myseries = myseries.unstack()
    TR_DC_IN_OUT = pd.DataFrame(myseries).reset_index()
    TR_DC_IN_OUT['IN_SUM'] = TR_DC_IN_OUT[True]
    TR_DC_IN_OUT['OUT_SUM'] = TR_DC_IN_OUT[False]
    TR_DC_IN_OUT = TR_DC_IN_OUT.drop([True, False], axis=1)
    # Per-customer transaction counts by type; keep total and EBMKO5RA count.
    train_tr_dc_TR_TYPE = dataset_TR_DC.groupby(['CUST_NO','TR_TYPE'])['TR_TYPE'].count().unstack().reset_index()
    train_tr_dc_TR_TYPE['TR_TYPE_TIMES'] = train_tr_dc_TR_TYPE.drop('CUST_NO',axis=1).sum(axis=1)
    train_tr_dc_BOE = train_tr_dc_TR_TYPE[['CUST_NO','TR_TYPE_TIMES','EBMKO5RA']]
    train_tr_dc_BOE.columns = ['CUST_NO', 'TR_TYPE_TIMES','EBMKO5RA']
    # Counts for the SVRTO061 / SVRTO161 service codes (currently unused).
    train_tr_dc_SVRTO061 = train_tr_dc_TR_TYPE[['CUST_NO','SVRTO061']]
    train_tr_dc_SVRTO161 = train_tr_dc_TR_TYPE[['CUST_NO','SVRTO161']]
    train_tr_dc_SVRTO061.columns = ['CUST_NO', 'SVRTO061']
    train_tr_dc_SVRTO161.columns = ['CUST_NO', 'SVRTO161']
    # Mean amount per inflow / outflow transaction.
    TR_DC_IN_MEAN = dataset_TR_DC[dataset_TR_DC['TR_AMT']>0][['CUST_NO', 'TR_AMT']].groupby('CUST_NO').mean().reset_index()
    TR_DC_IN_MEAN.columns = ['CUST_NO', 'TR_DC_IN_MEAN']
    TR_DC_OUT_MEAN = dataset_TR_DC[dataset_TR_DC['TR_AMT']<0][['CUST_NO', 'TR_AMT']].groupby('CUST_NO').mean().reset_index()
    TR_DC_OUT_MEAN.columns = ['CUST_NO', 'TR_DC_OUT_MEAN']
    # Merge the kept features; outer joins preserve customers missing a side.
    # train_TR_DC_SAMPLE = pd.merge(TR_DC_IN_OUT, train_tr_dc_BOE, on='CUST_NO', how='outer')
    train_TR_DC_SAMPLE = pd.merge(TR_DC_IN_OUT, TR_DC_IN_MEAN, on='CUST_NO', how='outer')
    train_TR_DC_SAMPLE = pd.merge(train_TR_DC_SAMPLE, TR_DC_OUT_MEAN, on='CUST_NO', how='outer')
    # train_TR_DC_SAMPLE = pd.merge(train_TR_DC_SAMPLE, train_tr_dc_SVRTO061, on='CUST_NO', how='outer')
    # train_TR_DC_SAMPLE = pd.merge(train_TR_DC_SAMPLE, train_tr_dc_SVRTO161, on='CUST_NO', how='outer')
    return train_TR_DC_SAMPLE

第三方存管账户信息(THR_PTY_CSTD) - 处理

1
2
3
4
5
6
# Drop fields with no downstream use before splitting by month.
train_thr_pty_cstd_temple = train_thr_pty_cstd.drop(['CCY_CD', 'CUST_CTG_CD', 'MTH_ACT_DAYS_TOT'], axis=1)
# Split into the three monthly snapshots.
train_thr_pty_cstd_1 = train_thr_pty_cstd_temple[train_thr_pty_cstd_temple['DATA_DAT']==3728764800].reset_index()
train_thr_pty_cstd_2 = train_thr_pty_cstd_temple[train_thr_pty_cstd_temple['DATA_DAT']==3731443200].reset_index()
train_thr_pty_cstd_3 = train_thr_pty_cstd_temple[train_thr_pty_cstd_temple['DATA_DAT']==3734035200].reset_index()
1
2
3
4
5
6
7
8
# Drop the reset_index artefact column and the snapshot date.
train_thr_pty_cstd_3 = train_thr_pty_cstd_3.drop(['index', 'DATA_DAT'], axis=1)
# Per-customer sums of each balance / transfer field.
ARG_BAL = train_thr_pty_cstd_3.groupby(['CUST_NO'])['ARG_BAL'].sum().reset_index()
AVL_BAL = train_thr_pty_cstd_3.groupby(['CUST_NO'])['AVL_BAL'].sum().reset_index()
MTH_ARG_BAL_ACCM = train_thr_pty_cstd_3.groupby(['CUST_NO'])['MTH_ARG_BAL_ACCM'].sum().reset_index()
MTH_FUD_TF_INWD_AMT = train_thr_pty_cstd_3.groupby(['CUST_NO'])['MTH_FUD_TF_INWD_AMT'].sum().reset_index()
MTH_FUD_TF_OUT_AMT = train_thr_pty_cstd_3.groupby(['CUST_NO'])['MTH_FUD_TF_OUT_AMT'].sum().reset_index()
1
2
3
4
5
# Outer-join the per-field sums into one frame per customer.
train_thr_pty_cstd_SAMPLE = pd.merge(ARG_BAL, AVL_BAL, on='CUST_NO', how='outer')
train_thr_pty_cstd_SAMPLE = pd.merge(train_thr_pty_cstd_SAMPLE, MTH_ARG_BAL_ACCM, on='CUST_NO', how='outer')
train_thr_pty_cstd_SAMPLE = pd.merge(train_thr_pty_cstd_SAMPLE, MTH_FUD_TF_INWD_AMT, on='CUST_NO', how='outer')
train_thr_pty_cstd_SAMPLE = pd.merge(train_thr_pty_cstd_SAMPLE, MTH_FUD_TF_OUT_AMT, on='CUST_NO', how='outer')
1
2
# Drop features too strongly correlated with the kept ones.
train_thr_pty_cstd_SAMPLE = train_thr_pty_cstd_SAMPLE.drop(['ARG_BAL', 'MTH_ARG_BAL_ACCM'], axis=1)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Feature builder for the third-party custody table.
def THR_PTY_CSTD_PARSE(dataset_THR_PTY_CSTD):
    """Reduce THR_PTY_CSTD to per-customer sums for the newest monthly
    snapshot (DATA_DAT == 3734035200), keeping the available balance and
    the monthly inward/outward transfer amounts."""
    # Fields that carry no signal for the model.
    slim = dataset_THR_PTY_CSTD.drop(['CCY_CD', 'CUST_CTG_CD', 'MTH_ACT_DAYS_TOT'], axis=1)
    # Per-month snapshots; only the last month feeds the features below.
    month_1 = slim[slim['DATA_DAT'] == 3728764800].reset_index()
    month_2 = slim[slim['DATA_DAT'] == 3731443200].reset_index()
    month_3 = slim[slim['DATA_DAT'] == 3734035200].reset_index()
    # Drop the reset_index artefact column and the snapshot date.
    month_3 = month_3.drop(['index', 'DATA_DAT'], axis=1)
    # Per-customer totals, merged one field at a time (outer join keeps
    # every customer that appears in any field).
    merged = month_3.groupby(['CUST_NO'])['AVL_BAL'].sum().reset_index()
    for field in ['ARG_BAL', 'MTH_ARG_BAL_ACCM', 'MTH_FUD_TF_INWD_AMT', 'MTH_FUD_TF_OUT_AMT']:
        totals = month_3.groupby(['CUST_NO'])[field].sum().reset_index()
        merged = pd.merge(merged, totals, on='CUST_NO', how='outer')
    # These two track the kept columns too closely (highly correlated).
    return merged.drop(['ARG_BAL', 'MTH_ARG_BAL_ACCM'], axis=1)

贷款账户信息(LOAN)- 处理

1
2
3
4
5
6
7
8
9
# Keep only the loan columns needed below and attach exchange rates.
train_loan_temple = train_loan[['CUST_NO', 'DATA_DAT', 'ARG_TYP_CD', 'CCY_CD', 'ARG_LIF_CYC_STA_CD', 'LN_STS_CD', 'CHANL_CD', 'LN_TERM', 'RPAY_MOD_CD', 'LAC', 'NON_MATU_CAP', 'ACD_NML_INTS', 'INTS_TOT_AMT'
, 'BUS_BREED_CD', 'NML_CAP_BAL', 'MTH_NML_CAP_ACCM']]
train_loan_temple = pd.merge(train_loan_temple, train_base_excg, on = 'CCY_CD')
# Split into the three monthly snapshots; only month 3 is processed further.
train_loan_1 = train_loan_temple[train_loan_temple['DATA_DAT']==3728764800]
train_loan_2 = train_loan_temple[train_loan_temple['DATA_DAT']==3731443200]
train_loan_3 = train_loan_temple[train_loan_temple['DATA_DAT']==3734035200]
train_loan_3 = train_loan_3.drop(['DATA_DAT'], axis=1)
1
train_loan_3.drop_duplicates(subset=None, keep='first', inplace=True)
1
2
3
4
5
6
# Convert the loan money columns to RMB using the mid price.
train_loan_3['ACD_NML_INTS'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('ACD_NML_INTS', 'RMB_MID_PRIC'))
train_loan_3['NON_MATU_CAP'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('NON_MATU_CAP', 'RMB_MID_PRIC'))
train_loan_3['NML_CAP_BAL'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('NML_CAP_BAL', 'RMB_MID_PRIC'))
train_loan_3['INTS_TOT_AMT'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('INTS_TOT_AMT', 'RMB_MID_PRIC'))
train_loan_3['MTH_NML_CAP_ACCM'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('MTH_NML_CAP_ACCM', 'RMB_MID_PRIC'))
1
2
3
4
5
6
7
8
9
10
11
12
# Mean accrued normal interest per customer.
MEAN_ACD_NML_INTS = train_loan_3[['CUST_NO', 'ACD_NML_INTS']].groupby(['CUST_NO']).mean().reset_index()
# Mean unmatured principal.
MEAN_NON_MATU_CAP = train_loan_3[['CUST_NO', 'NON_MATU_CAP']].groupby(['CUST_NO']).mean().reset_index()
# Mean monthly normal-principal accumulation.
MEAN_MTH_NML_CAP_ACCM = train_loan_3[['CUST_NO','MTH_NML_CAP_ACCM']].groupby(['CUST_NO']).mean().reset_index()
# Mean loan term.
MEAN_LOAN_TERM = train_loan_3[['CUST_NO','LN_TERM']].groupby(['CUST_NO']).mean().reset_index()
# Mean normal principal balance.
MEAN_NML_CAP_BAL = train_loan_3[['CUST_NO', 'NML_CAP_BAL']].groupby(['CUST_NO']).mean().reset_index()
# Mean total interest.
MEAN_INTS_TOT_AMT = train_loan_3[['CUST_NO', 'INTS_TOT_AMT']].groupby(['CUST_NO']).mean().reset_index()
1
2
# Outer-join the six per-customer means into one loan feature frame.
train_loan_3_SAMPLE = pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(MEAN_ACD_NML_INTS, MEAN_NON_MATU_CAP, on='CUST_NO', how='outer'), MEAN_MTH_NML_CAP_ACCM, on='CUST_NO', how='outer'), MEAN_LOAN_TERM, on='CUST_NO', how='outer'),
                               MEAN_NML_CAP_BAL, on='CUST_NO', how='outer'), MEAN_INTS_TOT_AMT, on='CUST_NO', how='outer')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def BY_MONTH_LOAN_METHOD(dataset_LOAN):
    """Per-customer mean loan features from the last monthly snapshot.

    Amounts are converted to RMB via the module-level ``train_base_excg``
    exchange table.  The first two monthly snapshots are split out but
    currently unused.
    """
    # Keep only the loan columns needed below.
    train_loan_temple = dataset_LOAN[['CUST_NO', 'DATA_DAT', 'ARG_TYP_CD', 'CCY_CD', 'ARG_LIF_CYC_STA_CD', 'LN_STS_CD', 'CHANL_CD', 'LN_TERM', 'RPAY_MOD_CD', 'LAC', 'NON_MATU_CAP', 'ACD_NML_INTS', 'INTS_TOT_AMT' , 'BUS_BREED_CD', 'NML_CAP_BAL', 'MTH_NML_CAP_ACCM']]
    # Attach exchange rates by currency.
    train_loan_temple = pd.merge(train_loan_temple, train_base_excg, on = 'CCY_CD')
    # Split into the three monthly snapshots; only month 3 is used.
    train_loan_1 = train_loan_temple[train_loan_temple['DATA_DAT']==3728764800]
    train_loan_2 = train_loan_temple[train_loan_temple['DATA_DAT']==3731443200]
    train_loan_3 = train_loan_temple[train_loan_temple['DATA_DAT']==3734035200]
    train_loan_3 = train_loan_3.drop(['DATA_DAT'], axis=1)
    train_loan_3.drop_duplicates(subset=None, keep='first', inplace=True)
    # Convert the money columns to RMB using the mid price.
    train_loan_3['ACD_NML_INTS'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('ACD_NML_INTS', 'RMB_MID_PRIC'))
    train_loan_3['NON_MATU_CAP'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('NON_MATU_CAP', 'RMB_MID_PRIC'))
    train_loan_3['NML_CAP_BAL'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('NML_CAP_BAL', 'RMB_MID_PRIC'))
    train_loan_3['INTS_TOT_AMT'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('INTS_TOT_AMT', 'RMB_MID_PRIC'))
    train_loan_3['MTH_NML_CAP_ACCM'] = train_loan_3.apply(compute_REAL_MONEY, axis = 1, args = ('MTH_NML_CAP_ACCM', 'RMB_MID_PRIC'))
    # Mean accrued normal interest.
    MEAN_ACD_NML_INTS = train_loan_3[['CUST_NO', 'ACD_NML_INTS']].groupby(['CUST_NO']).mean().reset_index()
    # Mean unmatured principal.
    MEAN_NON_MATU_CAP = train_loan_3[['CUST_NO', 'NON_MATU_CAP']].groupby(['CUST_NO']).mean().reset_index()
    # Mean monthly normal-principal accumulation.
    MEAN_MTH_NML_CAP_ACCM = train_loan_3[['CUST_NO','MTH_NML_CAP_ACCM']].groupby(['CUST_NO']).mean().reset_index()
    # Mean loan term.
    MEAN_LOAN_TERM = train_loan_3[['CUST_NO','LN_TERM']].groupby(['CUST_NO']).mean().reset_index()
    # Mean normal principal balance.
    MEAN_NML_CAP_BAL = train_loan_3[['CUST_NO', 'NML_CAP_BAL']].groupby(['CUST_NO']).mean().reset_index()
    # Mean total interest.
    MEAN_INTS_TOT_AMT = train_loan_3[['CUST_NO', 'INTS_TOT_AMT']].groupby(['CUST_NO']).mean().reset_index()
    # Outer-join the six means into one frame.
    train_loan_3_SAMPLE = pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(MEAN_ACD_NML_INTS, MEAN_NON_MATU_CAP, on='CUST_NO', how='outer'), MEAN_MTH_NML_CAP_ACCM, on='CUST_NO', how='outer'), MEAN_LOAN_TERM, on='CUST_NO', how='outer'),
                                   MEAN_NML_CAP_BAL, on='CUST_NO', how='outer'), MEAN_INTS_TOT_AMT, on='CUST_NO', how='outer')

    return train_loan_3_SAMPLE

基金账户信息(FUND)- 处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def BY_MONTH_FUND_METHOD(dataset_FOUND):
    """Per-customer month-over-month std/mean/max/min statistics of the fund
    account fields, plus aggregates of the periodic subscription/redemption
    open flags."""
    # Drop date/code columns that are not aggregated.
    train_fund_temple = dataset_FOUND.drop(['CCY_CD', 'ARG_CRT_DAT', 'ARG_LIF_CYC_STA_CD', 'CHANL_CD', 'VLU_DAT', 'DATE_MATU', 'CLS_ACCT_DAT', 'LAST_ACT_CHNG_DAT', 'MTH_ACT_DAYS_TOT'], axis=1)
    # One row per customer, one column per (field, month) pair.
    BY_MONTH_FUND_TABLE = train_fund_temple.groupby(['CUST_NO', 'DATA_DAT'])[['FUD_UNIT_NET_VAL', 'FUD_PROD_TYP_CD', 'RSK_RANK_CD', 'SHR',
        'MOTH_BAL_ACCM', 'FUND_BAL', 'FUND_BAL_MOTH_BAL_ACCM']].sum().unstack()
    # std/mean/max/min across the monthly columns of one field.
    def BY_MONTH_FUND(key):
        train_fund_temple_STD = BY_MONTH_FUND_TABLE[key].std(axis=1).reset_index()
        train_fund_temple_STD.columns = ['CUST_NO', str('FUND_'+key+'_STD')]
        train_fund_temple_MEAN = BY_MONTH_FUND_TABLE[key].mean(axis=1).reset_index()
        train_fund_temple_MEAN.columns = ['CUST_NO',str('FUND_'+key+'_MEAN')]
        train_fund_temple_MAX = BY_MONTH_FUND_TABLE[key].max(axis=1).reset_index()
        train_fund_temple_MAX.columns = ['CUST_NO',str('FUND_'+key+'_MAX')]
        train_fund_temple_MIN = BY_MONTH_FUND_TABLE[key].min(axis=1).reset_index()
        train_fund_temple_MIN.columns = ['CUST_NO',str('FUND_'+key+'_MIN')]
        # Inner-join the four statistics on CUST_NO.
        train_fund_temple = pd.merge(train_fund_temple_STD, train_fund_temple_MEAN, on='CUST_NO', how='inner')
        train_fund_temple = pd.merge(train_fund_temple, train_fund_temple_MAX, on='CUST_NO', how='inner')
        train_fund_temple = pd.merge(train_fund_temple, train_fund_temple_MIN, on='CUST_NO', how='inner')
        return train_fund_temple  # NOTE: this local shadows the outer train_fund_temple
    train_fund_temple = BY_MONTH_FUND('RSK_RANK_CD')
    for key in ['FUD_UNIT_NET_VAL', 'FUD_PROD_TYP_CD', 'SHR', 'MOTH_BAL_ACCM', 'FUND_BAL', 'FUND_BAL_MOTH_BAL_ACCM']:
        train_fund_temple = pd.merge(train_fund_temple, BY_MONTH_FUND(key))
    train_fund_temple.drop_duplicates(subset=None, keep='first', inplace=True)
    # Aggregates of the periodic subscription (RATN_APLY) / redemption
    # (RATN_REDM) open indicators.
    # NOTE(review): only STD of APLY and MIN of REDM are merged below; the
    # other aggregate frames are computed but never used.
    SUM_FUND_RATN_APLY_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_APLY_OPN_IND']].groupby(['CUST_NO']).sum().reset_index()
    STD_FUND_RATN_APLY_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_APLY_OPN_IND']].groupby(['CUST_NO']).std().reset_index()
    MEAN_FUND_RATN_APLY_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_APLY_OPN_IND']].groupby(['CUST_NO']).mean().reset_index()
    MAX_FUND_RATN_APLY_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_APLY_OPN_IND']].groupby(['CUST_NO']).max().reset_index()
    MIN_FUND_RATN_APLY_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_APLY_OPN_IND']].groupby(['CUST_NO']).min().reset_index()

    SUM_FUND_RATN_REDM_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_REDM_OPN_IND']].groupby(['CUST_NO']).sum().reset_index()
    STD_FUND_RATN_REDM_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_REDM_OPN_IND']].groupby(['CUST_NO']).std().reset_index()
    MEAN_FUND_RATN_REDM_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_REDM_OPN_IND']].groupby(['CUST_NO']).mean().reset_index()
    MAX_FUND_RATN_REDM_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_REDM_OPN_IND']].groupby(['CUST_NO']).max().reset_index()
    MIN_FUND_RATN_REDM_OPN_IND = dataset_FOUND[['CUST_NO', 'RATN_REDM_OPN_IND']].groupby(['CUST_NO']).min().reset_index()
    train_fund_temple = pd.merge(pd.merge(train_fund_temple, STD_FUND_RATN_APLY_OPN_IND, on='CUST_NO', how='outer'), MIN_FUND_RATN_REDM_OPN_IND, on='CUST_NO', how='outer')
    return train_fund_temple

国债账户信息(BOND)- 处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def BY_MONTH_BOND_METHOD(dataset_BOND):
    """Per-customer month-over-month std/mean/max/min statistics of the
    treasury-bond account fields."""
    # Drop date/code columns that are not aggregated.
    # NOTE(review): 'MATU_DAT' appears twice in this list; pandas tolerates it.
    train_bond_temple = dataset_BOND.drop(['CCY_CD', 'ARG_CRT_DAT', 'MATU_DAT', 'MATU_DAT', 'PROD_CLS_CD', 'CERT_DAT', 'CLS_ACCT_DAT', 'ARG_LIF_CYC_STA_CD', 'NTNL_DEBT_INTS_TYP_CD', 'MTH_ACT_DAYS_TOT'], axis=1)
    # One row per customer, one column per (field, month) pair.
    BY_MONTH_BOND_TABLE = train_bond_temple.groupby(['CUST_NO', 'DATA_DAT'])[['BOND_TERM_CD', 'ARG_CUR_BAL', 'HOD_SHR',
        'UNIT_NET_VAL', 'NVTA_MOTH_ACCM']].sum().unstack()
    # std/mean/max/min across the monthly columns of one field.
    def BY_MONTH_BOND(key):
        train_bond_temple_STD = BY_MONTH_BOND_TABLE[key].std(axis=1).reset_index()
        train_bond_temple_STD.columns = ['CUST_NO', str('BOND_'+key+'_STD')]
        train_bond_temple_MEAN = BY_MONTH_BOND_TABLE[key].mean(axis=1).reset_index()
        train_bond_temple_MEAN.columns = ['CUST_NO',str('BOND_'+key+'_MEAN')]
        train_bond_temple_MAX = BY_MONTH_BOND_TABLE[key].max(axis=1).reset_index()
        train_bond_temple_MAX.columns = ['CUST_NO',str('BOND_'+key+'_MAX')]
        train_bond_temple_MIN = BY_MONTH_BOND_TABLE[key].min(axis=1).reset_index()
        train_bond_temple_MIN.columns = ['CUST_NO',str('BOND_'+key+'_MIN')]
        # Inner-join the four statistics on CUST_NO.
        train_bond_temple = pd.merge(train_bond_temple_STD, train_bond_temple_MEAN, on='CUST_NO', how='inner')
        train_bond_temple = pd.merge(train_bond_temple, train_bond_temple_MAX, on='CUST_NO', how='inner')
        train_bond_temple = pd.merge(train_bond_temple, train_bond_temple_MIN, on='CUST_NO', how='inner')
        return train_bond_temple  # NOTE: this local shadows the outer train_bond_temple
    train_bond_temple = BY_MONTH_BOND('BOND_TERM_CD')
    for key in ['ARG_CUR_BAL', 'HOD_SHR', 'UNIT_NET_VAL', 'NVTA_MOTH_ACCM']:
        train_bond_temple = pd.merge(train_bond_temple, BY_MONTH_BOND(key))
    train_bond_temple.drop_duplicates(subset=None, keep='first', inplace=True)
    return train_bond_temple

贵金属账户信息(PREC_METAL)- 处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def BY_MONTH_PREC_METAL_METHOD(dataset_PREC_METAL):
    """Per-customer month-over-month std/mean/max/min statistics of the
    precious-metal account fields.  ARG_BAL is first converted to RMB via
    the module-level ``train_base_excg`` exchange table."""
    # Drop code/date columns that are not aggregated.
    train_prec_metal_temple = dataset_PREC_METAL.drop(['PREC_METAL_BREED_CD', 'ARG_STS_CD', 'SIGD_DAT', 'SIGD_CHANL_CD', 'TEMN_DAT', 'TEMN_CHANL_CD', 'MTH_ACT_DAYS_TOT'], axis=1)
    # Attach exchange rates by currency.
    train_prec_metal_temple = pd.merge(train_prec_metal_temple, train_base_excg, on = 'CCY_CD')
    # Convert the balance to RMB with the mid price.
    train_prec_metal_temple['ARG_BAL'] = train_prec_metal_temple.apply(compute_REAL_MONEY, axis = 1, args = ('ARG_BAL', 'RMB_MID_PRIC'))
    # One row per customer, one column per (field, month) pair.
    BY_MONTH_PREC_METAL_TABLE = train_prec_metal_temple.groupby(['CUST_NO', 'DATA_DAT'])[['BUS_CTG_CD', 'CNT', 'ARG_BAL',
        'MTH_ARG_ACCM']].sum().unstack()
    # std/mean/max/min across the monthly columns of one field.
    def BY_MONTH_PREC_METAL(key):
        train_prec_metal_temple_STD = BY_MONTH_PREC_METAL_TABLE[key].std(axis=1).reset_index()
        train_prec_metal_temple_STD.columns = ['CUST_NO', str('PREC_METAL_'+key+'_STD')]
        train_prec_metal_temple_MEAN = BY_MONTH_PREC_METAL_TABLE[key].mean(axis=1).reset_index()
        train_prec_metal_temple_MEAN.columns = ['CUST_NO',str('PREC_METAL_'+key+'_MEAN')]
        train_prec_metal_temple_MAX = BY_MONTH_PREC_METAL_TABLE[key].max(axis=1).reset_index()
        train_prec_metal_temple_MAX.columns = ['CUST_NO',str('PREC_METAL_'+key+'_MAX')]
        train_prec_metal_temple_MIN = BY_MONTH_PREC_METAL_TABLE[key].min(axis=1).reset_index()
        train_prec_metal_temple_MIN.columns = ['CUST_NO',str('PREC_METAL_'+key+'_MIN')]
        # Inner-join the four statistics on CUST_NO.
        train_prec_metal_temple = pd.merge(train_prec_metal_temple_STD, train_prec_metal_temple_MEAN, on='CUST_NO', how='inner')
        train_prec_metal_temple = pd.merge(train_prec_metal_temple, train_prec_metal_temple_MAX, on='CUST_NO', how='inner')
        train_prec_metal_temple = pd.merge(train_prec_metal_temple, train_prec_metal_temple_MIN, on='CUST_NO', how='inner')
        return train_prec_metal_temple  # NOTE: this local shadows the outer name
    train_prec_metal_temple = BY_MONTH_PREC_METAL('BUS_CTG_CD')
    for key in ['CNT', 'ARG_BAL', 'MTH_ARG_ACCM']:
        train_prec_metal_temple = pd.merge(train_prec_metal_temple, BY_MONTH_PREC_METAL(key))
    train_prec_metal_temple.drop_duplicates(subset=None, keep='first', inplace=True)

    return train_prec_metal_temple

代理保险账户信息(AGET_INSR)- 处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def BY_MONTH_AGET_INSR_METHOD(dataset_AGET_INSR):
    """Per-customer month-over-month std/mean/max/min statistics of the
    agent-insurance account fields (premium, coverage, policy count, and
    the monthly premium accumulators)."""
    value_cols = ['PREM', 'CVAG', 'INSE_CNT', 'MTH_PREM_ACCM', 'BEG_MTH_PREM_BAL']
    # Keep only the identifying and numeric fields.
    slim = dataset_AGET_INSR[['DATA_DAT', 'CUST_NO'] + value_cols]
    # One row per customer, one column per (field, month) pair.
    monthly = slim.groupby(['CUST_NO', 'DATA_DAT'])[value_cols].sum().unstack()

    def month_stats(field):
        # std/mean/max/min across the monthly columns of a single field,
        # inner-joined into one frame keyed by CUST_NO.
        stats = None
        for suffix, reducer in (('_STD', 'std'), ('_MEAN', 'mean'), ('_MAX', 'max'), ('_MIN', 'min')):
            part = getattr(monthly[field], reducer)(axis=1).reset_index()
            part.columns = ['CUST_NO', 'AGET_INSR_' + field + suffix]
            stats = part if stats is None else pd.merge(stats, part, on='CUST_NO', how='inner')
        return stats

    merged = month_stats(value_cols[0])
    for field in value_cols[1:]:
        merged = pd.merge(merged, month_stats(field))
    merged.drop_duplicates(subset=None, keep='first', inplace=True)

    return merged

表间融合

TRAIN集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Assemble the TRAIN feature matrix (expected 38256 customers): outer-join every
# per-table feature frame on CUST_NO.
# NOTE(review): some *_METHOD helpers read or mutate module-level train_* frames
# as a side effect, so the call order below is significant — do not reorder.
dataset_train = pd.merge(IDV_CUST_BASIC_PARSE(CALC_AGE(train_idv_cust_basic)), IDV_TD_PARSE(train_idv_td), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, IDV_DPSA_PARSE(train_idv_dpsa), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, TR_DC_PARSE(train_tr_dc), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_IDV_TD_METHOD(train_idv_td), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_IDV_DPSA_METHOD(train_idv_dpsa), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_TR_DC_METHOD(train_tr_dc), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, THR_PTY_CSTD_PARSE(train_thr_pty_cstd), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_LOAN_METHOD(train_loan), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_FUND_METHOD(train_fund), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_BOND_METHOD(train_bond), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_PREC_METAL_METHOD(train_prec_metal), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, BY_MONTH_AGET_INSR_METHOD(train_aget_insr), on='CUST_NO', how='outer')
dataset_train = pd.merge(dataset_train, AUM_NUMS(train_idv_td, train_bond, train_fund, train_prec_metal, train_aget_insr, train_idv_cust_basic), on='CUST_NO', how='outer')
# WEALTH goes on the left so its columns come first in the assembled frame.
dataset_train = pd.merge(WEALTH(train_cust_result,train_idv_td,train_bond,train_fund,train_prec_metal,train_aget_insr,train_thr_pty_cstd), dataset_train, on='CUST_NO', how='outer')
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/root/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:558: UserWarning: merging between different levels can give an unintended result (1 levels on the left, 2 on the right)
  warnings.warn(msg, UserWarning)
/root/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:2530: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
1
2
# Join the assembled features with the customer labels.
# NOTE: default inner join — keeps only labelled customers.
dataset_train = pd.merge(dataset_train, train_cust_result, on='CUST_NO')

TRAIN 剪枝

1
2
# Drop the customer id — it is an identifier, not a feature.
dataset_train = dataset_train.drop(['CUST_NO'], axis=1)

A集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Assemble the A evaluation set (expected 4782 customers); mirrors the train
# assembly above, table for table.
# NOTE(review): some *_METHOD helpers read or mutate module-level frames, so the
# call order below is significant — do not reorder.
dataset_A = pd.merge(IDV_CUST_BASIC_PARSE(CALC_AGE(A_idv_cust_basic)), IDV_TD_PARSE(A_idv_td), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, IDV_DPSA_PARSE(A_idv_dpsa), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, TR_DC_PARSE(A_tr_dc), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_IDV_TD_METHOD(A_idv_td), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_IDV_DPSA_METHOD(A_idv_dpsa), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_TR_DC_METHOD(A_tr_dc), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, THR_PTY_CSTD_PARSE(A_thr_pty_cstd), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_LOAN_METHOD(A_loan), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_FUND_METHOD(A_fund), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_BOND_METHOD(A_bond), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_PREC_METAL_METHOD(A_prec_metal), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, BY_MONTH_AGET_INSR_METHOD(A_aget_insr), on='CUST_NO', how='outer')
dataset_A = pd.merge(dataset_A, AUM_NUMS(A_idv_td, A_bond, A_fund, A_prec_metal, A_aget_insr, A_idv_cust_basic), on='CUST_NO', how='outer')
dataset_A = pd.merge(WEALTH(A_customid,A_idv_td,A_bond,A_fund,A_prec_metal,A_aget_insr,A_thr_pty_cstd), dataset_A, on='CUST_NO', how='outer')
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/root/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:558: UserWarning: merging between different levels can give an unintended result (1 levels on the left, 2 on the right)
  warnings.warn(msg, UserWarning)
/root/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:2530: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)

A 剪枝

1
2
# Drop the customer id — it is an identifier, not a feature.
dataset_A = dataset_A.drop(['CUST_NO'], axis=1)

B集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Assemble the B evaluation set (the original comment said "A 集 4782" — a
# copy/paste leftover from the A-set cell).  Mirrors the train assembly above.
# NOTE(review): some *_METHOD helpers read or mutate module-level frames, so the
# call order below is significant — do not reorder.
dataset_B = pd.merge(IDV_CUST_BASIC_PARSE(CALC_AGE(B_idv_cust_basic)), IDV_TD_PARSE(B_idv_td), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, IDV_DPSA_PARSE(B_idv_dpsa), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, TR_DC_PARSE(B_tr_dc), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_IDV_TD_METHOD(B_idv_td), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_IDV_DPSA_METHOD(B_idv_dpsa), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_TR_DC_METHOD(B_tr_dc), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, THR_PTY_CSTD_PARSE(B_thr_pty_cstd), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_LOAN_METHOD(B_loan), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_FUND_METHOD(B_fund), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_BOND_METHOD(B_bond), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_PREC_METAL_METHOD(B_prec_metal), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, BY_MONTH_AGET_INSR_METHOD(B_aget_insr), on='CUST_NO', how='outer')
dataset_B = pd.merge(dataset_B, AUM_NUMS(B_idv_td, B_bond, B_fund, B_prec_metal, B_aget_insr, B_idv_cust_basic), on='CUST_NO', how='outer')
dataset_B = pd.merge(WEALTH(B_customid,B_idv_td,B_bond,B_fund,B_prec_metal,B_aget_insr,B_thr_pty_cstd), dataset_B, on='CUST_NO', how='outer')
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/root/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:558: UserWarning: merging between different levels can give an unintended result (1 levels on the left, 2 on the right)
  warnings.warn(msg, UserWarning)
/root/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:2530: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)

B集剪枝

1
2
# Drop the customer id — it is an identifier, not a feature.
dataset_B = dataset_B.drop(['CUST_NO'], axis=1)

欠采样

1
# Undersample the majority class: keep a random 20% of the negatives, keep every
# positive, then shuffle the combined frame (sample(frac=1.0)).
negatives = dataset_train[dataset_train['FLAG'] == 0].sample(frac=0.20)
positives = dataset_train[dataset_train['FLAG'] == 1]
train_SAMPLE = pd.concat([negatives, positives], axis=0).sample(frac=1.0)

算法分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import numpy as np  
import pandas as pd
import xgboost as xgb
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Build features/labels from the undersampled frame; NaNs become 0.
#X = dataset_train.drop(['FLAG'],axis=1).fillna(0)
#y = dataset_train['FLAG']
X = train_SAMPLE.drop(['FLAG'],axis=1).fillna(0)
y = train_SAMPLE['FLAG']
# X = dataset_train_SAMPLE.drop(['FLAG'],axis=1)
# y = dataset_train_SAMPLE['FLAG']

# Train/validation split.
# NOTE(review): test_size=0.01 leaves only ~1% for validation — confirm intended
# (the commented 0.125 was the earlier setting).
X, val_X, y, val_y = train_test_split(
X,
y,
test_size=0.01,
#test_size=0.125,
random_state=2020,
)

# SMOTE oversampling — currently disabled.
# smo = SMOTE(random_state=2019)
# X_smo, y_smo = smo.fit_sample(X, y)
# X_smo = pd.DataFrame(X_smo)
# X_smo.columns=['XXX']
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import numpy as np  
import pandas as pd
import xgboost as xgb
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Build features/labels from the FULL training frame (no pre-undersampling).
X = dataset_train.drop(['FLAG'],axis=1).fillna(0)
y = dataset_train['FLAG']

# Hold out 12.5% for validation first, so the validation set keeps the natural
# class balance; then undersample ONLY the training part below.
X, val_X, y, val_y = train_test_split(
X,
y,
test_size=0.125,
random_state=2019,
)
# Undersample the training part: 20% of negatives, all positives, reshuffle.
train_REAL = pd.concat([X,y],axis=1)
train_REAL = pd.concat([train_REAL[train_REAL['FLAG']==0].sample(frac=0.20),train_REAL[train_REAL['FLAG']==1]],axis=0).sample(frac=1.0)
X = train_REAL.drop(['FLAG'],axis=1)
y = train_REAL['FLAG']

XGB算法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Wrap the train/validation splits as XGBoost DMatrix objects.
xgb_val = xgb.DMatrix(val_X, label=val_y)
# xgb_train = xgb.DMatrix(X_smo, label=y_smo)
xgb_train = xgb.DMatrix(X, label=y)

# XGBoost hyper-parameters.
params = {
'booster': 'gbtree',
# 'objective': 'multi:softmax', # multiclass labels
# 'objective': 'multi:softprob', # multiclass probabilities
'objective': 'binary:logistic',
'eval_metric': 'logloss',
# 'num_class': 9, # number of classes; used together with multi:softmax
'gamma': 0.1, # min loss reduction to make a split; larger = more conservative (typ. 0.1-0.2)
'max_depth': 5, # tree depth; larger overfits more easily
'alpha': 0, # L1 regularisation coefficient
'lambda': 8, # L2 regularisation; larger = less prone to overfitting
'subsample': 1, # row subsampling ratio
'colsample_bytree': 0.6, # column subsampling per tree
'min_child_weight': 3,
# Minimum sum of instance hessians in a leaf (default 1).  For imbalanced 0-1
# classification with h around 0.01, a value of 1 corresponds to roughly 100
# samples per leaf; smaller values overfit more easily.
'silent': 0, # 1 = suppress run logs; keep 0 to see training output
'eta': 0.01 ,
#'eta': 0.1, # learning rate - result 0.554455
'seed': 1000,
'nthread': 24, # CPU threads
'missing': 1,
'scale_pos_weight': (np.sum(y==0)/np.sum(y==1)) # class-imbalance weight: sum(neg)/sum(pos)
# 'eval_metric': 'auc'
}

plst = list(params.items())
num_rounds = 30000 # max boosting rounds
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# Cross-validation.
# NOTE(review): early_stopping_rounds=200 equals num_boost_round=200, so early
# stopping can never actually trigger here — confirm intended.
result = xgb.cv(plst, xgb_train, num_boost_round=200, nfold=4, early_stopping_rounds=200, verbose_eval=True, folds=StratifiedKFold(n_splits=4).split(X, y))

# Train; stops when the val metric has not improved for 400 consecutive rounds.
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=400)
1
2
# Persist the trained booster for later reuse.
model.save_model('./leezy_xgb.model') # stores the trained model on disk
1
2
3
4
5
6
7
8
9
10
11
12
13
# Reload a previously saved booster instead of retraining, if desired:
# model = xgb.Booster(model_file='5903-xgb.model')
# Local hold-out validation.
xgb_test = xgb.DMatrix(val_X)
preds = model.predict(xgb_test)
# Binarise the predicted probabilities at a fixed cut-off and score the result.
threshold = 0.70
ans = [1 if score > threshold else 0 for score in preds]
pred_result= pd.Series(ans, dtype='int32')
evaluate(val_y, pred_result)
test_pct_1: 0.1581
pred_pct_1: 0.1836
Precesion: 0.5581
Recall: 0.6481
F1-score: 0.5998
confusion matrix:
[[3638  388]
 [ 266  490]]

{'Confusuion': array([[3638,  388],
        [ 266,  490]]),
 'F1': 0.5997552019583843,
 'Precision': 0.5580865603644647,
 'acc': 0.863237139272271,
 'pred_pct_1': 0.1836051861145964,
 'recall': 0.6481481481481481,
 'roc_auc': 0.7758872881823701,
 'test_pct_1': 0.15809284818067754}
1
2
3
4
5
6
7
8
9
10
11
# Predict on the A evaluation set.
xgb_A = xgb.DMatrix(dataset_A)
A_preds = model.predict(xgb_A)

# Binarise the probabilities at the A-set cut-off.
threshold = 0.40
ans = [1 if score > threshold else 0 for score in A_preds]
pred_A_result= pd.Series(ans, dtype='int32')
1
2
3
4
5
6
7
8
9
10
11
# Predict on the B evaluation set.
xgb_B = xgb.DMatrix(dataset_B)
B_preds = model.predict(xgb_B)

# Binarise the probabilities at the B-set cut-off.
threshold = 0.68
ans = [1 if score > threshold else 0 for score in B_preds]
pred_B_result= pd.Series(ans, dtype='int32')
1
2
3
4
import operator
# Split-count feature importances from the booster, sorted ascending by count.
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
# print(importance)
1
2
3
4
import matplotlib.pyplot as plt
# Plot the top-100 features by importance on a large canvas.
plt.rcParams["figure.figsize"]=(30,40)
xgb.plot_importance(model, max_num_features=100)
plt.show()
/root/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:1320: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

IMPORTANCES

1
2
# Show the top-30 features by importance (project helper defined elsewhere).
feature_rank(model, 30)

提交结果

1
pred_B_result.value_counts()
0    3901
1     881
dtype: int64
1
2
# Attach the B-set predictions to the customer ids for submission.
# NOTE(review): positional concat — assumes B_customid and pred_B_result share
# the same 0..n-1 row order; verify before submitting.
result = pd.concat([B_customid, pred_B_result],axis=1)
1
result.to_csv('111.csv',header=0,index=0)
1
init_woody
The prv extension is already loaded. To reload it, use:
  %reload_ext prv
Matplotlib env init complete.
Warnings off.
1
predict 2 000.csv
'提交次数已用完,请明日再试!'

深化特征工程

各类理财产品的持有数量

1
2
3
4
5
6
# Reload the raw train tables used for the product-count features below.
train_idv_td = pd.read_csv("../data/2/train/IDV_TD.csv", encoding='utf-8') # personal term deposits (IDV_TD)
train_bond = pd.read_csv("../data/2/train/BOND.csv", encoding='utf-8') # treasury bonds (BOND)
train_fund = pd.read_csv("../data/2/train/FUND.csv", encoding='utf-8') # funds (FUND)
train_prec_metal = pd.read_csv("../data/2/train/PREC_METAL.csv", encoding='utf-8') # precious metals (PREC_METAL)
train_aget_insr = pd.read_csv("../data/2/train/AGET_INSR.csv", encoding='utf-8') # agency insurance (AGET_INSR)
train_idv_cust_basic = pd.read_csv("../data/2/train/IDV_CUST_BASIC.csv", encoding='utf-8') # customer basics (IDV_CUST_BASIC)
/root/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2728: DtypeWarning: Columns (12) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
1
2
3
4
5
# Term-deposit count per customer at the last snapshot month (DATA_DAT == 3734035200).
train_idv_td_3 = train_idv_td[train_idv_td['DATA_DAT']==3734035200]
train_idv_td_AUM = train_idv_td_3[['CUST_NO', 'ARG_CRT_DAT']]
train_idv_td_num = train_idv_td_AUM.groupby('CUST_NO').count().reset_index()
train_idv_td_num.rename(columns={'ARG_CRT_DAT':'IDV_TD_NUM'}, inplace=True)
1
2
3
4
5
# Treasury-bond count per customer at the last snapshot month.
train_bond_3 = train_bond[train_bond['DATA_DAT']==3734035200]
train_bond_AUM = train_bond_3[['CUST_NO', 'ARG_CRT_DAT']]
train_bond_num = train_bond_AUM.groupby('CUST_NO').count().reset_index()
train_bond_num.rename(columns={'ARG_CRT_DAT':'BOND_NUM'}, inplace=True)
1
2
3
4
5
# Fund count per customer at the last snapshot month.
train_fund_3 = train_fund[train_fund['DATA_DAT']==3734035200]
# BUG FIX: count from the filtered snapshot (train_fund_3), not the whole
# train_fund table — the original counted funds across every month.
train_fund_AUM = train_fund_3[['CUST_NO', 'ARG_CRT_DAT']]
train_fund_num = train_fund_AUM.groupby('CUST_NO').count().reset_index()
train_fund_num.rename(columns={'ARG_CRT_DAT':'FUND_NUM'}, inplace=True)
1
2
3
4
5
6
# Precious-metal agreement count per customer at the last snapshot month.
# .copy() gives dropna an owned frame, fixing the SettingWithCopyWarning the
# original raised when dropping rows on a slice.
train_prec_metal_3 = train_prec_metal[train_prec_metal['DATA_DAT']==3734035200].copy()
train_prec_metal_3.dropna(subset=['SIGD_DAT'], inplace=True)
train_prec_metal_AUM = train_prec_metal_3[['CUST_NO', 'SIGD_DAT']]
train_prec_metal_num = train_prec_metal_AUM.groupby('CUST_NO').count().reset_index()
train_prec_metal_num.rename(columns={'SIGD_DAT':'PREC_METAL_NUM'}, inplace=True)
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
1
2
3
4
5
6
# Insurance policy count per customer at the last snapshot month.
# .copy() gives dropna an owned frame, fixing the SettingWithCopyWarning the
# original raised when dropping rows on a slice.
train_aget_insr_3 = train_aget_insr[train_aget_insr['DATA_DAT']==3734035200].copy()
train_aget_insr_3.dropna(subset=['INSE_DAT'], inplace=True)
train_aget_insr_AUM = train_aget_insr_3[['CUST_NO', 'INSE_DAT']]
train_aget_insr_num = train_aget_insr_AUM.groupby('CUST_NO').count().reset_index()
train_aget_insr_num.rename(columns={'INSE_DAT':'AGET_INSR_NUM'}, inplace=True)
/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
1
2
# One-column DataFrame of all customer ids.
# reset_index(drop=True) replaces the original reset_index().drop('index', axis=1)
# round-trip; selecting with [['CUST_NO']] keeps it a DataFrame for the merges below.
CUST_NO_LSIT = train_idv_cust_basic[['CUST_NO']].reset_index(drop=True)
1
2
3
# Outer-join all per-product counts onto the customer list; customers without a
# given product get 0.
CUST_NO_AUM_LSIT = pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(CUST_NO_LSIT, train_idv_td_num, on='CUST_NO', how='outer'), train_bond_num, on='CUST_NO', how='outer'), train_fund_num,
on='CUST_NO', how='outer'), train_prec_metal_num, on='CUST_NO', how='outer'), train_aget_insr_num, on='CUST_NO', how='outer').fillna(0)
1
CUST_NO_AUM_LSIT['PRODUCTS_NUM'] = CUST_NO_AUM_LSIT[['IDV_TD_NUM','BOND_NUM','FUND_NUM','PREC_METAL_NUM','AGET_INSR_NUM']].sum(axis=1)
1
CUST_NO_AUM_LSIT[CUST_NO_AUM_LSIT['CUST_NO'] == 'a100d07faf0bc3c60d7d65abd704142a']
1
CUST_NO_AUM_LSIT.head(100)
1
MyAUMTest = pd.merge(CUST_NO_AUM_LSIT, train_cust_result, on='CUST_NO', how='outer')
1
MyAUMTest.head(10)
1
CUST_NO_AUM_LSIT.shape
(38256, 6)
1
2
# Correlation heatmap between the label (FLAG) and the product-count features.
g = sns.heatmap(MyAUMTest[['FLAG', 'IDV_TD_NUM','BOND_NUM','FUND_NUM','PREC_METAL_NUM','AGET_INSR_NUM','PRODUCTS_NUM']].corr(), annot=True, cmap="coolwarm")
/root/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:1320: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

HEATMAP

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Per-customer counts of each investment product type at the last snapshot month.
def AUM_NUMS(train_idv_td, train_bond, train_fund, train_prec_metal, train_aget_insr, train_idv_cust_basic):
    """Count how many products of each type every customer holds at the last
    snapshot month (DATA_DAT == 3734035200), plus their total.

    Returns
    -------
    DataFrame keyed by CUST_NO with columns IDV_TD_NUM, BOND_NUM, FUND_NUM,
    PREC_METAL_NUM, AGET_INSR_NUM and PRODUCTS_NUM.  Customers without a given
    product get 0.
    """
    snapshot = 3734035200  # DATA_DAT of the last month in the observation window

    def _count_products(table, date_col, out_name, drop_missing=False):
        # Restrict to the snapshot month, optionally drop rows with no date
        # (.copy() avoids SettingWithCopyWarning), then count rows per customer.
        month = table[table['DATA_DAT'] == snapshot].copy()
        if drop_missing:
            month.dropna(subset=[date_col], inplace=True)
        counts = month[['CUST_NO', date_col]].groupby('CUST_NO').count().reset_index()
        counts.rename(columns={date_col: out_name}, inplace=True)
        return counts

    train_idv_td_num = _count_products(train_idv_td, 'ARG_CRT_DAT', 'IDV_TD_NUM')
    train_bond_num = _count_products(train_bond, 'ARG_CRT_DAT', 'BOND_NUM')
    # BUG FIX: the original counted funds over the WHOLE table instead of the
    # snapshot month; funds are now filtered like every other product type.
    train_fund_num = _count_products(train_fund, 'ARG_CRT_DAT', 'FUND_NUM')
    train_prec_metal_num = _count_products(train_prec_metal, 'SIGD_DAT', 'PREC_METAL_NUM', drop_missing=True)
    train_aget_insr_num = _count_products(train_aget_insr, 'INSE_DAT', 'AGET_INSR_NUM', drop_missing=True)

    # All known customers, one row each.
    CUST_NO_AUM_LSIT = train_idv_cust_basic[['CUST_NO']].reset_index(drop=True)
    # Outer-join every count frame; absent products become 0.
    for counts in (train_idv_td_num, train_bond_num, train_fund_num,
                   train_prec_metal_num, train_aget_insr_num):
        CUST_NO_AUM_LSIT = pd.merge(CUST_NO_AUM_LSIT, counts, on='CUST_NO', how='outer')
    CUST_NO_AUM_LSIT = CUST_NO_AUM_LSIT.fillna(0)
    CUST_NO_AUM_LSIT['PRODUCTS_NUM'] = CUST_NO_AUM_LSIT[
        ['IDV_TD_NUM', 'BOND_NUM', 'FUND_NUM', 'PREC_METAL_NUM', 'AGET_INSR_NUM']].sum(axis=1)

    return CUST_NO_AUM_LSIT

三个月变动

定期账户
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def BY_MONTH_IDV_TD_METHOD(dataset_IDV_TD):
    """Per-customer std/mean/max/min of the monthly term-deposit amount sums.

    NOTE(review): reads the module-level FX table `train_base_excg` and the
    `compute_REAL_MONEY` helper defined elsewhere in this notebook; the same FX
    table is used for the train and A/B sets — confirm that is intended.
    """
    # Drop columns that are not used by the monthly aggregates.
    train_idv_td_temple = dataset_IDV_TD.drop(['ARG_CRT_DAT', 'CLS_ACCT_DAT', 'MATU_DAT', 'LAC', 'ACCT_STS_CD','DP_DAY_CD', 'RDEP_IND_CD', 'RDEP_DP_DAY_CD', 'RAT_CTG','FXDI_SA_ACCM', 'MTH_ACT_DAYS_TOT'], axis=1)
    # Join with the FX table so each amount can be converted to CNY.
    train_idv_td_temple = pd.merge(train_idv_td_temple, train_base_excg, on = 'CCY_CD')
    # Convert each amount column through the CNY mid price.
    train_idv_td_temple['CRBAL'] = train_idv_td_temple.apply(compute_REAL_MONEY, axis = 1, args = ('CRBAL', 'RMB_MID_PRIC'))
    train_idv_td_temple['REG_CAP'] = train_idv_td_temple.apply(compute_REAL_MONEY, axis = 1, args = ('REG_CAP', 'RMB_MID_PRIC'))
    train_idv_td_temple['FXDI_T_ACCM'] = train_idv_td_temple.apply(compute_REAL_MONEY, axis = 1, args = ('FXDI_T_ACCM', 'RMB_MID_PRIC'))
    train_idv_td_temple['TDOP_SHD_PAY_INTS'] = train_idv_td_temple.apply(compute_REAL_MONEY, axis = 1, args = ('TDOP_SHD_PAY_INTS', 'RMB_MID_PRIC'))
    train_idv_td_temple['MOTH_CR_ACCM'] = train_idv_td_temple.apply(compute_REAL_MONEY, axis = 1, args = ('MOTH_CR_ACCM', 'RMB_MID_PRIC'))
    train_idv_td_temple = train_idv_td_temple.drop(['CCY_CD', 'RMB_MID_PRIC'], axis=1)
    # Sum each amount per customer per month; months spread across columns.
    BY_MONTH_TD = train_idv_td_temple.groupby(['CUST_NO', 'DATA_DAT'])[['CRBAL', 'REG_CAP', 'FXDI_T_ACCM', 'TDOP_SHD_PAY_INTS', 'MOTH_CR_ACCM']].sum().unstack()
    # std/mean/max/min across the months for one amount column.
    def BY_MONTH_IDV_TD(key):
        train_idv_td_temple_STD = BY_MONTH_TD[key].std(axis=1).reset_index()
        train_idv_td_temple_STD.columns = ['CUST_NO', str('IDV_TD_'+key+'_STD')]
        train_idv_td_temple_MEAN = BY_MONTH_TD[key].mean(axis=1).reset_index()
        train_idv_td_temple_MEAN.columns = ['CUST_NO',str('IDV_TD_'+key+'_MEAN')]
        train_idv_td_temple_MAX = BY_MONTH_TD[key].max(axis=1).reset_index()
        train_idv_td_temple_MAX.columns = ['CUST_NO',str('IDV_TD_'+key+'_MAX')]
        train_idv_td_temple_MIN = BY_MONTH_TD[key].min(axis=1).reset_index()
        train_idv_td_temple_MIN.columns = ['CUST_NO',str('IDV_TD_'+key+'_MIN')]
        # Merge the four statistics on CUST_NO.
        train_idv_td_temple = pd.merge(train_idv_td_temple_STD, train_idv_td_temple_MEAN, on='CUST_NO', how='inner')
        train_idv_td_temple = pd.merge(train_idv_td_temple, train_idv_td_temple_MAX, on='CUST_NO', how='inner')
        train_idv_td_temple = pd.merge(train_idv_td_temple, train_idv_td_temple_MIN, on='CUST_NO', how='inner')
        train_idv_td_temple = train_idv_td_temple.fillna(0)
        return train_idv_td_temple
    train_idv_td_temple = BY_MONTH_IDV_TD('CRBAL')
    for key in ['REG_CAP', 'FXDI_T_ACCM','TDOP_SHD_PAY_INTS', 'MOTH_CR_ACCM']:
        train_idv_td_temple = pd.merge(train_idv_td_temple, BY_MONTH_IDV_TD(key))
    train_idv_td_temple.drop_duplicates(subset=None, keep='first', inplace=True)

    return train_idv_td_temple
活期账户
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def BY_MONTH_IDV_DPSA_METHOD(dataset_IDV_DPSA):
    """Per-customer std/mean/max/min of the monthly demand-deposit amount sums.

    NOTE(review): reads the module-level FX table `train_base_excg` and the
    `compute_REAL_MONEY` helper defined elsewhere in this notebook; the same FX
    table is used for the train and A/B sets — confirm that is intended.

    Returns
    -------
    DataFrame keyed by CUST_NO with IDV_DPSA_<col>_{STD,MEAN,MAX,MIN} columns
    (NaN statistics filled with 0).
    """
    # Join demand-deposit rows with the FX table and deduplicate.
    train_idv_dpsa_temple = pd.merge(dataset_IDV_DPSA, train_base_excg, on='CCY_CD', how='inner')
    train_idv_dpsa_temple.drop_duplicates(subset=None, keep='first', inplace=True)

    amount_cols = ['CRBAL', 'ITST_BRNG_ACCM', 'FRZ_TOT_AMT', 'DAY_WD_ACT_AMT',
                   'DAY_CSH_DP_SUM', 'DAY_CSH_WD_SUM', 'DAY_TFI_SUM',
                   'MOTH_CR_ACCM', 'BEG_MOTH_CRBAL']
    # Convert every amount column to CNY — one loop replaces the nine
    # copy-pasted .apply lines of the original.
    for col in amount_cols:
        train_idv_dpsa_temple[col] = train_idv_dpsa_temple.apply(
            compute_REAL_MONEY, axis=1, args=(col, 'RMB_MID_PRIC'))
    train_idv_dpsa_temple = train_idv_dpsa_temple.drop(['CCY_CD', 'RMB_MID_PRIC'], axis=1)

    # Sum each amount per customer per month; months spread across columns.
    BY_MONTH_DPSA = train_idv_dpsa_temple.groupby(['CUST_NO', 'DATA_DAT'])[amount_cols].sum().unstack()

    def _month_stats(key):
        # std/mean/max/min across the months for one amount column.
        monthly = BY_MONTH_DPSA[key]
        stats = pd.DataFrame({
            'IDV_DPSA_' + key + '_STD': monthly.std(axis=1),
            'IDV_DPSA_' + key + '_MEAN': monthly.mean(axis=1),
            'IDV_DPSA_' + key + '_MAX': monthly.max(axis=1),
            'IDV_DPSA_' + key + '_MIN': monthly.min(axis=1),
        }).fillna(0)
        return stats.reset_index()

    result = _month_stats(amount_cols[0])
    for key in amount_cols[1:]:
        result = pd.merge(result, _month_stats(key))
    result.drop_duplicates(subset=None, keep='first', inplace=True)

    return result
交易信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def BY_MONTH_TR_DC_METHOD(dataset_TR_DC):
    """Per-customer std/mean/max/min of the monthly transaction-amount sums.

    Returns
    -------
    DataFrame keyed by CUST_NO with TR_DC_TR_AMT_{STD,MEAN,MAX,MIN} columns
    (NaN statistics filled with 0).
    """
    # Drop reversed transactions (CAN_IND == 1) and columns that are constant /
    # meaningless in this data.
    dataset_TR_DC = dataset_TR_DC[dataset_TR_DC['CAN_IND'] == 0].drop(
        ['RED_BLU_CD', 'CRD_TYP1', 'TR_TYPE', 'TR_CHANL_CD', 'CAN_IND',
         'CARD_USETYPE', 'CARD_ELECASH', 'CARD_MATERIAL'], axis=1)
    dataset_TR_DC.drop_duplicates(subset=None, keep='first', inplace=True)
    # Bucket each transaction date into its month:
    # Feb (3726432000, 3728937600], Mar (.., 3731616000], Apr (.., 3734035200]
    dataset_TR_DC['TR_DAT'] = pd.cut(
        dataset_TR_DC['TR_DAT'],
        [3726432000, 3728937600, 3731616000, 3734035200],
        labels=['2月份', '3月份', '4月份'])

    # BUG FIX: the original computed the income/outcome split on the module-level
    # `train_tr_dc` frame (mutating it) instead of the function argument, so the
    # A/B sets were split against the TRAIN transactions.  It is computed on the
    # argument here.  NOTE(review): the result is still unused, as in the
    # original — wire TR_DC_IN_OUT into the return value if the sums are wanted.
    flows = dataset_TR_DC.copy()
    flows['INCOME'] = flows['TR_AMT'] > 0
    TR_DC_IN_OUT = flows.groupby(['CUST_NO', 'INCOME'])['TR_AMT'].sum().unstack().fillna(0).reset_index()

    # Sum the amounts per customer per month; months spread across columns.
    BY_MONTH_DC = dataset_TR_DC.groupby(['CUST_NO', 'TR_DAT'])[['TR_AMT']].sum().unstack()

    def _month_stats(key):
        # std/mean/max/min of the monthly sums for one amount column.
        monthly = BY_MONTH_DC[key]
        stats = pd.DataFrame({
            'TR_DC_' + key + '_STD': monthly.std(axis=1),
            'TR_DC_' + key + '_MEAN': monthly.mean(axis=1),
            'TR_DC_' + key + '_MAX': monthly.max(axis=1),
            'TR_DC_' + key + '_MIN': monthly.min(axis=1),
        }).fillna(0)
        return stats.reset_index()

    return _month_stats('TR_AMT')
1
# Bucket each transaction date into its month for plotting.
train_tr_dc['TR_DAT'] = pd.cut(train_tr_dc['TR_DAT'], [3726432000, 3728937600, 3731616000, 3734035200], labels=['FEB', 'MAR', 'APR'])
# BUG FIX: the original assigned this frame to `BY_MONTH_TR_DC_METHOD`,
# clobbering the feature function of the same name defined above.
tr_dat_by_month = train_tr_dc[['TR_DAT']]
sns.countplot(x='TR_DAT', data=tr_dat_by_month)
<matplotlib.axes._subplots.AxesSubplot at 0x7f0167e23a58>

COUNTPLOT

年龄

1
2
3
4
5
6
7
8
9
10
11
12
# Convert an (obfuscated) ms-epoch-like date — single value or Series — into the
# number of years it precedes 2015: the year encoded in the value is subtracted
# from 2015.
def adj_date(series, datatype='series'):
    import time
    from datetime import datetime
    initial_year = 2015  # reference year the data snapshot corresponds to
    # Offset that maps the dataset's shifted timestamps back onto real time.
    # NOTE(review): derived from the magic anchor "2042-08-31 16:00:00.123" —
    # confirm this matches how the dates were obfuscated.
    adj_msj_obj = datetime.strptime("2042-08-31 16:00:00.123", "%Y-%m-%d %H:%M:%S.%f")
    adj_ms = int(time.mktime(adj_msj_obj.timetuple())*1000 + adj_msj_obj.microsecond/1000.0)/1000
    current_ms = train_idv_cust_basic['DATA_DAT'][0] # 3736713600; read from the module-level frame
    if(datatype=='single'):
        # NOTE(review): "%Y%m" yields e.g. 201504, so this branch returns a large
        # negative number — looks buggy, but it appears unused in this notebook.
        return initial_year - int(time.strftime("%Y%m", time.localtime(current_ms-adj_ms)))
    else:
        # Element-wise: 2015 minus the de-obfuscated year; NaNs pass through unchanged.
        return series.map(lambda x: initial_year - int(time.strftime("%Y", time.localtime(x - adj_ms))) if pd.notna(x) else x)
1
2
3
4
5
6
7
# 年龄计算函数
def CALC_AGE(dataset_idv_cust_basic):
    """Derive an AGE column from GC_BRTH (shifted-epoch birth timestamp).

    Missing birth dates are filled with the column mean, converted to ages
    via adj_date, and implausible results (over 100 or under 16) are
    replaced by the fallback value 31. Mutates and returns the frame.
    """
    births = dataset_idv_cust_basic['GC_BRTH']
    dataset_idv_cust_basic['GC_BRTH'] = births.fillna(births.mean())
    ages = adj_date(dataset_idv_cust_basic['GC_BRTH'])
    # Single mask replaces the original's two sequential np.where clamps:
    # any age outside [16, 100] becomes 31, which itself lies inside the
    # accepted range, so the combined form is exactly equivalent.
    dataset_idv_cust_basic['AGE'] = np.where((ages > 100) | (ages < 16), 31, ages)
    return dataset_idv_cust_basic
1
# Extract the cleaned AGE series (note: CALC_AGE also mutates train_idv_cust_basic in place).
AGE = CALC_AGE(train_idv_cust_basic)['AGE']
1
# Quick sanity check of the derived age distribution.
AGE.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f01681b5cc0>

/root/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:1320: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

HIST

所有资产

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# td定期、bond国债、fund基金、prec_metal贵金属、aget_insr保险
# td: time deposits, bond: treasury bonds, fund: funds,
# prec_metal: precious metals, aget_insr: agency insurance,
# thr_pty_cstd: third-party custody.
def WEALTH(target,td,bond,fund,prec_metal,agent_insurance,thr_pty_cstd):
'''
Aggregate a customer's wealth across six product tables into
per-snapshot balance/accumulation features.

Each table is summed per (CUST_NO, DATA_DAT) and unstacked so the
snapshot dates become a column level; all tables are then outer-merged
onto target's customer list. The three monthly totals are reduced to
WEALTH_ACCM_MEAN and WEALTH_ACCM_CV (std/mean), returned per CUST_NO.

NOTE(review): the hard-coded column keys 3728764800 / 3731443200 /
3734035200 below are presumably the three monthly DATA_DAT snapshot
values — this raises KeyError if the snapshots differ; confirm against
the data files. An earlier variant also filtered each table by
DATA_DAT relative to a time_point before aggregating.
'''

# Per-table aggregation: sum balance + monthly-accumulation columns per
# (customer, snapshot date), then pivot the dates into the column index.
td = td[['CUST_NO','DATA_DAT','CRBAL','MOTH_CR_ACCM']].groupby(['CUST_NO','DATA_DAT']).sum().unstack().reset_index()

bond = bond[['CUST_NO','DATA_DAT','ARG_CUR_BAL','NVTA_MOTH_ACCM']].groupby(['CUST_NO','DATA_DAT']).sum().unstack().reset_index()

fund = fund[['CUST_NO','DATA_DAT','FUND_BAL','FUND_BAL_MOTH_BAL_ACCM']].groupby(['CUST_NO','DATA_DAT']).sum().unstack().reset_index()

prec_metal = prec_metal[['CUST_NO','DATA_DAT','ARG_BAL','MTH_ARG_ACCM']].groupby(['CUST_NO','DATA_DAT']).sum().unstack().reset_index()

agent_insurance = agent_insurance[['CUST_NO','DATA_DAT','BEG_MTH_PREM_BAL','MTH_PREM_ACCM']].groupby(['CUST_NO','DATA_DAT']).sum().unstack().reset_index()

thr_pty_cstd = thr_pty_cstd[['CUST_NO','DATA_DAT','AVL_BAL','MTH_ARG_BAL_ACCM']].groupby(['CUST_NO','DATA_DAT']).sum().unstack().reset_index()

# Outer-merge every product table onto the target customer list;
# customers absent from a table get 0 for that table's columns.
wealth = pd.merge(target[['CUST_NO']],td,on=['CUST_NO'],how='outer')
wealth = pd.merge(wealth,bond,on=['CUST_NO'],how='outer')
wealth = pd.merge(wealth,fund,on=['CUST_NO'],how='outer')
wealth = pd.merge(wealth,prec_metal,on=['CUST_NO'],how='outer')
wealth = pd.merge(wealth,thr_pty_cstd,on=['CUST_NO'],how='outer')
wealth = pd.merge(wealth,agent_insurance,on=['CUST_NO'],how='outer').fillna(0)

# Earlier, simpler variant summing balances without the per-month split
# (kept for reference):
#wealth['WEALTH_BAL'] = wealth['CRBAL'] + wealth['ARG_CUR_BAL'] + wealth['FUND_BAL'] + wealth['ARG_BAL'] + wealth['BEG_MTH_PREM_BAL'] #+ wealth['AVL_BAL']
#wealth['WEALTH_ACCM'] = wealth['MOTH_CR_ACCM'] + wealth['NVTA_MOTH_ACCM'] + wealth['FUND_BAL_MOTH_BAL_ACCM'] + wealth['MTH_ARG_ACCM'] + wealth['MTH_PREM_ACCM'] #+ wealth['MTH_ARG_BAL_ACCM']


#wealth = wealth.groupby(['CUST_NO'])['WEALTH_BAL','WEALTH_ACCM'].sum().reset_index()

# TODO: count of distinct wealth products held (different terms/contract
# dates of the same product type count as different products).

# TODO: date of the first and of the most recent wealth contract.

# TODO: number of wealth-product purchases within the 3-month window.

# Per-month totals across products. Columns are MultiIndex tuples of
# (measure, DATA_DAT); the three DATA_DAT keys are hard-coded — see the
# NOTE in the docstring.
X = wealth.copy()
X['WEALTH_BAL_1'] = X[('CRBAL', 3728764800)]+X[('ARG_CUR_BAL', 3728764800)]+X[('FUND_BAL', 3728764800)]+X[('ARG_BAL', 3728764800)]+X[('AVL_BAL', 3728764800)]+X[('BEG_MTH_PREM_BAL', 3728764800)]
X['WEALTH_BAL_2'] = X[('CRBAL', 3731443200)]+X[('ARG_CUR_BAL', 3731443200)]+X[('FUND_BAL', 3731443200)]+X[('ARG_BAL', 3731443200)]+X[('AVL_BAL', 3731443200)]+X[('BEG_MTH_PREM_BAL', 3731443200)]
X['WEALTH_BAL_3'] = X[('CRBAL', 3734035200)]+X[('ARG_CUR_BAL', 3734035200)]+X[('FUND_BAL', 3734035200)]+X[('ARG_BAL', 3734035200)]+X[('AVL_BAL', 3734035200)]+X[('BEG_MTH_PREM_BAL', 3734035200)]
X['WEALTH_ACCM_1'] = X[('MOTH_CR_ACCM', 3728764800)]+X[('NVTA_MOTH_ACCM', 3728764800)]+X[('FUND_BAL_MOTH_BAL_ACCM', 3728764800)]+X[('MTH_ARG_ACCM', 3728764800)]+X[('MTH_ARG_BAL_ACCM', 3728764800)]+X[('MTH_PREM_ACCM', 3728764800)]
X['WEALTH_ACCM_2'] = X[('MOTH_CR_ACCM', 3731443200)]+X[('NVTA_MOTH_ACCM', 3731443200)]+X[('FUND_BAL_MOTH_BAL_ACCM', 3731443200)]+X[('MTH_ARG_ACCM', 3731443200)]+X[('MTH_ARG_BAL_ACCM', 3731443200)]+X[('MTH_PREM_ACCM', 3731443200)]
X['WEALTH_ACCM_3'] = X[('MOTH_CR_ACCM', 3734035200)]+X[('NVTA_MOTH_ACCM', 3734035200)]+X[('FUND_BAL_MOTH_BAL_ACCM', 3734035200)]+X[('MTH_ARG_ACCM', 3734035200)]+X[('MTH_ARG_BAL_ACCM', 3734035200)]+X[('MTH_PREM_ACCM', 3734035200)]

X2 = X[['CUST_NO','WEALTH_BAL_1','WEALTH_BAL_2','WEALTH_BAL_3','WEALTH_ACCM_1','WEALTH_ACCM_2','WEALTH_ACCM_3']].set_index('CUST_NO')

# Row-wise summary statistics over the three months; std of a single
# non-NaN value is NaN, hence the fillna(0.0) before computing CV.
X2['WEALTH_BAL_MAX'] = X2[['WEALTH_BAL_1','WEALTH_BAL_2','WEALTH_BAL_3']].max(axis=1)
X2['WEALTH_BAL_MEAN'] = X2[['WEALTH_BAL_1','WEALTH_BAL_2','WEALTH_BAL_3']].mean(axis=1)
X2['WEALTH_BAL_STD'] = X2[['WEALTH_BAL_1','WEALTH_BAL_2','WEALTH_BAL_3']].std(axis=1).fillna(0.0)
X2['WEALTH_BAL_CV'] = X2['WEALTH_BAL_STD']/X2['WEALTH_BAL_MEAN']
X2['WEALTH_ACCM_MAX'] = X2[['WEALTH_ACCM_1','WEALTH_ACCM_2','WEALTH_ACCM_3']].max(axis=1)
X2['WEALTH_ACCM_MEAN'] = X2[['WEALTH_ACCM_1','WEALTH_ACCM_2','WEALTH_ACCM_3']].mean(axis=1)
X2['WEALTH_ACCM_STD'] = X2[['WEALTH_ACCM_1','WEALTH_ACCM_2','WEALTH_ACCM_3']].std(axis=1).fillna(0.0)
X2['WEALTH_ACCM_CV'] = X2['WEALTH_ACCM_STD']/X2['WEALTH_ACCM_MEAN']

# Drop intermediates (CV can be inf/NaN after division by a zero mean;
# the fillna(0.0) here neutralises the NaNs).
X3 = X2.fillna(0.0).drop(['WEALTH_BAL_1','WEALTH_BAL_2','WEALTH_BAL_3','WEALTH_ACCM_1','WEALTH_ACCM_2','WEALTH_ACCM_3','WEALTH_BAL_STD','WEALTH_ACCM_STD'],axis=1).reset_index()

# Only the accumulation-based features are kept as model inputs.
return X3[['CUST_NO','WEALTH_ACCM_MEAN','WEALTH_ACCM_CV']]

结果挖掘

1
2
# Partition the labelled customers by their FLAG value (1 = positive class).
flag_series = train_cust_result['FLAG']
train_cust_result_1 = train_cust_result.loc[flag_series == 1]
train_cust_result_0 = train_cust_result.loc[flag_series == 0]
1
from lightgbm import LGBMRegressor
1
! pip install lightgbm-2.2.3-py2.py3-none-manylinux1_x86_64.whl
Processing ./lightgbm-2.2.3-py2.py3-none-manylinux1_x86_64.whl
Requirement already satisfied: scikit-learn in /root/anaconda3/lib/python3.6/site-packages (from lightgbm==2.2.3)
Requirement already satisfied: scipy in /root/anaconda3/lib/python3.6/site-packages (from lightgbm==2.2.3)
Requirement already satisfied: numpy in /root/anaconda3/lib/python3.6/site-packages (from lightgbm==2.2.3)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.3
1
! cp ./xgb_rfm-Copy1.ipynb ./xgb_rfm-Copy2.ipynb
1
! pip install imbalanced-learn
Requirement already satisfied: imbalanced-learn in /root/anaconda3/lib/python3.6/site-packages
Requirement already satisfied: scipy>=0.17 in /root/anaconda3/lib/python3.6/site-packages (from imbalanced-learn)
Requirement already satisfied: joblib>=0.11 in /root/anaconda3/lib/python3.6/site-packages (from imbalanced-learn)
Requirement already satisfied: scikit-learn>=0.21 in /root/anaconda3/lib/python3.6/site-packages (from imbalanced-learn)
Requirement already satisfied: numpy>=1.11 in /root/anaconda3/lib/python3.6/site-packages (from imbalanced-learn)
0%