Implementing Loan Risk Approval with the Decision Tree ID3 Algorithm
import numpy as np
import pandas as pd
data = pd.read_csv('dataset/loans.csv').sample(40000)
display(data.sample(10))
display(data.shape)
# grade: loan grade
# sub_grade: loan sub-grade
# short_emp: employed for less than one year
# emp_length_num: years of employment
# home_ownership: housing status (own, mortgage, rent)
# dti: debt-to-income ratio
# purpose: purpose of the loan
# term: loan term
# last_delinq_none: whether the applicant has a delinquency record
# last_major_derog_none: whether the applicant has a record of payments 90+ days overdue
# revol_util: revolving credit utilization (share of the credit limit drawn)
# total_rec_late_fee: total late fees received
# safe_loans: whether the loan is safe
       grade sub_grade  short_emp  emp_length_num home_ownership    dti             purpose       term  last_delinq_none  last_major_derog_none  revol_util  total_rec_late_fee  safe_loans
19847      C        C3          0              11       MORTGAGE  20.18  debt_consolidation  36 months                 1                      1        80.2              0.0000           1
24186      D        D2          0               5           RENT  21.19  debt_consolidation  36 months                 0                      1        47.6              0.0000           1
40013      B        B5          0              11       MORTGAGE  30.90  debt_consolidation  36 months                 0                      1        90.1              0.0000          -1
5405       B        B1          1               1       MORTGAGE  22.17  debt_consolidation  36 months                 1                      1        40.1              0.0000          -1
20203      F        F4          0               3           RENT  21.94  debt_consolidation  60 months                 0                      1        34.5              0.0000           1
12408      B        B5          0               7           RENT   8.56  debt_consolidation  36 months                 1                      1        34.4              0.0000          -1
31935      B        B2          0               3           RENT   9.68  debt_consolidation  36 months                 1                      1        78.6              0.0000           1
4621       D        D1          0               3       MORTGAGE   3.78               other  36 months                 1                      1        70.4             14.9409          -1
30060      C        C3          0               6       MORTGAGE  14.60  debt_consolidation  60 months                 0                      1        39.0              0.0000           1
8089       D        D3          0               7            OWN  17.85  debt_consolidation  36 months                 1                      1        70.7              0.3500          -1
(40000, 13)
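The categorical columns (grade, sub_grade, home_ownership, purpose, term) must be turned into numeric codes before entropies can be computed over them. The next cell keeps one LabelEncoder per column inside a defaultdict; a minimal sketch of what that mapping does (toy values, not taken from the dataset):

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# One encoder per column, created lazily by the defaultdict on first access.
d = defaultdict(LabelEncoder)
toy = pd.DataFrame({'grade': ['A', 'C', 'B', 'A'],
                    'term': ['36 months', '60 months', '36 months', '36 months']})
print(toy.apply(lambda col: d[col.name].fit_transform(col)))
#    grade  term
# 0      0     0
# 1      2     1
# 2      1     0
# 3      0     0   (classes are coded in sorted order)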
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
# Map the non-numeric columns above to numeric codes.
d = defaultdict(LabelEncoder)
data = data.apply(lambda x: d[x.name].fit_transform(x))

X_train = data.iloc[:800, :-1]
y_train = data.iloc[:800, -1]
test_X = data.iloc[800:, :-1]
test_y = data.iloc[800:, -1]
display(X_train)

# Binarize the features.
# The raw feature values take too many distinct levels to split on directly,
# so each column is thresholded at its mean: above the mean -> 1, else 0.
for i in X_train.columns:
    mean = np.mean(X_train[i])
    for j in range(len(X_train[i])):
        X_train[i].values[j] = 1 if X_train[i].values[j] > mean else 0
for i in test_X.columns:
    mean = np.mean(test_X[i])
    for j in range(len(test_X[i])):
        test_X[i].values[j] = 1 if test_X[i].values[j] > mean else 0
display(X_train)
       grade  sub_grade  short_emp  emp_length_num  home_ownership   dti  purpose  term  last_delinq_none  last_major_derog_none  revol_util  total_rec_late_fee
4104       2         12          1               1               0  1764        2     0                 0                      1         320                   0
18915      0          3          0               2               3  1608        1     0                 0                      1         240                   0
44721      1          8          0               7               3  3240        1     0                 1                      1         561                   0
21933      0          2          0               3               0  2328        1     0                 1                      1         325                   0
20111      2         14          0               8               0  1048        2     0                 0                      1         280                   0
...      ...        ...        ...             ...             ...   ...      ...   ...               ...                    ...         ...                 ...
13839      3         16          0              11               0  2025        2     1                 1                      1         817                   0
15773      3         17          0              11               2  1483        2     0                 0                      1         958                   0
27211      2         14          0               6               3  1701        2     1                 1                      1         157                   0
17372      2         11          0              11               0   489        2     1                 0                      0         279                   0
5410       2         14          0              11               0  1785        1     1                 0                      0         755                   0
800 rows × 12 columns
       grade  sub_grade  short_emp  emp_length_num  home_ownership  dti  purpose  term  last_delinq_none  last_major_derog_none  revol_util  total_rec_late_fee
4104       1          1          1               0               0    1        0     0                 0                      1           0                   0
18915      0          0          0               0               1    1        0     0                 0                      1           0                   0
44721      0          0          0               1               1    1        0     0                 1                      1           0                   0
21933      0          0          0               0               0    1        0     0                 1                      1           0                   0
20111      1          1          0               1               0    0        0     0                 0                      1           0                   0
...      ...        ...        ...             ...             ...  ...      ...   ...               ...                    ...         ...                 ...
13839      1          1          0               1               0    1        0     1                 1                      1           1                   0
15773      1          1          0               1               1    0        0     0                 0                      1           1                   0
27211      1          1          0               0               1    1        0     1                 1                      1           0                   0
17372      1          0          0               1               0    0        0     1                 0                      0           0                   0
5410       1          1          0               1               0    1        0     1                 0                      0           1                   0
800 rows × 12 columns
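The element-wise loops above touch the 800 training rows one value at a time, which would be slow on the full 40,000. A vectorized equivalent of the same mean-thresholding, as a sketch:

# Vectorized equivalent of the binarization loops:
# compare every column against its own mean and cast the boolean mask to int.
X_train = (X_train > X_train.mean()).astype(int)
test_X = (test_X > test_X.mean()).astype(int)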
from collections import Counter
from tqdm import tqdm
class DecisionTreeID3:
    def __init__(self):
        pass

    def calc_entropy(self):
        '''Compute the entropy, the conditional entropy of each feature,
        and the resulting information gains.'''
        # Append the label as the last column so create_tree can reach it easily
        self.X = pd.concat([self.X, self.y], axis=1)
        # ************ entropy ************
        yes = np.asarray(self.X[self.X[self.X.columns[-1]] == 1])
        no = np.asarray(self.X[self.X[self.X.columns[-1]] == 0])
        P_yes = len(yes) / len(self.X)
        P_no = len(no) / len(self.X)
        self.HX = -P_yes * np.log2(P_yes) - P_no * np.log2(P_no)
        # display("entropy = " + str(self.HX))
        # ************ conditional entropy ************
        # Gda collects the information gain of every feature
        self.Gda = []
        # Iterate over every feature column, skipping the label column
        for i in self.X.columns[:-1]:
            # Conditional entropy of the current feature
            Hi = 0
            # Occurrence count of each value of the current feature,
            # used to weight the per-value entropies
            condProbCollections = Counter(self.X[i]).items()
            # A feature may take N distinct values; accumulate over all of them
            for value, count in condProbCollections:
                # All samples (all columns) where the feature takes this value
                samples_of_current = self.X[self.X[i] == value]
                # Number of samples with this feature value
                total = len(samples_of_current)
                # How many of them are labeled safe
                k_safe = len(samples_of_current[samples_of_current[samples_of_current.columns[-1]] == 1])
                # ... and how many unsafe
                k_unsafe = total - k_safe
                # Probability of safe / unsafe within this branch
                P_k_safe = k_safe / total
                P_k_unsafe = k_unsafe / total
                # Accumulate the weighted conditional entropy, guarding against log2(0)
                log_P_k_safe = 0 if P_k_safe == 0 else np.log2(P_k_safe)
                log_P_k_unsafe = 0 if P_k_unsafe == 0 else np.log2(P_k_unsafe)
                Hi += -(total / len(self.X)) * (P_k_safe * log_P_k_safe + P_k_unsafe * log_P_k_unsafe)
            # Store the information gain of this feature
            self.Gda.append({"value": self.HX - Hi, "feature": i})
        # ID3 expands the highest-gain feature first, so rank the gains in
        # descending order. (A full ID3 would recompute the gains on every
        # subset; this version ranks them once, globally.)
        self.Gda.sort(key=lambda g: g["value"], reverse=True)

    def create_tree(self, node=False):
        '''Recursively build the decision tree, stored as a nested dict (JSON-like).

        Parameters
        -----
        node: DataFrame holding the samples that reach the current node

        Return
        -----
        tree: the constructed tree as a nested dict
        '''
        # Recursion exit: no features left, emit a leaf carrying the label
        # of the first sample that reached this node
        if len(self.Gda) == 0:
            return node.iloc[:1, -1].values[0]
        # Take the (remaining) feature with the highest information gain
        feature = self.Gda[0]['feature']
        # Remove it so the recursion never reuses it
        del self.Gda[0]
        # Enumerate the values this feature takes (over the full training set,
        # not the node's subset -- a simplification compared with textbook ID3)
        condProbCollections = Counter(self.X[feature]).items()
        # The tree dict must be keyed by the feature name
        tree = {feature: {}}
        for value, counts in condProbCollections:
            tree[feature][value] = self.create_tree(self.X[self.X[feature] == value])
        # Keep a handle on the finished tree
        self.tree = tree
        return tree

    def fit(self, X, y):
        '''Train the tree.

        Parameters
        -----
        X: training data, shaped [n_samples, n_features]
        y: array-like labels, shaped [n_samples]
        '''
        self.X = X
        self.y = y
        self.calc_entropy()
        self.create_tree()

    def predict_item(self, x, node=False):
        '''Predict the label of a single sample by walking the tree.

        Parameters
        -----
        x: single-row DataFrame holding the sample
        node: current tree node (a nested dict, or a 0/1 leaf label)

        Return
        -----
        label: the predicted class
        '''
        # Leaves are stored as bare labels
        if node == 0 or node == 1:
            return node
        label = -1
        # The node's only key is the feature it splits on
        key = next(iter(node))
        # Follow the 0-branch or the 1-branch of the binarized feature
        if x[key].values[0] == 0:
            label = self.predict_item(x, node=node[key][0])
        else:
            label = self.predict_item(x, node=node[key][1])
        return label

    def predict(self, X):
        '''Predict labels for a batch of samples.

        Parameters:
        X: array-like (List or ndarray), shaped [n_samples, n_features]

        Returns:
        list of predicted labels
        '''
        result = []
        for i in range(len(X)):
            result.append(self.predict_item(X.iloc[i:i + 1, ], node=self.tree))
        return result
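calc_entropy implements the usual two-class entropy H = -p*log2(p) - (1-p)*log2(1-p), weights it per feature value to get the conditional entropy, and takes the difference as the gain. A quick sanity check of the entropy part (a standalone sketch):

def entropy(p_yes):
    # Two-class entropy in bits; 0*log2(0) is treated as 0.
    p_no = 1 - p_yes
    return -sum(p * np.log2(p) for p in (p_yes, p_no) if p > 0)

print(entropy(0.5))  # 1.0 bit  -- a 50/50 split is maximally uncertain
print(entropy(0.8))  # ~0.7219  -- an 80/20 split carries less uncertainty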
dt = DecisionTreeID3()
dt.fit(X_train, y_train)
result = dt.predict(test_X)
display(np.sum(result == test_y) / len(result))
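For a rough reference point, the same split can be fed to scikit-learn's tree (a sketch; note sklearn implements an optimized CART with an entropy criterion, not ID3 proper, so its tree and score will differ):

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, y_train)
print(clf.score(test_X, test_y))  # mean accuracy on the held-out rows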
# Scratch cell: how iter/next walk a list of dicts -- the same
# next(iter(node)) trick predict_item uses to grab a node's feature key.
a = [{"a": 1}, {"b": 2}, {"c": 3}]
c = iter(a)
display(next(c))
display(next(c))
display(next(c))
{'a': 1}
{'b': 2}
{'c': 3}
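That next(iter(...)) pattern is exactly how predict_item pulls the splitting feature out of a node dict. A toy traversal over the same nested-dict format (hypothetical tree, for illustration only):

# A hypothetical two-level tree in the nested-dict format predict_item walks.
toy_tree = {'grade': {0: {'term': {0: 1, 1: 0}}, 1: 0}}

def walk(tree, sample):
    if tree == 0 or tree == 1:          # leaves are bare labels
        return tree
    feature = next(iter(tree))          # the node's splitting feature
    return walk(tree[feature][sample[feature]], sample)

print(walk(toy_tree, {'grade': 0, 'term': 1}))  # -> 0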
