Input: 训练集D={(x1, y1), (x2, y2), ..., (xm, ym)};
属性集A={a1, a2, ..., ad}.
Output: 以node为根节点的一个决策树
Process:
## 通过给定样本集D和属性集A构建决策树
TreeGenerate(D, A){
1: 生成结点node;
2: if D 中样本全属于同一类别C then
3: 将node标记为 C类 叶节点; return
4: end if
5: if A = ∅ OR D中样本在A上取值相同 then
6: 将node标记为叶节点,其类别标记为D中样本数最多的类; return
7: end if
8: 从 A 中选择最优划分属性 a*;
9: for a* 的每一值a[i] do
10: 为node生成一个分支; 令Dv表示D中在 a* 上取值为 a[i] 的样本子集;
11: if Dv is empty then
12: 将分支结点标记为叶节点,其类别为D中样本最多的类; return
13: else
14: 以 TreeGenerate(Dv, A\{a*}) 为分支结点;
15: end if
16: end for
使用Python实现的递归构建决策树如下:
def treeGrow(dataset, features):
    """Recursively build a decision tree from *dataset*.

    Each row of ``dataset`` is a sample whose last element is the class
    label; ``features`` is the list of remaining candidate features.
    Returns either a class label (leaf) or the subtree produced by the
    recursive call.

    Fixes vs. the original draft:
      * ``classList`` is now actually derived from ``dataset`` (it was
        referenced but never defined);
      * the ``dataSet``/``dataset`` spelling mismatch is resolved;
      * ``del bestFeature`` (which destroyed the variable right before
        it was used, raising NameError) is removed;
      * the two stopping conditions return the correct values: a pure
        dataset returns its single class, while "no features left"
        returns the majority class.
    """
    # Column of class labels (last element of every sample row).
    classList = [row[-1] for row in dataset]

    # Stop (1): all samples belong to the same class -> leaf with that class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # Stop (2): only the label column remains (no features left to split on)
    # -> leaf labelled with the majority class.
    if len(dataset[0]) == 1:
        return classifyMaxLabel(classList)

    # Pick the best splitting feature (helper defined elsewhere in the file).
    bestFeature = findBestSplit(dataset)

    # Partition on the chosen feature and recurse on the partitioned data.
    # NOTE(review): SplitDataset's exact contract is defined elsewhere —
    # presumably it removes the chosen feature column; confirm against
    # its definition.
    splitData = SplitDataset(dataset, bestFeature)
    tree = treeGrow(splitData, features)
    return tree
# 筛选出信息增益最大的特征,返回特征索引,该特征的信息增益
def Maxinformation_gain(data, labels):
    """Select the feature with the largest information gain.

    ``data`` rows end with the class label; ``labels`` lists the feature
    names (one per feature column). Returns ``(feature_index, gain)``
    for the feature whose split yields the highest information gain.
    """
    # Class labels sit in the last column of every sample.
    class_column = [row[-1] for row in data]
    base_entropy = entropy(class_column)

    # Information gain of each feature index: H(D) - H(D | feature).
    gains = {
        idx: base_entropy - ConditionalEntropy([row[idx] for row in data],
                                               class_column)
        for idx in range(len(labels))
    }

    # max over keys preserves insertion order on ties, matching the
    # original dict-items scan.
    best_idx = max(gains, key=gains.get)
    return best_idx, gains[best_idx]
def Maxinformation_gain_ratio(data,labels):
# 计算每个特征的信息增益
result = {}
data_labels = [y[-1] for y in data]
entropyD = entropy(data_labels)
# 计算每一个feature的信息增益
for f in range(len(labels)):
featureVal = [value[f] for value in data]
entropyF = ConditionalEntropy(featureVal,data_labels)
feature_gain = entropyD-entropyF
feature_data_en = FeatureDataEntropy(featureVal)
result[f] = feature_gain/feature_data_en