机器学习——决策树

发布时间:2019-03-15 23:48:28 编辑:auto 阅读(2151)

    决策树是一种用于分类和回归的非参数监督学习方法。目标是创建一个模型,通过从数据特征中推导出简单的决策规则来预测目标变量的值。

    导入类库

    1 import numpy as np
    2 import pandas as pd
    3 from sklearn.feature_extraction import DictVectorizer
    4 from sklearn.tree import DecisionTreeClassifier
    5 from sklearn.model_selection import train_test_split

    简单版

     def decide_play1():
         """Quick demo: fit a decision tree on ``dtree.csv`` and predict one row.

         All columns of the CSV (label included) are encoded together by a
         single ``DictVectorizer``. The encoded column at index 5 is then split
         off as the label and the remaining columns are used as features.

         Prints the feature matrix, the label vector and the prediction for the
         fourth training row. Returns nothing.
         """
         df = pd.read_csv('dtree.csv')
         # 'records' yields one {column: value} dict per row. The abbreviated
         # spelling 'record' is rejected by pandas >= 2.0.
         dict_train = df.to_dict(orient='records')

         dv = DictVectorizer(sparse=False)
         dv_train = dv.fit_transform(dict_train)

         # NOTE(review): the label position is hard-coded. DictVectorizer orders
         # columns by sorted feature name and expands every string-valued column
         # into one column per category, so which column ends up at index 5
         # depends on the CSV contents. If the label is a string column it is
         # expanded into several columns and only one of them is removed here --
         # confirm against the data (dv.feature_names_ lists the actual layout).
         y = dv_train[:, 5]
         x = np.delete(dv_train, 5, axis=1)
         print(x)
         print(y)

         dtc = DecisionTreeClassifier()
         # A 1-D label vector is the documented shape for a single-output fit.
         dtc.fit(x, y)
         # predict() expects a 2-D array, hence the wrapping of the single row.
         print(dtc.predict(np.array([x[3]])))

    正式版

     def decide_play():
         """Train a decision tree on ``dtree.csv`` and predict one hand-written sample.

         Features are the 'Outlook', 'Temperatur', 'Humidity' and 'Windy'
         columns; the label is the 'PlayGolf' column. Features and label are
         encoded by two separate ``DictVectorizer`` instances so the prediction
         can be decoded back into a ``{'PlayGolf...': value}`` dict.

         Prints the decoded prediction. Returns nothing.
         """
         # NOTE: scikit-learn's trees are an optimised CART implementation and
         # DecisionTreeClassifier splits on Gini impurity by default. Pass
         # criterion='entropy' if ID3-style information-gain splits are wanted.
         df = pd.read_csv('dtree.csv')

         # Convert rows to dicts. orient='records' produces one
         # {column: value, column: value} dict per row; the abbreviated
         # spelling 'record' is rejected by pandas >= 2.0.
         feature_columns = ['Outlook', 'Temperatur', 'Humidity', 'Windy']
         dict_train = df.loc[:, feature_columns].to_dict(orient='records')
         dict_target = df.loc[:, ['PlayGolf']].to_dict(orient='records')

         # Vectorise the training features.
         dv_train = DictVectorizer(sparse=False)
         x_train = dv_train.fit_transform(dict_train)

         # Vectorise the target with its own vectoriser so it can be inverted.
         dv_target = DictVectorizer(sparse=False)
         y_target = dv_target.fit_transform(dict_target)

         # Create and fit the model.
         d_tree = DecisionTreeClassifier()
         d_tree.fit(x_train, y_target)

         data_predict = {
             'Humidity': 85,
             'Outlook': 'sunny',
             'Temperatur': 85,
             'Windy': False,
         }

         # Encode the sample with the vectoriser fitted on the training data so
         # the column layout matches what the tree was trained on.
         x_data = dv_train.transform([data_predict])
         print(dv_target.inverse_transform(d_tree.predict(x_data)))


     if __name__ == '__main__':
         decide_play()

    泰坦尼克生存率决策

     1 import numpy as np
     2 import pandas as pd
     3 from sklearn.feature_extraction import DictVectorizer
     4 from sklearn.model_selection import train_test_split
     5 from sklearn.tree import DecisionTreeClassifier
     6 from sklearn.metrics import r2_score
     7 
     8 
     def titanic_tree():
         """Fit a decision tree on ``Titanic.csv`` and predict survival for one passenger.

         Uses the 'Pclass', 'Age' and 'Sex' columns as features and 'Survived'
         as the label. Missing ages are replaced by the mean age, the data is
         split 75/25 into train and test sets, and both features and label are
         encoded with ``DictVectorizer``.

         Prints the decoded prediction for a hand-written sample. Returns
         nothing.
         """
         # Load the data.
         df = pd.read_csv('Titanic.csv')

         # Select feature and target columns. The explicit copy makes the
         # imputation below operate on an independent frame.
         x = df.loc[:, ['Pclass', 'Age', 'Sex']].copy()
         y = df.loc[:, ['Survived']]

         # Fill missing ages with the mean. Assign the result back instead of
         # calling fillna(inplace=True) on the selected column: that chained
         # form warns in recent pandas and does not modify the frame under
         # copy-on-write.
         x['Age'] = x['Age'].fillna(x['Age'].mean())

         # Split into training and test sets.
         x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

         # 'records' yields one {column: value} dict per row; the abbreviated
         # spelling 'record' is rejected by pandas >= 2.0.
         dv_train = DictVectorizer(sparse=False)
         x_train = dv_train.fit_transform(x_train.to_dict(orient='records'))
         x_test = dv_train.transform(x_test.to_dict(orient='records'))

         dv_target = DictVectorizer(sparse=False)
         y_target = dv_target.fit_transform(y_train.to_dict(orient='records'))
         # Held-out split, encoded the same way; accuracy can be checked with
         # d_tree.score(x_test, y_test).
         y_test = dv_target.transform(y_test.to_dict(orient='records'))

         # Fit the tree on the vectorised target so that the prediction below is
         # decoded by the same vectoriser that encoded the training labels.
         d_tree = DecisionTreeClassifier()
         d_tree.fit(x_train, y_target)

         data_predict = {
             'Pclass': 1,
             'Age': 38,
             'Sex': 'female',
         }

         x_data = dv_train.transform([data_predict])
         # predict() returns a 1-D array for a single-output tree; reshape to a
         # column so inverse_transform sees one row per sample.
         # NOTE(review): DictVectorizer.inverse_transform leaves out zero-valued
         # entries, so a predicted 0 is expected to print as an empty dict --
         # confirm this is the desired output.
         print(dv_target.inverse_transform(d_tree.predict(x_data).reshape(-1, 1)))


     if __name__ == '__main__':
         titanic_tree()

     决策树(Decision Tree)及其变种是另一类将输入空间分成不同的区域,每个区域有独立参数的算法。

    决策树分类算法是一种基于实例的归纳学习方法,它能从给定的无序的训练样本中,提炼出树形的分类模型。树中的每个非叶子节点记录了使用哪个特征来进行类别的判断,每个叶子节点则代表了最后判断的类别。根节点到每个叶子节点均形成一条分类的路径规则。而对新的样本进行测试时,只需要从根节点开始,在每个分支节点进行测试,沿着相应的分支递归地进入子树再测试,一直到达叶子节点,该叶子节点所代表的类别即是当前测试样本的预测类别。

关键字