通过使用sklearn决策树，简单练习案例分析

最新推荐文章于 2024-08-15 03:11:58 发布

WBerica

最新推荐文章于 2024-08-15 03:11:58 发布

阅读量667

点赞数 1

分类专栏：机器学习 文章标签：机器学习、决策树、python

本文链接：https://blog.csdn.net/WBerica/article/details/107102607

版权

机器学习专栏收录该内容

1 篇文章 0 订阅

订阅专栏


# 根据电影中的类型，票房，产地，预测去不去看电影
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

from sklearn import tree

# Load the data set.
# The file is opened in a context manager so the handle is always closed,
# even if parsing fails part-way through. newline='' lets the csv module do
# its own newline handling, as its documentation recommends.
with open('film.csv', 'rt', newline='') as film_data:
    reader = csv.reader(film_data)

    # Header row (column names).
    headers = next(reader)
    # Print the header of the data set.
    print(headers)

    # Feature dictionaries, one per data row.
    feature_list = []
    # Target labels, one per data row.
    result_list = []
    # Walk over the data rows and reassemble them: the first column is
    # dropped, the last column is the label, and everything in between
    # becomes a {column name: value} feature dict.
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; row[-1] would raise.
            continue
        result_list.append(row[-1])
        feature_list.append(dict(zip(headers[1:-1], row[1:-1])))
print(result_list, feature_list)
# Sample output:
# ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no']
# [{'type': 'anime', 'country': 'Janan', 'gross': 'low'}, {'type': 'science', 'country': 'America', 'gross': 'low'}, {'type': 'anime', 'country': 'America', 'gross': 'low'}, {'type': 'action', 'country': 'America', 'gross': 'high'}, {'type': 'action', 'country': 'China', 'gross': 'high'}, {'type': 'anime', 'country': 'China', 'gross': 'low'}, {'type': 'science', 'country': 'France', 'gross': 'low'}, {'type': 'action', 'country': 'China', 'gross': 'low'}]

# One-hot encode the feature dicts with scikit-learn's DictVectorizer.
vec = DictVectorizer()
# Fit on the training features, then densify the sparse result.
_sparse_features = vec.fit_transform(feature_list)
dummyX = _sparse_features.toarray()
# The resulting 2-D array has one column per distinct (feature, value) pair,
# so it grows with the number of distinct feature values. For the sample data
# set the nine columns are: four for country, two for gross, three for type.
# Hand-built test vectors must follow the same layout.
#
# Sample dummyX:
# [[0. 0. 0. 1. 0. 1. 0. 1. 0.]
#  [1. 0. 0. 0. 0. 1. 0. 0. 1.]
#  [1. 0. 0. 0. 0. 1. 0. 1. 0.]
#  [1. 0. 0. 0. 1. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 1. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 0. 1. 0. 1. 0.]
#  [0. 0. 1. 0. 0. 1. 0. 0. 1.]
#  [0. 1. 0. 0. 0. 1. 1. 0. 0.]]

# Binarize the labels ('yes' / 'no' -> 1 / 0 in the sample run below).
_label_binarizer = preprocessing.LabelBinarizer()
dummyY = _label_binarizer.fit_transform(result_list)
# Sample dummyY:
# [[1]
#  [1]
#  [1]
#  [1]
#  [1]
#  [1]
#  [0]
#  [0]]

# Show both encoded arrays, features first, each on its own line(s).
print(dummyX, dummyY, sep='\n')
# Build an entropy-based decision tree and train it on the encoded data.
# fit() returns the estimator itself, so construction and training are chained.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(dummyX, dummyY)
# print('clf:' + str(clf))

# Hand-encoded sample rows, using the same column layout as dummyX.
A = [[0, 0, 0, 1, 0, 1, 0, 1, 0]]
B = [[0, 0, 1, 0, 0, 1, 0, 1, 0]]
C = [[1, 0, 0, 0, 1, 0, 1, 0, 0]]

# Run one sample through the trained model and print the prediction.
predict_result = clf.predict(A)
print(f'预测结果{predict_result!s}')