# -*-coding:utf-8-*-
import numpy as np
import pandas as pd
# Input requirements: new_data.csv (read as GBK) must provide the columns
# read_unit, read_sex and book_id, which are the ones used below.
pf = pd.read_csv('new_data.csv', encoding='gbk')
print(type(pf))

# read_unit is a space-separated string; per the original notes the first
# token is the college/department and the second token is the major.
unit = pf['read_unit']
unit = unit.str.split(' ')
dapartment = unit.str[0]  # college / department
major = unit.str[1]  # major (extracted but not used as a feature for now)

# Working frame: reader sex, book id and department, renamed to short names.
data = pf[['read_sex', 'book_id']]
print(type(data))
data.insert(2, 'dapartment', dapartment)
# To include the major as well:
# data.insert(3, 'major', major)
# data.columns = ['sex', 'book', 'dapartment', 'major']
data.columns = ['sex', 'book', 'dapartment']
print(type(data))
print('------------------------------------------------------')
def add_label(s):
    """Encode the values of *s* as integer labels in order of first appearance.

    The first distinct value receives label 1, the next previously unseen
    value receives 2, and so on. A repeated value reuses the label that was
    assigned at its first occurrence.

    Args:
        s: Iterable of hashable values (for example a list or a pandas
            Series). Elements are consumed by iteration, in order.

    Returns:
        A list of ints with one label per element of *s*, in the same order.
        An empty input yields an empty list.
    """
    # A dict lookup keeps this linear in len(s); scanning a growing list for
    # every element would be quadratic.
    first_seen = {}
    labels = []
    for value in s:
        if value not in first_seen:
            first_seen[value] = len(first_seen) + 1
        labels.append(first_seen[value])
    return labels
# Replace every categorical column by integer labels produced by add_label.
sex = data['sex']
print(type(sex))
sex = add_label(sex)
dapartment = data['dapartment']
dapartment = add_label(dapartment)
book = data['book']
book = add_label(book)
# major = data['major']
# major = add_label(major)

# Question studied: how do students of different colleges differ in the
# books they borrow?
# Feature matrix: after the transpose there is one row per record and one
# column per encoded feature, in the order (sex, department).
diff_dep = [sex, dapartment]
m = np.array(diff_dep).T
print(m)  # data
# The original author noted len(m) == 182508 for their data set.
print(m[:10])
n = book  # target
# Targets from index 150000 onwards (the part later used for testing).
print(n[150000:])
# Decision tree: split the encoded data into a training and a test part.
# Everything before train_size is used for fitting, the remainder for testing.
train_size = 150000
train_data = m[:train_size]
test_data = m[train_size:]
train_target = n[:train_size]
test_target = n[train_size:]

# Decision tree classifier (DTC) from scikit-learn.
from sklearn.tree import DecisionTreeClassifier

# Fit on the training features together with the training labels only.
clf = DecisionTreeClassifier()
clf.fit(train_data, train_target)
print(clf)

# Predict labels for the held-out rows.
predict_target = clf.predict(test_data)
print(predict_target)

# Compare predictions with the true labels: first element-wise, then the
# number of matching predictions.
matches = predict_target == test_target
print(matches)
print(sum(matches))

# Report precision, recall and F-score.
from sklearn import metrics
print(metrics.classification_report(test_target, predict_target))
# print(metrics.confusion_matrix(test_target, predict_target))
【数据分析】图书馆数据-08决策树
最新推荐文章于 2023-06-20 10:18:10 发布
本文介绍了一个基于决策树算法的图书推荐系统实现过程。通过对学生性别、学院等特征进行标签编码,并利用决策树模型预测不同学院学生偏好的书籍类型。通过训练与测试数据集对比,评估了模型的准确性。
1301

被折叠的 条评论
为什么被折叠?



