大数据挖掘·期末·课后习题

最新推荐文章于 2024-07-18 00:08:52 发布

君士睿心

最新推荐文章于 2024-07-18 00:08:52 发布

阅读量221

点赞数 1

文章标签：数据挖掘 数据结构 python

本文链接：https://blog.csdn.net/weixin_73693331/article/details/140133120

版权

第一章

1、

# (1) Define the tuple t1 = (1, 2, 'R', 'py', 'Matlab') and an empty list list1.
t1 = (1, 2, 'R', 'py', 'Matlab')
list1 = []
# (2) Copy the elements of t1 into list1 one at a time, using a while loop
# and list.append() as the exercise asks.
i = 0
while i < len(t1):
    list1.append(t1[i])
    i += 1
print(list1)
# (3) Define an empty dictionary named dict1.
dict1 = {}
# (4) Define the nested list Li = ['k', [3, 4, 5], (1, 2, 6), 18, 50] and, in a
# for loop, use dict.setdefault() to store its elements in dict1 under the
# keys a, b, c, d, e (in that order).
Li = ['k', [3, 4, 5], (1, 2, 6), 18, 50]
keys = ['a', 'b', 'c', 'd', 'e']
for item, key in zip(Li, keys):
    # setdefault(key, item) stores the element itself when the key is absent.
    # Using setdefault(key, []).append(item) would wrap every element in a
    # one-element list ({'a': ['k'], ...}), which is not what is asked.
    dict1.setdefault(key, item)
print("dict1:\n", dict1)

2、

def comput(r, h):
    """Return the surface area and volume given by the cylinder formulas.

    The constant 3.14 is used as the approximation of pi.

    Args:
        r: Radius of the base.
        h: Height.

    Returns:
        A tuple ``(S, V)`` where ``S = 2 * 3.14 * r * (r + h)`` and
        ``V = 3.14 * r ** 2 * h``.
    """
    circumference = 2 * 3.14 * r
    base_area = 3.14 * r ** 2
    return circumference * (r + h), base_area * h


# Evaluate the formulas for r = 10, h = 11 and print both to two decimals.
r, h = 10, 11
S, V = comput(r, h)
print(f"S:{S:.2f}, V:{V:.2f}")

第二章

1、

import numpy as np
# Exercise 1: build arrays from a list and a tuple, stack them, and
# round-trip the stacked array through a .npy file.
list1 = [1, 2, 4, 6, 7, 8]
tup1 = (1, 2, 3, 4, 5, 6)
# (1)-(2) One array from the list, one from the tuple.
N1, N2 = np.array(list1), np.array(tup1)
print("数组N1", N1)
print("数组N2", N2)
# (3) A 1x6 array filled with ones.
N3 = np.ones(shape=(1, 6))
print("数组N3", N3)
# (4) Stack the three arrays row-wise (result is 3x6).
N4 = np.vstack([N1, N2, N3])
print("数组N4\n", N4)
# (5) Save N4 to disk and load it back.
np.save("N4.npy", N4)
N4_n = np.load("N4.npy")
print(f"数组N5:\n{N4_n}")

2、

import numpy as np
# Exercise 2
# (1) Load the array stored in N4.npy.
N4_n = np.load('N4.npy')
print("加载数组N4\n", N4_n)
# (2) Pick elements out of individual rows (indices are zero-based).
first_row, third_row = N4_n[0], N4_n[2]
# Row 1, elements 2 and 4.
f = first_row[[1, 3]]
print("f:", f)
# Row 3, elements 1 and 5.
t = third_row[[0, 4]]
print("t:", t)
# Put the two picks on top of each other to form the 2-D array N5.
N5 = np.vstack((f, t))
print("N5\n", N5)
# Reshape the six-element array N1 to 2x3 and append it to the right of N5.
list1 = [1, 2, 4, 6, 7, 8]
N1 = np.array(list1)
N1_1 = N1.reshape(2, 3)
print("N1_1\n", N1_1)
N6 = np.concatenate((N5, N1_1), axis=1)
print("N6", N6)

3、

import numpy as np
# (1)
# Draw two random 2x2 matrices (mat1 first, then mat2).
shape = (2, 2)
mat1 = np.random.rand(*shape)
mat2 = np.random.rand(*shape)
print("matrix1", mat1)
print("matrix2", mat2)
# Matrix product of the two (not the element-wise product).
pro = mat1 @ mat2
print("矩阵乘积:", pro)

# (2)
# Matrix whose eigenvalues and eigenvectors are requested.
A = np.array([[3, -1],
              [-1, 3]])
# eig returns the eigenvalues and a matrix whose COLUMNS are the eigenvectors.
eva, eve = np.linalg.eig(A)
print("特征值:", eva)
# Iterate over the transpose so that each printed vector is one column of eve.
for eig in eve.T:
    print("特征向量：", eig)

# (3)
# Matrix to factorise with the singular value decomposition.  This section
# belongs at top level: nested inside the eigenvector loop it would be
# executed (and printed) once per eigenvector.
A = np.array([[4, 11, 14],
              [8, 7, -2]])
# Reduced SVD: with k = min(m, n), U is m x k, S is a 1-D array holding the
# k singular values in descending order, and Vt is k x n.
U, S, Vt = np.linalg.svd(A, full_matrices=False)
# The reduced factors pair with a k x k diagonal matrix, so that
# U @ Sigma @ Vt reproduces A.  (An m x n Sigma only fits the full SVD.)
Sigma = np.diag(S)
# Vt is the transpose of V; transpose it back to obtain V (n x k).
V = Vt.T
print(f"m x k的矩阵:\n{U},\n对角矩阵:\n{Sigma},"
      f"\nn x k的矩阵：\n{V}")
# (4)
# The 3x3 matrix D from the exercise.
D = np.array([[4, 6, 8],
              [4, 6, 9],
              [5, 6, 8]])

# Transpose of D.
Dt = np.transpose(D)

# Determinant of D and of its transpose.
det_D, det_Dt = np.linalg.det(D), np.linalg.det(Dt)

# Report both determinants and the transposed matrix.
print("行列式D的值为:", det_D)
print("矩阵D的转置Dt为:")
print(Dt)
print("行列式Dt的值为:", det_Dt)

第三章

1、未给数据

import pandas as pd
# 1 (1) Build the score table by hand and write it to a text file that uses
# a full-width comma as the field separator.
data = {'姓名': ['小红', '小红', '小红', '张明', '张明', '张明', '小李', '小李', '小李', '小江', '小江', '小江'],
        '科目': ['语', '数', '英', '语', '数', '英', '语', '数', '英', '语', '数', '英'],
        '成绩': ['98', '80', '67', '78', '68', '86', '96', '68', '67', '80', '68', '69']}
df = pd.DataFrame(data)
df.to_csv('student_scores.txt', sep='，', index=False)
# The scores were entered as strings; convert them to numbers so that mean()
# computes an arithmetic average.  Unparseable values become NaN.
df['成绩'] = pd.to_numeric(df['成绩'], errors='coerce')
# (2) One sub-frame per student, selected by name.
pd1 = df[df['姓名'] == '小红']
pd2 = df[df['姓名'] == '张明']
pd3 = df[df['姓名'] == '小江']
pd4 = df[df['姓名'] == '小李']
print("小红的成绩数据框：")
print(pd1)
print("\n张明的成绩数据框：")
print(pd2)
print("\n小江的成绩数据框：")
print(pd3)
print("\n小李的成绩数据框：")
print(pd4)
# (3) Average score of each student, rounded to two decimals.  Each average
# must be taken from that student's own sub-frame: M1 from pd1, M2 from pd2,
# M3 from pd3 and M4 from pd4.
M1 = round(pd1['成绩'].mean(), 2)
M2 = round(pd2['成绩'].mean(), 2)
M3 = round(pd3['成绩'].mean(), 2)
M4 = round(pd4['成绩'].mean(), 2)
print("小红的各科成绩平均分：")
print(M1)
print("\n张明的各科成绩平均分：")
print(M2)
print("\n小江的各科成绩平均分：")
print(M3)
print("\n小李的各科成绩平均分：")
print(M4)

1-2、给出数据

import pandas as pd
# (1) Read the score sheet (fields separated by a full-width comma) into its
# own name.  Assigning the result to ``pd`` would rebind the pandas alias to
# a DataFrame, so that any later ``pd.<function>`` call would fail.
df = pd.read_table("成绩单.txt", sep='，', engine='python')
print(df)
# (2) Take three consecutive rows per student.
# NOTE(review): this assumes the file lists the students in the order
# 小红, 张明, 小江, 小李 with exactly three rows each — confirm against
# 成绩单.txt.
pd1 = df.iloc[0:3]
pd2 = df.iloc[3:6]
pd3 = df.iloc[6:9]
pd4 = df.iloc[9:12]
print("小红的成绩数据框：")
print(pd1)
print("\n张明的成绩数据框：")
print(pd2)
print("\n小江的成绩数据框：")
print(pd3)
print("\n小李的成绩数据框：")
print(pd4)
# (3) Column-wise mean over the numeric columns of each slice; every result
# is a Series.
M1 = pd1.mean(numeric_only=True)
M2 = pd2.mean(numeric_only=True)
M3 = pd3.mean(numeric_only=True)
M4 = pd4.mean(numeric_only=True)
# The first student's mean is printed as a scalar with two decimals (first
# numeric column); the remaining ones are printed as the Series themselves.
print(f"小红的各科成绩平均分：{M1.iloc[0]:.2f}")
print("\n张明的各科成绩平均分：")
print(M2)
print("\n小江的各科成绩平均分：")
print(M3)
print("\n小李的各科成绩平均分：")
print(M4)

2、

import numpy as np
import pandas as pd
# (1) Load the workbook.
df = pd.read_excel('df.xlsx')
print(df)
# (2) Keep the columns at positions 2 and 3, first as a DataFrame and then
# as a plain NumPy array.
df1 = df.iloc[:, [2, 3]]
print(df1)
Nt = np.array(df1)
print(Nt)
# (3) Boolean mask of the rows whose 交易日期 lies between 2017-01-05 and
# 2017-01-16, both ends included.
TF = df['交易日期'].between('2017-01-05', '2017-01-16')
print("逻辑数组 TF：")
print(TF)
# (4) Add up the second column of Nt over the selected rows.
# NOTE(review): the printed label suggests this column is the trading
# volume — confirm against df.xlsx.
selected = Nt[TF, 1]
S = sum(selected)
print("交易量数据求和 S：", S)

第四章

import matplotlib.pyplot as plt
import pandas as pd
# (1) Load the price table.
df = pd.read_excel('data.xlsx')
print(df)


def _column(frame, name):
    """Return column *name* of *frame* as a one-dimensional array."""
    return frame[[name]].values[:, 0]


def _draw_trend(x, y, title, y_label):
    """Label the current axes and draw *y* against *x*.

    The calls are issued in a fixed order: title, axis labels, tick
    rotation, and finally the line.
    """
    plt.title(title, fontsize=20)
    plt.xlabel("日期", fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xticks(rotation=45)
    plt.plot(x, y)


# (2) One figure per series.
# Select the SimHei font, presumably so that the Chinese labels render.
plt.rcParams['font.sans-serif'] = 'SimHei'
# The pork figure uses only the first ten rows of the table.
first_ten = df.head(10)
x1 = _column(first_ten, "日期")
y1 = _column(first_ten, "猪肉的价格（元）")
plt.figure(figsize=(25, 10))
_draw_trend(x1, y1, "猪肉价格走势图", "猪肉价格")
# The beef figure uses every row.
x2 = _column(df, "日期")
y2 = _column(df, "牛肉的价格（元）")
plt.figure(figsize=(25, 10))
_draw_trend(x2, y2, "牛肉价格走势图", "牛肉的价格（元）")
# (3) Both series, all rows, as two stacked subplots of one figure.
x1 = _column(df, '日期')
y1 = _column(df, "猪肉的价格（元）")
plt.figure(figsize=(25, 22))
plt.subplot(2, 1, 1)
_draw_trend(x1, y1, "猪肉价格走势图", "猪肉的价格（元）")

x1 = _column(df, '日期')
y1 = _column(df, "牛肉的价格（元）")
plt.subplot(2, 1, 2)
_draw_trend(x1, y1, "牛肉价格走势图", "牛肉的价格（元）")
plt.show()

第五章

1、

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression as LR
# (1) Load the data and fit a linear regression.
df = pd.read_excel("1.xlsx")
# Columns 1-5 are taken as the independent variables x and column 6 as the
# dependent variable y.
x = df.iloc[:, 1:6]
y = df.iloc[:, 6]
print("==="*40, f"\n自变x:{x}")
# y is the dependent variable, so it is labelled 因变y (not 自变y).
print("==="*40, f"\n因变y:{y}")
# Create the regression object and fit it.
lr = LR()
lr.fit(x, y)
c_x = lr.coef_  # coefficients of x
c_b = lr.intercept_  # constant term
print("==="*40, f"\n回归系数:{c_x}")
print("==="*40, f"\n回归系数常数项:{c_b}")
# (2) Coefficient of determination, evaluated on the data used for fitting.
Slr = lr.score(x, y)
print('判定系数', Slr)
# (3) Prediction for one new sample.  It is wrapped in a DataFrame carrying
# the column names of x so that it matches the fitted features.
x1 = pd.DataFrame(np.array([[4, 1.5, 10, 17, 9]]), columns=x.columns)
Y = lr.predict(x1)
print('==='*40, f"\n样本预测值:{Y}")

2、

import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression as LR
from sklearn import svm
df = pd.read_excel("2.xlsx")
# Split by row position: the first 20 rows are used for training, the
# remaining rows are the samples to predict.  Features are columns 1-3 and
# the target is column 4.
train_rows, test_rows = df.iloc[:20], df.iloc[20:]
x_train = train_rows.iloc[:, 1:4]
y_train = train_rows.iloc[:, 4]
x_test = test_rows.iloc[:, 1:4]

# Neural-network model.
# NOTE(review): the same target is fed to classifiers below — confirm that a
# regressor (whose score is R^2, not accuracy) is what is intended here.
rgs = MLPRegressor().fit(x_train, y_train)
score = rgs.score(x_train, y_train)
print(f"预测准确度：{score}")
Y1 = rgs.predict(x_test)
print("++"*20, f"\n神经网络模型预测:{Y1}")

# Logistic-regression model.
clf = LR().fit(x_train, y_train)
rv = clf.score(x_train, y_train)
R = clf.predict(x_test)
print("++"*40, f"\n逻辑回归模型预测:{R}")
print(f"预测准确度：{rv}")

# Support-vector machine: try 100 values of C between 0.01 and 30 and keep
# the first one that reaches the highest score on the training data.
c_range = np.linspace(0.01, 30, 100)
score = [
    svm.SVC(C=candidate).fit(x_train, y_train).score(x_train, y_train)
    for candidate in c_range
]
best_score = max(score)
C = c_range[score.index(best_score)]
print(f"最高得分:{best_score}\n 对应的C值: {C}")
# Refit with the selected C and predict the held-out rows.
model = svm.SVC(C=C)
model.fit(x_train, y_train)
predict_y = model.predict(x_test)
print(f"测试数据的预测结果:{predict_y}")
print("=="*40)

3、

（1）

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
df = pd.read_excel('3.xlsx')
print("=="*20)
# Every column except the first is used as a variable.
X = df.iloc[:, 1:]
# Correlation matrix of the raw variables.
R = X.corr()
# Standardise each variable (zero mean, unit variance).
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
# Keep as many principal components as are needed to explain 90% of the
# variance, then project the standardised data onto them.
pca = PCA(n_components=0.9).fit(X)
Y = pca.transform(X)
tzxl = pca.components_  # eigenvectors
tzz = pca.explained_variance_  # eigenvalues
gxl = pca.explained_variance_ratio_  # contribution rates
print(Y)
# Each remaining result is printed after a separator line.
for label, value in (('特征向量', tzxl), ('特征值', tzz), ('贡献率', gxl)):
    print("=="*20)
    print(label, value)

（2）

暂无

4、

暂无

5、

import pandas as pd
import numpy as np
from tabulate import tabulate
# (1)
# Read the transactions: no header row, first column used as the index and
# then discarded so that rows are labelled 0..n-1.
df = pd.read_excel("5.xlsx", header=None, index_col=0)
df.reset_index(inplace=True, drop=True)
# Convert the transactions into a 0/1 table with one column per item.
items = ["西红柿", "排骨", "鸡蛋", "茄子", "袜子", "鞋子", "水果刀", "土豆", "香蕉", "肥皂", "毛巾", "酸奶", "苹果"]
dic_data = {}
for item in items:
    # A transaction contains the item when any cell of its row equals the
    # item name.  Stored as floats (0.0 / 1.0).
    contains_item = (df == item).any(axis=1)
    dic_data.setdefault(item, contains_item.to_numpy(dtype=float))
data = pd.DataFrame(dic_data)
print("布尔值数据表：\n", tabulate(data, headers="keys", tablefmt="simple_grid"))
# (2)
# Mine one-to-one association rules "A---B" from the 0/1 table ``data``.
#   support    = (#transactions containing A and B) / (#transactions)
#   confidence = (#transactions containing A and B) / (#transactions with A)
list1 = []  # rule labels
list2 = []  # support of each kept rule
list3 = []  # confidence of each kept rule
s = 0.2  # minimum support
c = 0.4  # minimum confidence
n_transactions = len(data)
for antecedent in items:
    has_antecedent = data[antecedent] == 1
    n_antecedent = has_antecedent.sum()
    # An item that occurs in no transaction cannot be the left-hand side of
    # a rule; skipping it also avoids dividing zero by zero.
    if n_antecedent == 0:
        continue
    for consequent in items:
        if antecedent == consequent:
            continue
        n_both = (has_antecedent & (data[consequent] == 1)).sum()
        support = n_both / n_transactions
        confidence = n_both / n_antecedent
        # Keep the rule only when both thresholds are met.
        if support >= s and confidence >= c:
            list1.append(antecedent + "---" + consequent)
            list2.append(support)
            list3.append(confidence)
R = {"关联规则：": list1, "支持度：": list2, "置信度：": list3}
result = pd.DataFrame(R)
print(result)

答案大部分由 AI 生成，受个人水平所限未能逐一修订，可能存在错误。