ex-1
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Read the Excel file
df = pd.read_excel(r"C:\Users\20905\Desktop\dataset.xlsx", engine='openpyxl', na_values='-')
# Display the data
#print(df)
# Drop the datetime column and the turnover-rate column
df = df.drop(df.columns[0], axis=1)
df = df.drop(df.columns[-1], axis=1)
# Drop the first row, which contains date values
df = df.drop(df.index[0])
# Convert all values in the dataset to float
df = df.astype(float)
plt.rcParams['font.sans-serif'] = ['SimHei']  # use SimHei so Chinese text renders correctly
plt.rcParams['axes.unicode_minus'] = False  # fix minus-sign rendering
# Compute the mean and variance of each column
mean_values = df.apply(np.mean)
variance_values = df.apply(np.var)
# Print the means and variances
print("Mean of each column:")
print(mean_values)
print("\nVariance of each column:")
print(variance_values)
# Plot a histogram of the column means
plt.figure(figsize=(10, 6))
plt.hist(mean_values, bins=20, alpha=0.5, color='blue')
plt.xlabel('Mean Value')
plt.ylabel('Frequency')
plt.title('Histogram of Mean Values')
plt.grid(True)
plt.show()
# Plot a histogram of the column variances
plt.figure(figsize=(10, 6))
plt.hist(variance_values, bins=20, alpha=0.5, color='green')
plt.xlabel('Variance Value')
plt.ylabel('Frequency')
plt.title('Histogram of Variance Values')
plt.grid(True)
plt.show()
# Plot histograms of all columns
plt.figure(figsize=(10, 6))
for column in df.columns:
    plt.hist(df[column], bins=20, alpha=0.5, label=column)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram of Dataset Columns')
plt.legend()
plt.show()
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler instance
scaler = StandardScaler()
# Apply z-score standardization to each column
df_standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
# Save the standardized dataset to an Excel file
df_standardized.to_excel("standardized_data.xlsx", index=False)
df_after = pd.read_excel(r"C:\Users\20905\standardized_data.xlsx", engine='openpyxl', na_values='-')
print(df_after)
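# Optional sanity check (a minimal sketch, not part of the original exercise): after z-score
# standardization every column should have a mean of roughly 0 and a standard deviation of
# roughly 1 (StandardScaler uses the population standard deviation, ddof=0).
print(df_standardized.mean().round(6))
print(df_standardized.std(ddof=0).round(6))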
ex-2
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Check whether a GPU is available (torch is used only for this check)
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not available")
# Read the data
data = pd.read_csv(r'C:\Users\20905\Desktop\prices.txt')
data.columns = ['area', 'price']  # add feature names
# Prepare the data
X = data[['area']]  # column used as the feature
y = data['price']   # target column
#data.head()
# Build the linear regression model
model = LinearRegression()
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model on the training split
model.fit(X_train, y_train)
# Predict on the test split
y_pred = model.predict(X_test)
# Plot the results
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Actual data')
plt.scatter(X_test, y_pred, color='red', label='Predicted values')
plt.plot(X, model.predict(X), color='red', label='Linear regression')
plt.title('Linear Regression')
plt.xlabel('Area')
plt.ylabel('Price')
plt.legend()
plt.show()
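# Optional evaluation (a minimal sketch, not part of the original exercise): quantify the fit
# on the held-out test split instead of judging it only from the plot.
from sklearn.metrics import mean_squared_error, r2_score
print("Test MSE:", mean_squared_error(y_test, y_pred))
print("Test R^2:", r2_score(y_test, y_pred))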
ex-3
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Check whether a GPU is available (torch is used only for this check)
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not available")
# Read the data
data = pd.read_csv(r'C:\Users\20905\Desktop\prices.txt')
data.columns = ['area', 'price']  # add feature names
# Prepare the data
X = data[['area']]  # column used as the feature
y = data['price']   # target column
# Build the polynomial regression model
degree = 4  # degree of the polynomial
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Predict on the test split
y_pred = model.predict(X_test)
# Plot the results
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Actual data')
plt.scatter(X_test, y_pred, color='red', label='Predicted values')
X_sorted = X.sort_values('area')  # sort by area so the fitted curve is drawn smoothly
plt.plot(X_sorted, model.predict(X_sorted), color='red', label='Polynomial regression (degree {})'.format(degree))
plt.title('Polynomial Regression')
plt.xlabel('Area')
plt.ylabel('Price')
plt.legend()
plt.show()
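# Optional degree comparison (a minimal sketch, not part of the original exercise): refit the
# pipeline for a few degrees and compare test-set R^2 to see how the degree affects the fit
# on this small dataset.
from sklearn.metrics import r2_score
for d in (1, 2, 3, 4, 5):
    m = make_pipeline(PolynomialFeatures(d), LinearRegression())
    m.fit(X_train, y_train)
    print("degree", d, "test R^2:", r2_score(y_test, m.predict(X_test)))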
ex-4
# Import numpy and pandas
import numpy as np
import pandas as pd
# Import the SimpleImputer preprocessing module from sklearn
from sklearn.impute import SimpleImputer
# Import train_test_split, which generates training and test splits automatically
from sklearn.model_selection import train_test_split
# Import classification_report for evaluating the prediction results
from sklearn.metrics import classification_report
# Import the classifier from sklearn:
# the K-nearest-neighbors classifier KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
# Data-loading function
def load_datasets(feature_paths, label_paths):
    # Define the feature array; the number of columns matches the feature dimensionality, 41
    features = np.empty(shape=(0, 41))
    # Define an empty label array with a single column
    labels = np.empty(shape=(0, 1))
    # Read the feature files
    for file in feature_paths:
        # Read one feature file with pandas read_table:
        # comma-separated, '?' marks missing values, and there is no header row
        data = pd.read_table(file, sep=',', na_values='?', header=None)
        # Use SimpleImputer with strategy='mean' to fill missing values with the column means
        imp = SimpleImputer(strategy='mean')
        imp.fit(data)
        data_cleaned = imp.transform(data)
        # Append the cleaned data to the feature array
        features = np.vstack((features, data_cleaned))
    # Read the label files
    for file in label_paths:
        # Same approach as for the feature files
        data = pd.read_table(file, sep=',', na_values='?', header=None)
        # Append the new labels to the label array
        labels = np.vstack((labels, data))
    # Return the feature array and the label array
    return features, labels
if __name__ == '__main__':
    # Set the data paths feature_paths and label_paths
    feature_paths = [r'C:\Users\20905\Desktop\dataset\A\A.feature']
    label_paths = [r'C:\Users\20905\Desktop\dataset\A\A.label']
    # Load the data.
    # With Python slicing, the first four paths are meant to serve as the training set;
    # they are passed to load_datasets() to obtain the training features x_train and the
    # training labels y_train. (Only one file is listed here, so it is used for both.)
    x_train, y_train = load_datasets(feature_paths[:4], label_paths[:4])
    # The last path is used as the test set, giving the test features x_test and the test labels y_test
    x_test, y_test = load_datasets([feature_paths[-1]], [label_paths[-1]])
    # train_test_split() with test_size=0.2 is used here mainly to shuffle the training data
    # before the classifier is created and trained; the held-out portion is discarded.
    x_train, _, y_train, _ = train_test_split(x_train, y_train, test_size=0.2)
    # Create a K-nearest-neighbors classifier with default parameters and train it on
    # x_train and y_train; the fitted classifier is stored in knn.
    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train.ravel())
    print('Training done')
    # Predict on the test set x_test to obtain the predictions answer_knn
    answer_knn = knn.predict(x_test)
    print('Prediction done')
    # classification_report evaluates the result along four dimensions:
    # precision, recall, f1-score, and support
    print(classification_report(y_test, answer_knn))
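    # Optional (a minimal sketch, not part of the original exercise): a confusion matrix gives
    # a per-class view of the errors that complements classification_report.
    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, answer_knn))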
ex-5
# 1. Set up the project and import the sklearn packages
import pandas as pd  # pandas is used to load the CSV data
import numpy as np   # numpy provides N-dimensional arrays, matrix operations, and a large library of math functions
from sklearn import svm              # the SVM algorithm
from sklearn import model_selection  # train/test splitting and cross-validation utilities
# 2. Load and preprocess the data
# read_csv documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
# pd.read_csv(source, encoding='gbk', parse_dates=[0] parses column 0 as dates, index_col=0 uses it as the row index)
data = pd.read_csv(r"C:\Users\20905\Desktop\000777.csv", encoding='gbk', parse_dates=[0], index_col=0)
# DataFrame.sort_index(axis=0, ascending=True, inplace=False) can be used to sort by the date index
# Use 5 columns as features: closing price, high, low, opening price, and volume
# dayfeature: use a 150-day window of data
dayfeature = 150
# featurenum: 5 features * number of days in the window
featurenum = 5 * dayfeature
x = np.zeros((data.shape[0] - dayfeature, featurenum + 1))
y = np.zeros((data.shape[0] - dayfeature))
for i in range(0, data.shape[0] - dayfeature):
    # Store the window's closing price, high, low, opening price, and volume in x
    x[i, 0:featurenum] = np.array(
        data[i:i + dayfeature][[u'收盘价', u'最高价', u'最低价', u'开盘价', u'成交量']]
    ).reshape((1, featurenum))
    # The last column records the opening price of the day to be predicted
    x[i, featurenum] = data[u'开盘价'].iloc[i + dayfeature]
for i in range(0, data.shape[0] - dayfeature):
    # Label the day 1 (up) if the closing price is at or above the opening price, otherwise 0 (down)
    if data[u'收盘价'].iloc[i + dayfeature] >= data[u'开盘价'].iloc[i + dayfeature]:
        y[i] = 1
    else:
        y[i] = 0
# 3. Create the SVM and evaluate it
# Create an SVC and set the kernel parameter; the default is 'rbf', alternatives include 'linear', 'poly', 'sigmoid'
clf = svm.SVC(kernel='rbf')
# Split x and y into training and test sets at an 80:20 ratio
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, random_state=0)
# Fit the classifier on the training data
clf.fit(x_train, y_train)
# Compare the predictions with the test-set labels
result = clf.score(x_test, y_test)
print("SVM classification accuracy on the held-out test set:")
print(result)
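# Optional cross-validation (a minimal sketch, not part of the original script): the section
# heading mentions cross-validation, but the code above evaluates a single split.
# cross_val_score runs k-fold cross-validation and reports the accuracy of every fold.
scores = model_selection.cross_val_score(svm.SVC(kernel='rbf'), x, y, cv=5)
print("5-fold accuracies:", scores)
print("Mean accuracy:", scores.mean())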
ex-6
import numpy as np       # import the numpy package
from os import listdir   # listdir is used to access local files
from sklearn import neighbors
def img2vector(fileName):
    retMat = np.zeros([1024], int)  # the returned vector, of size 1*1024
    fr = open(fileName)             # open a file containing a 32*32 digit image
    lines = fr.readlines()          # read all lines of the file
    for i in range(32):             # iterate over the lines
        for j in range(32):         # and store the 0/1 digits in retMat
            retMat[i*32+j] = lines[i][j]
    return retMat
def readDataSet(path):
    fileList = listdir(path)        # list all files in the folder
    numFiles = len(fileList)        # number of files to read
    dataSet = np.zeros([numFiles, 1024], int)  # holds all digit images
    hwLabels = np.zeros([numFiles])  # holds the corresponding labels (unlike the neural-network version, not one-hot)
    for i in range(numFiles):        # iterate over all files
        filePath = fileList[i]       # file name / path
        digit = int(filePath.split('_')[0])  # the label is encoded in the file name
        hwLabels[i] = digit          # store the digit directly, not a one-hot vector
        dataSet[i] = img2vector(path + '/' + filePath)  # read the file contents
    return dataSet, hwLabels
#read dataSet
train_dataSet, train_hwLabels = readDataSet(r'C:\Users\20905\Desktop\digits\trainingDigits')
# 3. Build the KNN classifier
# KNN is a lazy learner: there is no explicit training phase, the nearest neighbours are looked
# up only at prediction time, so feeding in the dataset is all it takes to build the classifier.
knn = neighbors.KNeighborsClassifier(algorithm='kd_tree', n_neighbors=3)
knn.fit(train_dataSet, train_hwLabels)
#read testing dataSet
dataSet,hwLabels = readDataSet(r'C:\Users\20905\Desktop\digits\testDigits')
res = knn.predict(dataSet)           # predict on the test set
error_num = np.sum(res != hwLabels)  # number of misclassified samples
num = len(dataSet)                   # number of test samples
print("Total num:", num, " Wrong num:", error_num,
      " WrongRate:", error_num / float(num))
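# Optional (a minimal sketch, not part of the original exercise): sweep a few values of k to
# see how the neighbourhood size affects the error rate on this test set.
for k in (1, 3, 5, 7):
    clf_k = neighbors.KNeighborsClassifier(algorithm='kd_tree', n_neighbors=k)
    clf_k.fit(train_dataSet, train_hwLabels)
    err = np.sum(clf_k.predict(dataSet) != hwLabels)
    print("k =", k, " wrong:", err, " rate:", err / float(num))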
import numpy as np       # 1. import the numpy package
from os import listdir   # listdir is used to access local files
from sklearn.neural_network import MLPClassifier
# 2. Load the training data
def img2vector(fileName):
    retMat = np.zeros([1024], int)  # the returned vector, of size 1*1024
    fr = open(fileName)             # open a file containing a 32*32 digit image
    lines = fr.readlines()          # read all lines of the file
    for i in range(32):             # iterate over the lines
        for j in range(32):         # and store the 0/1 digits in retMat
            retMat[i*32+j] = lines[i][j]
    return retMat
def readDataSet(path):
    fileList = listdir(path)        # list all files in the folder
    numFiles = len(fileList)        # number of files to read
    dataSet = np.zeros([numFiles, 1024], int)  # holds all digit images
    hwLabels = np.zeros([numFiles, 10])  # holds the corresponding one-hot labels
    for i in range(numFiles):        # iterate over all files
        filePath = fileList[i]       # file name / path
        digit = int(filePath.split('_')[0])  # the label is encoded in the file name
        hwLabels[i][digit] = 1.0     # set the corresponding one-hot entry to 1
        dataSet[i] = img2vector(path + '/' + filePath)  # read the file contents
    return dataSet, hwLabels
#read dataSet
train_dataSet, train_hwLabels = readDataSet(r'C:\Users\20905\Desktop\digits\trainingDigits')
# 3. Train the neural network
# Build the network: set the hidden-layer sizes, activation function, learning rate,
# optimizer, and maximum number of iterations.
clf = MLPClassifier(hidden_layer_sizes=(100,),
                    activation='logistic', solver='adam',
                    learning_rate_init=0.0001, max_iter=2000)
print(clf)
clf.fit(train_dataSet,train_hwLabels)
# 4. Read the testing dataSet and evaluate on it
dataSet,hwLabels = readDataSet(r'C:\Users\20905\Desktop\digits\testDigits')
res = clf.predict(dataSet)  # predict on the test set
error_num = 0               # count of wrong predictions
num = len(dataSet)          # number of test samples
for i in range(num):        # iterate over the predictions
    # Comparing two length-10 arrays gives an array of 0s and 1s (0 = different, 1 = equal);
    # if the prediction matches the true label, all 10 entries are 1, otherwise not.
    if np.sum(res[i] == hwLabels[i]) < 10:
        error_num += 1
print("Total num:", num, " Wrong num:", error_num,
      " WrongRate:", error_num / float(num))
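# Optional (a minimal sketch, not part of the original exercise): converting the one-hot rows
# back to digit labels with argmax is an alternative way to score the predictions. Note that
# the multilabel predict() can output an all-zero row, for which argmax falls back to 0, so the
# count can differ slightly from the strict row comparison above.
pred_digits = np.argmax(res, axis=1)
true_digits = np.argmax(hwLabels, axis=1)
print("Wrong num (argmax check):", int(np.sum(pred_digits != true_digits)))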
ex-7
# 1. Set up the project and import the sklearn packages
# Import Numpy
import numpy as np
# Import the KMeans algorithm
from sklearn.cluster import KMeans
# 2. Load the data, create a K-means instance, train it, and obtain the cluster labels
def loadData(filePath):
    fr = open(filePath, 'r+')  # r+: open a text file for reading and writing
    lines = fr.readlines()     # readlines() reads the whole file at once (similar to read())
    retData = []
    retCityName = []
    for line in lines:
        items = line.strip().split(",")
        retCityName.append(items[0])
        retData.append([float(items[i]) for i in range(1, len(items))])
    return retData, retCityName  # return the city names and each city's spending figures
if __name__ == '__main__':
    data, cityName = loadData(r'C:\Users\20905\Desktop\city.txt')  # the data file is assumed to be named city.txt
    km = KMeans(n_clusters=4)
    label = km.fit_predict(data)
    expenses = np.sum(km.cluster_centers_, axis=1)
    #print(expenses)
    CityCluster = [[], [], [], []]
    for i in range(len(cityName)):
        CityCluster[label[i]].append(cityName[i])
    for i in range(len(CityCluster)):
        print("Expenses:%.2f" % expenses[i])
        print(CityCluster[i])
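    # Optional (a minimal sketch, not part of the original exercise): n_clusters=4 is fixed
    # above; printing the inertia for several cluster counts (the "elbow" heuristic) is one
    # way to sanity-check that choice.
    for k in range(2, 8):
        print(k, KMeans(n_clusters=k, n_init=10).fit(data).inertia_)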
ex-8
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import pandas as pd
# Load the iris dataset
data = load_iris()
y = data.target
X = data.data
# Print the four-dimensional data as a table
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y
print("Iris dataset:")
print(df)
# Reduce the data to two dimensions with PCA
pca = PCA(n_components=2)
reduced_X = pca.fit_transform(X)
# Prepare the data for plotting
red_x, red_y = [], []
blue_x, blue_y = [], []
green_x, green_y = [], []
for i in range(len(reduced_X)):
    if y[i] == 0:
        red_x.append(reduced_X[i][0])
        red_y.append(reduced_X[i][1])
    elif y[i] == 1:
        blue_x.append(reduced_X[i][0])
        blue_y.append(reduced_X[i][1])
    else:
        green_x.append(reduced_X[i][0])
        green_y.append(reduced_X[i][1])
# Plot the two-dimensional PCA result
plt.scatter(red_x, red_y, c='r', marker='x', label=data.target_names[0])
plt.scatter(blue_x, blue_y, c='b', marker='D', label=data.target_names[1])
plt.scatter(green_x, green_y, c='g', marker='.', label=data.target_names[2])
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA of Iris Dataset')
plt.legend()
plt.show()
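# Optional (a minimal sketch, not part of the original exercise): explained_variance_ratio_
# shows how much of the total variance the two retained components capture.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total:", pca.explained_variance_ratio_.sum())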
ex-9
from numpy.random import RandomState
import matplotlib.pyplot as plt
from sklearn import decomposition
import scipy.io
n_row, n_col = 2, 3
n_components = n_row * n_col
image_shape = (64, 64)
###############################################################################
# Load faces data
# dataset = fetch_olivetti_faces(shuffle=True, random_state=RandomState(0))
dataset = scipy.io.loadmat(r'C:\Users\20905\scikit_learn_data\olivettifaces.mat')
faces = dataset['faces']
# Check the shape of faces and reshape it to (400, 64, 64) if necessary
print(faces.shape)
if faces.shape == (4096, 400):
    faces = faces.T  # transpose to (400, 4096)
if faces.shape == (400, 4096):
    faces = faces.reshape((400, 64, 64))
print(faces.shape)  # confirm faces now has shape (400, 64, 64)
###############################################################################
def plot_gallery(title, images, n_col=n_col, n_row=n_row, image_shape=image_shape):
    plt.figure(figsize=(2. * n_col, 2.26 * n_row))
    plt.suptitle(title, size=16)
    for i, comp in enumerate(images):
        plt.subplot(n_row, n_col, i + 1)
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape), cmap=plt.cm.gray,
                   interpolation='nearest', vmin=-vmax, vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.94, 0.04, 0.)
plot_gallery("First centered Olivetti faces", faces[:n_components])
###############################################################################
estimators = [
    ('Eigenfaces - PCA using randomized SVD',
     decomposition.PCA(n_components=6, whiten=True)),
    ('Non-negative components - NMF',
     decomposition.NMF(n_components=6, init='nndsvda', tol=5e-3))
]
###############################################################################
for name, estimator in estimators:
    print("Extracting the top %d %s..." % (n_components, name))
    print(faces.shape)
    # Flatten faces into a 2-D array
    faces_reshaped = faces.reshape((faces.shape[0], -1))  # (400, 4096)
    estimator.fit(faces_reshaped)
    components_ = estimator.components_
    # Reshape components_ back to image shape for plotting
    plot_gallery(name, components_.reshape((n_components, *image_shape)))
plt.show()
ex-10
# 1. Set up the project and import the sklearn package
import numpy as np
import PIL.Image as image
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# 2. Load the image and preprocess it
def loadData(filePath):
    f = open(filePath, 'rb')
    data = []
    img = image.open(f)
    m, n = img.size
    for i in range(m):
        for j in range(n):
            x, y, z = img.getpixel((i, j))
            data.append([x/256.0, y/256.0, z/256.0])
    f.close()
    return np.mat(data), m, n
# 3. Run the KMeans clustering algorithm
imgData,row,col = loadData(r'C:\Users\20905\Desktop\bull.jpg')
label = KMeans(n_clusters=4).fit_predict(imgData)
# 4. Assign each pixel to its cluster and write the result out
label = label.reshape([row, col])
pic_new = image.new("L", (row, col))
for i in range(row):
    for j in range(col):
        pic_new.putpixel((i, j), int(256/(label[i][j]+1)))
pic_new.save("聚类后的图片1.jpg", "JPEG")
# The saved image is grayscale by default; plt.imshow renders it in false colour unless
# cmap='gray' is passed, in which case it is shown as grayscale.
plt.imshow(pic_new)
plt.show()
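# Optional (a minimal sketch, not part of the original exercise): instead of one grey level
# per cluster, replace every pixel with its cluster centre to get a colour-quantised version
# of the image. The output filename quantised.jpg is just an example.
km = KMeans(n_clusters=4).fit(imgData)
centers = (np.asarray(km.cluster_centers_) * 256).clip(0, 255).astype('uint8')
quantised = centers[km.labels_].reshape(row, col, 3)
# loadData walks the image column by column (x first), so swap the axes before saving
pic_rgb = image.fromarray(np.transpose(quantised, (1, 0, 2)), 'RGB')
pic_rgb.save("quantised.jpg", "JPEG")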