1. pandas case study
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('./data/IMDB-Movie-Data.csv')
# 1. Average rating and number of directors
# Average rating
data['Rating'].mean()
# Number of distinct directors
np.unique(data['Director']).size
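For the same count, pandas also offers the built-in nunique method, which skips the detour through NumPy; a minimal equivalent:
# Same director count, idiomatic pandas
data['Director'].nunique()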
# 2. Distribution of Rating and Runtime
# Create the canvas
plt.figure(figsize=(20,8))
# Plot the histogram
res = plt.hist(data['Rating'], bins=20)
# Use the bin edges as the x-axis ticks
plt.xticks(res[1])
# Show the figure
plt.show()
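The comment above also mentions Runtime, so for completeness here is the matching histogram. This assumes the runtime column is named 'Runtime (Minutes)', as in the standard IMDB-Movie-Data.csv file:
# Same approach for the runtime distribution (column name assumed)
plt.figure(figsize=(20,8))
res = plt.hist(data['Runtime (Minutes)'], bins=20)
plt.xticks(res[1])
plt.show()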
# 3. Count movies per genre
# Walk over every movie's genre list and accumulate a count per genre
# (1) Collect all genre labels
genre_temp = [i.split(',') for i in data['Genre']]
genre_list = np.unique([i for j in genre_temp for i in j])
# (2) Build a Series as a per-genre counter, initialised to zero
genre_s = pd.Series(np.zeros((len(genre_list),)), index=genre_list)
# (3) Iterate over all movies and count each genre occurrence
for i in genre_temp:
    for j in i:
        genre_s[j] += 1
# Plot a bar chart of the genre counts
genre_s.plot(kind='bar',figsize=(20,8))
plt.show()
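The explicit double loop is easy to follow, but the same genre counts can be computed in one vectorized step; a minimal sketch using pandas' str.get_dummies:
# Vectorized alternative: one-hot encode the comma-separated genres, then sum each column
genre_counts = data['Genre'].str.get_dummies(sep=',').sum()
genre_counts.plot(kind='bar', figsize=(20,8))
plt.show()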
2. The scikit-learn dataset API
from sklearn.datasets import load_iris
from sklearn.datasets import fetch_20newsgroups
# Load a small built-in dataset; returns a Bunch (dict-like) object
iris = load_iris()
# Feature values
iris.data
# Target values
iris.target
# Feature names
iris.feature_names
# Target (label) names
iris.target_names
# Fetch a large dataset
data = fetch_20newsgroups(subset='train')  # downloads the dataset on first use
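A quick way to check what came back from both loaders; a minimal sketch:
# Small dataset: NumPy arrays inside the Bunch
print(iris.data.shape)        # (150, 4) feature matrix
print(iris.target_names)      # ['setosa' 'versicolor' 'virginica']
# Large dataset: a list of raw newsgroup posts plus label names
print(len(data.data))         # number of training documents
print(data.target_names[:5])  # first few of the 20 newsgroup labels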
3. Viewing the data distribution
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Build a DataFrame from the iris dataset
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target
# Scatter plot with seaborn
sns.lmplot(data=iris_df, x='sepal width (cm)', y='petal width (cm)', hue='label', fit_reg=False) # fit_reg: whether to fit and draw a regression line
# Scatter plot with matplotlib
plt.figure(figsize=(20,8))
plt.scatter(iris_df['sepal width (cm)'], iris_df['petal width (cm)'], c=iris_df['label'])
plt.show()
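To view all pairwise feature relationships at once rather than one pair at a time, seaborn's pairplot is a handy complement; a minimal sketch:
# Pairwise scatter plots of all four features, colored by class label
sns.pairplot(iris_df, hue='label')
plt.show()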
4. Splitting the dataset
# With the same random_state (random seed), the random split is reproducible
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=10)
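A quick sanity check on the split sizes; stratify is a standard train_test_split option if you also want the class proportions preserved in both parts:
# 80/20 split of 150 samples: 120 for training, 30 for testing
print(x_train.shape, x_test.shape)  # (120, 4) (30, 4)
# Optional: keep class proportions identical in train and test sets
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=10, stratify=iris.target)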
5. Feature scaling (normalization and standardization)
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Read the data
data = pd.read_csv('./data/dating.txt')
# Min-max normalization transformer
# Instantiate the transformer
transfer = MinMaxScaler()
# fit: compute the transformation parameters (per-column min and max) and store them in the transformer
transfer.fit(data.iloc[:, :3])
# transform: rescale the data using the stored parameters
transfer.transform(data.iloc[:, :3])
# fit_transform: compute the parameters and transform in one call
transfer.fit_transform(data.iloc[:, :3])
# Standardization transformer
# Instantiate the transformer
transfer = StandardScaler()
# fit: compute the transformation parameters (per-column mean and standard deviation) and store them in the transformer
transfer.fit(data.iloc[:, :3])
# transform: standardize the data using the stored parameters
transfer.transform(data.iloc[:, :3])
# fit_transform: compute the parameters and transform in one call
transfer.fit_transform(data.iloc[:, :3])
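To confirm what the two scalers actually do (min-max: x' = (x - min) / (max - min); standardization: x' = (x - mean) / std), check the output statistics directly; a minimal sketch:
# Each min-max scaled column should span [0, 1]
scaled = MinMaxScaler().fit_transform(data.iloc[:, :3])
print(scaled.min(axis=0), scaled.max(axis=0))
# Each standardized column should have ~0 mean and ~1 standard deviation
standardized = StandardScaler().fit_transform(data.iloc[:, :3])
print(standardized.mean(axis=0), standardized.std(axis=0))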
6. Predicting iris species
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# 1. Load the dataset
iris = load_iris()
# 2. Basic data preparation
# x_train, x_test, y_train, y_test: training features, test features, training targets, test targets
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
# 3. Feature engineering: standardization
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4. Machine learning (model training)
estimator = KNeighborsClassifier(n_neighbors=9)
estimator.fit(x_train, y_train)
# 5. Model evaluation
# Method 1: compare predictions against the true labels
y_predict = estimator.predict(x_test)
print("Predictions:\n", y_predict)
print("Predictions vs. true labels:\n", y_predict == y_test)
# Method 2: compute the accuracy directly
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)
7. Using cross-validation and grid search
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# 1. Load the dataset
iris = load_iris()
# 2. Basic data preparation: dataset split
# x_train, x_test, y_train, y_test: training features, test features, training targets, test targets
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22) # random_state: random seed
# 3. Feature engineering: standardization
# 3.1 Instantiate the transformer
transfer = StandardScaler()
# 3.2 Transform the data
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4. Machine learning (model training)
# 4.1 Build the model: instantiate the estimator
# estimator = KNeighborsClassifier(n_neighbors=9)
estimator = KNeighborsClassifier(algorithm='kd_tree')
# Cross-validation and grid search
# Build the parameter grid (no separate fit of the base estimator is needed; GridSearchCV fits it internally during the search)
param_dict = {'n_neighbors': [1, 3, 5, 9, 11]}
estimator_gscv = GridSearchCV(estimator, param_grid=param_dict, cv=4)
# 4.2 Train the model
estimator_gscv.fit(x_train, y_train)
# 5. Model evaluation
# Accuracy on the test set
score = estimator_gscv.score(x_test, y_test)  # accuracy of the best model found by the search
# Best model
estimator_gscv.best_estimator_
# Best hyperparameters
estimator_gscv.best_params_
# Full cross-validation results
estimator_gscv.cv_results_
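cv_results_ is a plain dict of arrays, so it is often easiest to browse as a DataFrame; a minimal sketch:
import pandas as pd
# One row per candidate value of n_neighbors
cv_df = pd.DataFrame(estimator_gscv.cv_results_)
print(cv_df[['param_n_neighbors', 'mean_test_score', 'std_test_score', 'rank_test_score']])
print(estimator_gscv.best_score_)  # mean cross-validated score of the best parameters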
8. Predicting Facebook check-in locations
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# 1. Load the data with read_csv
data = pd.read_csv('./data/train.csv')
# 2. Basic data preparation
# Extract explicit time features from the timestamp (weekday, day of month, hour),
# drop place_ids with too few check-ins, then split the dataset
time = pd.to_datetime(data['time'], unit='s')
time = pd.DatetimeIndex(time)
data['weekday'] = time.weekday
data['day'] = time.day
data['hour'] = time.hour
# Drop place_ids with too few check-ins
temp = data.groupby('place_id')['row_id'].count()
res = temp[temp > 3].index
data = data[data['place_id'].isin(res)]
# Dataset split
x = data[['x','y','accuracy','weekday','day','hour']]
y = data['place_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# 3. Feature engineering: standardization
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4. Machine learning
# 4.1 Build the model
estimator = KNeighborsClassifier(algorithm='kd_tree')
# Build the parameter grid
param_dict = {'n_neighbors': [1, 3, 5]}
# Instantiate the GridSearchCV estimator
estimator_gscv = GridSearchCV(estimator, param_grid=param_dict, cv=3)
# 4.2 Train the model
estimator_gscv.fit(x_train, y_train)
# 5. Model evaluation: classification accuracy
estimator_gscv.score(x_test, y_test)
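As in the previous section, the fitted search object exposes the winning configuration and can predict directly; a minimal sketch:
print(estimator_gscv.best_params_)  # e.g. {'n_neighbors': 5}
print(estimator_gscv.best_score_)   # mean cross-validated accuracy of the best model
y_predict = estimator_gscv.predict(x_test)  # place_id predictions for the test set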
9. First use of the linear regression API
from sklearn.linear_model import LinearRegression
# Prepare the data
x = [[80, 86],
     [82, 80],
     [85, 78],
     [90, 90],
     [86, 82],
     [82, 90],
     [78, 80],
     [92, 94]]
y = [84.2, 80.6, 80.1, 90, 83.2, 87.6, 79.4, 93.4]
# Machine learning
estimator = LinearRegression()
estimator.fit(x, y)
# Regression coefficients
estimator.coef_
# Intercept (bias)
estimator.intercept_
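With the model fitted, predictions follow from y = coef · x + intercept; a minimal sketch with a made-up score pair:
import numpy as np
# Predict the final grade for a hypothetical student scoring [100, 80]
print(estimator.predict([[100, 80]]))
# The same value computed by hand from the learned parameters
print(np.dot([100, 80], estimator.coef_) + estimator.intercept_)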