1 sklearn玩具数据集介绍
数据量小,数据在sklearn库的本地,只要安装了sklearn,不用上网就可以获取
2 sklearn现实世界数据集介绍
数据量大,数据只能通过网络获取
3 sklearn加载玩具数据集
示例1:鸢尾花数据
# Iris toy dataset demo: ships locally with sklearn, no network needed.
from sklearn.datasets import load_iris

data = load_iris()          # Bunch object bundling features, targets and metadata
print(data)                 # the whole dataset bundle
print(data.data)            # features: everything except the target column (bug fix: was `print(data, data)`)
print(data.target)          # targets: the class label for each sample
print(data.feature_names)   # feature names
print(data.target_names)    # class names
print(data.DESCR)           # dataset description text
鸢尾花数据集介绍
特征有:
花萼长 sepal length
花萼宽sepal width
花瓣长 petal length
花瓣宽 petal width
三分类:
0-Setosa山鸢尾
1-Versicolour变色鸢尾
2-Virginica维吉尼亚鸢尾
# Build one DataFrame from the iris features plus the label column,
# then split it into train/test feature and target arrays.
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

iris = load_iris()
features = iris.data
labels = iris.target
# print(features)
# print(labels)

# turn the 1-D label vector into a column so it can sit beside the features
labels = labels.reshape(len(labels), 1)
# print(labels)
combined = np.hstack((features, labels))
data_frame1 = pd.DataFrame(
    combined,
    columns=["花萼长(cm)", "花萼宽(cm)", "花瓣长(cm)", "花瓣宽(cm)", "类别"],
)
# print(data_frame1)
data_frame1

from sklearn.model_selection import train_test_split

X = data_frame1[["花萼长(cm)", "花萼宽(cm)", "花瓣长(cm)", "花瓣宽(cm)"]].values
Y = data_frame1["类别"].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=4)
print(x_train, x_test, y_train, y_test)
示例2:分析糖尿病数据集
这是回归数据集,有442个样本,因此也有442个目标值。
# Example 2: the diabetes regression dataset (442 samples, one target each).
# Bug fix: the original cell loaded breast_cancer, which does not match this
# section's heading and description (a 442-sample regression dataset).
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

data = load_diabetes()
print(data.feature_names)
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)
print("x:\n", x_train.shape, x_test.shape)
print("y:\n", y_train.shape, y_test.shape)
4 sklearn获取现实世界数据集
示例:获取20分类新闻数据
# Fetch the 20-newsgroups dataset over the network, caching under ./src.
from sklearn import datasets

path = datasets.get_data_home()  # default local cache directory
# print(path)
result = datasets.fetch_20newsgroups(data_home="./src", subset="all", return_X_y=True)
x, y = result
print(len(x))
print(len(y))
5 本地csv数据
方式1:打开记事本,写出如下数据,数据之间使用英文下的逗号, 保存文件后把后缀名改为csv
csv文件可以使用excel打开
milage,Liters,Consumtime,target 40920,8.326976,0.953952,3 14488,7.153469,1.673904,2 26052,1.441871,0.805124,1 75136,13.147394,0.428964,1
方式2:创建excel 文件, 填写数据,以csv为后缀保存文件
(2) pandas加载csv
使用pandas的read_csv("文件路径")函数可以加载csv文件,得到的结果为数据的DataFrame形式
# Load the local CSV into a DataFrame (path is relative to the working directory).
pd.read_csv("./src/ss.csv")
# Load a local CSV dataset, split it, then select two column subsets.
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("./src/instacart/orders.csv")
print(data.shape)
train, test = train_test_split(data, test_size=0.8, random_state=40)
# print("x:\n", train)
# print("y:\n", test)
# Bug fix: `.loc[..., cols]` used Ellipsis, which is not a valid pandas row
# indexer; `:` selects all rows.
M = data.loc[:, ["order_id", "user_id", "eval_set", "order_number", "order_dow"]]
N = data.loc[:, ["order_hour_of_day", "days_since_prior_order"]]
print(M)
print(N)
6 数据集的划分
列表、ndarray数据集划分
# train_test_split accepts plain Python lists as well as ndarrays.
from sklearn.model_selection import train_test_split

data1 = [1, 2, 3, 4, 5]
data2 = ["a", "b", "c", "d", "e"]

# splitting two sequences at once returns all four pieces in one list
arr = train_test_split(data2, data1, test_size=0.4)
print(arr)

# with a fixed random_state the split is reproducible
x, y, m, n = train_test_split(data1, data2, test_size=0.2, random_state=2)
print(x, y, m, n)
print(type(x), type(y), type(m), type(n))
二维数组、DataFrame数据集划分
# Split a 2-D ndarray, then the same data wrapped as a DataFrame.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

data1 = np.arange(0, 25, 1)
data1.shape = (5, 5)  # reshape the flat range into a 5x5 2-D array
print(data1)

data1 = pd.DataFrame(
    data1,
    index=[1, 2, 3, 4, 5],
    columns=["one", "two", "three", "four", "five"],
)
print(data1)

a, b = train_test_split(data1, test_size=0.4, random_state=10)
print("a:\n", a)
print("b:\n", b)
字典数据集划分
划分非稀疏矩阵
为了将字典列表转换为特征向量,可以使用 DictVectorizer
。这个转换器旨在处理类别数据和数值数据的混合型数据集。以下是其主要工作原理:
- 处理类别特征:对于每个不同的类别特征,DictVectorizer 会创建一个新的二进制特征。如果原始数据中的某个样本属于该类别,则对应的特征值为1;如果不属于,则为0。
- 处理数值特征:数值特征则保持不变,直接作为特征的一部分。
# Vectorize a list of dicts (one-hot the categorical 'city', pass numeric
# fields through unchanged), then split the resulting matrix.
from sklearn.feature_extraction import DictVectorizer
# Bug fix: train_test_split was used but never imported in this cell, so it
# only ran if an earlier cell had imported it.
from sklearn.model_selection import train_test_split

data = [
    {'city': '成都', 'age': 30, 'temperature': 20},
    {'city': '重庆', 'age': 33, 'temperature': 60},
    {'city': '北京', 'age': 42, 'temperature': 80},
    {'city': '上海', 'age': 22, 'temperature': 70},
    {'city': '成都', 'age': 72, 'temperature': 40},
    {'city': '成都', 'age': 12, 'temperature': 49},
]
transfer = DictVectorizer(sparse=True)   # sparse=True returns a CSR matrix
data_new = transfer.fit_transform(data)
x = data_new.toarray()                   # densify before splitting
train, test = train_test_split(x, test_size=0.2)
print(train)
print(test)
葡萄酒数据集划分
# Wine toy dataset: split features/targets and inspect shapes.
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

wine = load_wine()
print(wine.feature_names)

split = train_test_split(wine.data, wine.target, test_size=0.2, random_state=20)
x_train, x_test, y_train, y_test = split
print("x:\n", x_train, x_test)
print("y:\n", y_train, y_test)
print("x:\n", x_train.shape, x_test.shape)
print("y:\n", y_train.shape, y_test.shape)
现实世界数据集划分
# Real-world dataset: California housing, downloaded/cached under ./src.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

x, y = fetch_california_housing(data_home="./src", return_X_y=True)
split = train_test_split(x, y, test_size=0.2, random_state=44)
x_train, x_test, y_train, y_test = split
print("x:\n", x_train, x_test)
print("y:\n", y_train, y_test)
print("x:\n", x_train.shape, x_test.shape)
print("y:\n", y_train.shape, y_test.shape)