导入data类型的数据集
使用pandas的read_table读取data数据
# Read the raw .data file into a DataFrame using read_table's default settings.
data = pd.read_table("./wdbc.data")
将data数据转化为csv文件,并读取csv文件
# Dump the frame to a space-delimited file without the index column,
# then parse that file back in with no header row.
data.to_csv("./machine.csv", index=False, sep=" ")
csv = pd.read_csv("./machine.csv", header=None)
将数据集分成feature和target
# Every column from index 2 onward is a feature; column 1 is the target.
X = csv.iloc[:, 2:]
Y = csv.iloc[:, 1]
将target数据集中的字符串标签转化成数字类型
# Encode the two class labels as integers: "M" becomes -1 and "B" becomes 1.
Y_class = dict(M=-1, B=1)
Y = Y.map(Y_class)
将pandas的DataFrame类型转为numpy的ndarray类型
# Convert both pandas objects to NumPy arrays.
X = X.to_numpy()
Y = Y.to_numpy()
标准化(StandardScaler:每个特征减去均值后除以标准差)
# Standardize the feature matrix with a StandardScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/test split that follows, so test rows contribute to its statistics.
std = StandardScaler()
X_norm = std.fit(X).transform(X)
划分数据集
# The first 70% of the rows (in their existing order, no shuffling) form the
# training set; the remaining rows form the test set.
split_at = int(len(X_norm) * 0.7)
X_train, X_test = X_norm[:split_at], X_norm[split_at:]
y_train, y_test = Y[:split_at], Y[split_at:]
总代码
from sklearn import datasets
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
def datasets(data_path: str = "/home/ylkong/Text Summary/wdbc.data",
             train_ratio: float = 0.7):
    """Load the WDBC data file and return a standardized train/test split.

    The file is parsed as header-less, comma-separated text. Column 1 holds
    the class label ("M" or "B") and every column from index 2 onward is
    treated as a feature; column 0 is not used (presumably a sample ID --
    confirm against the data file).

    Rows are split in file order, without shuffling: the first
    ``int(n_rows * train_ratio)`` rows become the training set and the rest
    become the test set. The ``StandardScaler`` is fitted on the training
    rows only and then applied to the test rows, so test-set statistics do
    not influence the training features.

    NOTE: this function has the same name as the ``datasets`` module
    imported from ``sklearn`` at the top of the file and rebinds that name.

    Args:
        data_path: Path of the data file to read.
        train_ratio: Fraction of rows used for training; must lie strictly
            between 0 and 1.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays.
        Labels are encoded as -1 for "M" and 1 for "B".

    Raises:
        ValueError: If ``train_ratio`` is outside (0, 1), if the label column
            contains a value other than "M" or "B", or if the training
            partition would be empty.
    """
    if not 0.0 < train_ratio < 1.0:
        raise ValueError(
            "train_ratio must be strictly between 0 and 1, got {!r}".format(
                train_ratio
            )
        )

    # Parse the file directly; no intermediate copy is written to disk.
    frame = pd.read_csv(data_path, header=None)

    features = frame.iloc[:, 2:].to_numpy()

    # Series.map leaves NaN for any label missing from the mapping, so check
    # explicitly instead of handing NaN targets to the caller.
    encoded = frame.iloc[:, 1].map({"M": -1, "B": 1})
    if encoded.isna().any():
        raise ValueError("label column must contain only 'M' or 'B'")
    labels = encoded.to_numpy()

    split_at = int(len(features) * train_ratio)
    if split_at == 0:
        raise ValueError("not enough rows to form a training partition")

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(features[:split_at])
    X_test = scaler.transform(features[split_at:])
    y_train = labels[:split_at]
    y_test = labels[split_at:]
    return X_train, y_train, X_test, y_test