Pandas
- read_csv()
- df.columns.tolist() #获取所有的列名字
- df.dtypes.value_counts() # 有多少数据类型统计
- select_dtypes:连续数据和离散数据的拆分
# Split the frame into categorical (object) and numeric columns, then
# one-hot encode the former and standardize the latter.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

str_list = df.select_dtypes(include=['object'])
print(str_list.columns)
# LabelBinarizer is meant for a single label column and rejects a frame with
# several columns; OneHotEncoder is the transformer for a feature table.
encoder = OneHotEncoder()
# fit_transform returns a sparse matrix by default; convert it to a dense array.
str_1hot = encoder.fit_transform(str_list).toarray()
float_list = df.select_dtypes(include=['int64', 'float64'])
print(float_list.columns)
scal = StandardScaler()
floot_tr = scal.fit_transform(float_list)
import pandas
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
# Load the workbook and take a first look at its columns and dtypes.
# read_excel already returns a DataFrame, so no extra DataFrame() wrapper is needed.
data = pandas.read_excel(r'C:\DATA\deeplearning\IPA2_09-20-2019 19-16-43.xlsx')
df = data
col_list = df.columns.tolist()  # all column names
type_list = df.dtypes.value_counts()  # number of columns per dtype
print(type_list)
str_list = df.select_dtypes(include=['object'])
print(str_list.columns)
float_list = df.select_dtypes(include=['float64'])
#print(float_list.describe())
# NOTE: a bare `float_list.dropna()` has no effect, because dropna() returns a
# new frame instead of modifying the caller. Rows are left untouched here;
# write `float_list = float_list.dropna(...)` if rows really should be removed,
# and remember that doing so changes the row count relative to `df`.
# NA handling: remove the rows whose value in one column is missing.
# "ColumnName" is a placeholder for a real column label.
# List the rows with a missing value from the original frame; looking them up
# in df2 would always give an empty result because dropna() has removed them.
nulllist = df[df["ColumnName"].isnull()]
df2 = df.dropna(axis=0, subset=["ColumnName"])  # axis=0 operates on rows
单列转换
# Standardize a single column. The 1-D column values are reshaped to an
# (n, 1) matrix because the scaler works on 2-D input.
newdata = float_list['TE Margin Accuracy'].values
std = StandardScaler()
float_tr = std.fit_transform(newdata.reshape(-1, 1))
print(float_tr)
float_list
删除空列
# Drop every column that contains no values at all (count() == 0).
# dropna(axis=1, how='all') does this in one call and returns a new frame,
# which avoids dropping columns in place on the result of select_dtypes.
float_list = float_list.dropna(axis=1, how='all')
# Standardize all numeric columns at once (reminder: axis=1 means a column-wise operation).
# print(float_list.describe())
newdata = float_list.values
# Fit the scaler first, then apply it to the same matrix.
std = StandardScaler().fit(newdata)
float_tr = std.transform(newdata)
float_tr.shape
# Convert the categorical columns to integer codes, one column at a time.
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the per-column assignments below do not write into a
# frame derived from df.
obj_list = df.select_dtypes(include=['object']).copy()
print(obj_list.columns)
# LabelEncoder yields one integer code per value, which fits into a single
# column. LabelBinarizer returns a 2-D indicator matrix for more than two
# classes and cannot be assigned back to one column.
encode = LabelEncoder()
for col in obj_list.columns:
    # The encoder is re-fitted on every column, so codes are per-column.
    obj_list[col] = encode.fit_transform(obj_list[col])
obj_list
# Place the scaled numeric matrix and the encoded categorical columns side by
# side in one array (column-wise concatenation).
newdf = np.concatenate((float_tr, obj_list), axis=1)
print(newdf.shape)
插入均值
# Fill missing values with the column mean.
# sklearn.preprocessing.Imputer was removed from scikit-learn (0.22);
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='mean')
# NOTE(review): the 'mean' strategy needs numeric input - confirm that `data`
# holds no text columns before calling this.
print('Mean:\n', imp.fit_transform(data))
插入中位数
# Fill missing values with the column median.
# sklearn.preprocessing.Imputer was removed from scikit-learn (0.22);
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='median')
# NOTE(review): the 'median' strategy needs numeric input - confirm that
# `data` holds no text columns before calling this.
print('Median:\n', imp.fit_transform(data))
LabelEncoder: xxx为属性名
# Replace the values of one column with integer codes; 'xxx' is a placeholder
# for the column name.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
dataset['xxx'] = encoder.fit_transform(dataset['xxx'])
One-hot 编码(pd.get_dummies):单个特征编码,xxx 为属性名
import pandas as pd

# One-hot encode the single column `xxx` (placeholder name) and append the
# resulting indicator columns to the frame.
dataset = dataset.join(other=pd.get_dummies(data=dataset.xxx))
One-hot 编码(pd.get_dummies):所有特征编码
import pandas as pd

# One-hot encode the whole frame. get_dummies is a module-level pandas
# function, not a DataFrame method, so it is called as pd.get_dummies(...).
dataset = pd.get_dummies(dataset)