-
读文件
>>> import pandas as pd
>>> from io import StringIO
>>> csv_data = \
... '''A,B,C,D
... 1.0,2.0,3.0,4.0
... 5.0,6.0,,8.0
... 10.0,11.0,12.0,'''
>>> # If you are using Python 2.7, you need
>>> # to convert the string to unicode:
>>> # csv_data = unicode(csv_data)
>>> df = pd.read_csv(StringIO(csv_data))
>>> print (df)
A B C D
0 1.0 2.0 3.0 4.0
1 5.0 6.0 NaN 8.0
2 10.0 11.0 12.0 NaN
缺失数据处理
- 计算每列为空数量
print (df.isnull().sum())
A 0
B 0
C 1
D 1
- 删除有空值的行(axis stands for “坐标轴”)
- 删除所有值为空的行
- 删除特定列里出现空值的行
- 填补缺失值(imputing missing values)
- 处理分类数据
- 映射原始数据(新数据替换原始数据)
- 编码
- 给名词型数据编码One Hot Encoding
- 分离训练集和测试集(带原始数据获取)
>>> df_wine = pd.read_csv('https://archive.ics.uci.edu/'
...                       'ml/machine-learning-databases/'
...                       'wine/wine.data', header=None)
>>> df_wine.columns = ['Class label', 'Alcohol',
... 'Malic acid', 'Ash',
... 'Alcalinity of ash', 'Magnesium',
... 'Total phenols', 'Flavanoids',
... 'Nonflavanoid phenols',
... 'Proanthocyanins',
... 'Color intensity', 'Hue',
... 'OD280/OD315 of diluted wines',
... 'Proline']
>>> import numpy as np
>>> print('Class labels', np.unique(df_wine['Class label']))
Class labels [1 2 3]
>>> print(df_wine.head())
>>> from sklearn.model_selection import train_test_split
>>> X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
>>> X_train, X_test, y_train, y_test =\
... train_test_split(X, y,
... test_size=0.3,
... random_state=0,
... stratify=y)
- 特征缩放(将各特征放入相同数量级下;此处用 MinMaxScaler 做归一化,把每个特征缩放到 [0, 1])
>>> from sklearn.preprocessing import MinMaxScaler
>>> mms = MinMaxScaler()
>>> X_train_norm = mms.fit_transform(X_train)
>>> X_test_norm = mms.transform(X_test)
图像(matplotlib)
散点图(scatter plot)
两变量(two variables)
import matplotlib.pyplot as plt
import numpy as np
# Sample data: N points with x and y drawn independently and uniformly
# from [0, 1).
N = 100
x_data = np.random.rand(N)
y_data = np.random.rand(N)
def scatterplot(x_data, y_data, x_label="", y_label="", title="",
                color="r", yscale_log=False):
    """Draw a scatter plot of ``y_data`` against ``x_data`` on a new figure.

    The figure is only created here and the function returns ``None``;
    call ``plt.show()`` afterwards to display it.

    Args:
        x_data: x coordinates of the points.
        y_data: y coordinates of the points.
        x_label: label for the x axis.
        y_label: label for the y axis.
        title: title placed above the plot.
        color: color applied to every point.
        yscale_log: when true, the y axis uses a logarithmic scale.
    """
    # Create the plot object
    _, ax = plt.subplots()

    # Plot the data, set the size (s), color and transparency (alpha)
    # of the points
    ax.scatter(x_data, y_data, s=10, color=color, alpha=0.75)

    if yscale_log:
        ax.set_yscale('log')

    # Label the axes and provide a title
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# scatterplot() returns None, so its result must not be handed to
# plt.show(): draw the figure first, then display it.
scatterplot(x_data, y_data)
plt.show()
三变量(three variables)
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
# Sample data: N points with x, y and z drawn independently and uniformly
# from [0, 1).
N = 100
x_data = np.random.rand(N)
y_data = np.random.rand(N)
z_data = np.random.rand(N)
def scatterplot(x_data, y_data, z_data, x_label="", y_label="", z_label="",
                title="", color="r", yscale_log=False):
    """Draw a 3D scatter plot of the given points on a new figure.

    The figure is only created here and the function returns ``None``;
    call ``plt.show()`` afterwards to display it.

    Args:
        x_data: x coordinates of the points.
        y_data: y coordinates of the points.
        z_data: z coordinates of the points.
        x_label: label for the x axis.
        y_label: label for the y axis.
        z_label: label for the z axis.
        title: title placed above the plot.
        color: color applied to every point.
        yscale_log: when true, the y axis uses a logarithmic scale.
    """
    figure = plt.figure()

    # Ask the figure for its 3D axes instead of calling Axes3D(figure):
    # recent Matplotlib releases no longer attach an Axes3D built that way
    # to the figure, which leaves the window empty.
    ax3d = figure.add_subplot(projection='3d')
    ax3d.scatter3D(x_data, y_data, z_data, color=color)

    if yscale_log:
        ax3d.set_yscale('log')

    # Label the axes and provide a title
    ax3d.set_title(title)
    ax3d.set_xlabel(x_label)
    ax3d.set_ylabel(y_label)
    ax3d.set_zlabel(z_label)
# Build the labelled 3D figure, then hand control to the GUI to display it.
scatterplot(
    x_data,
    y_data,
    z_data,
    'x',
    'y',
    'z',
    title='3 variables',
)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Sample data: one shared x series and two independent y series, each N
# values drawn uniformly from [0, 1).
N = 100
x_data = np.random.rand(N)
y1_data = np.random.rand(N)
y2_data = np.random.rand(N)
def scatterplot(x_data, y1_data, y2_data, x_label="", y_label="", title="",
                color="r", yscale_log=False):
    """Draw two scatter series that share the same x values on one axes.

    The figure is only created here and the function returns ``None``;
    call ``plt.show()`` afterwards to display it.

    Args:
        x_data: x coordinates shared by both series.
        y1_data: y coordinates of the first series, drawn in ``color``.
        y2_data: y coordinates of the second series, always drawn in green.
        x_label: label for the x axis.
        y_label: label for the y axis.
        title: title placed above the plot.
        color: color of the first series.
        yscale_log: when true, the y axis uses a logarithmic scale.
    """
    # Create the plot object
    _, ax = plt.subplots()

    # Plot both series with the same size (s) and transparency (alpha);
    # only the color distinguishes them.
    ax.scatter(x_data, y1_data, s=10, color=color, alpha=0.75)
    ax.scatter(x_data, y2_data, s=10, color='g', alpha=0.75)

    if yscale_log:
        ax.set_yscale('log')

    # Label the axes and provide a title
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# Draw both series on one figure, then display it.
scatterplot(
    x_data,
    y1_data,
    y2_data,
)
plt.show()
线形图(line plot)
def lineplot(x_data, y_data, x_label="", y_label="", title=""):
    """Draw ``y_data`` against ``x_data`` as a single line on a new figure.

    Nothing is returned; call ``plt.show()`` afterwards to display the
    figure.

    Args:
        x_data: x coordinates of the line's points.
        y_data: y coordinates of the line's points.
        x_label: label for the x axis.
        y_label: label for the y axis.
        title: title placed above the plot.
    """
    axes = plt.subplots()[1]

    # Fixed appearance of the line: width, color and full opacity.
    line_style = {'lw': 2, 'color': '#539caf', 'alpha': 1}
    axes.plot(x_data, y_data, **line_style)

    # Annotate the figure.
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
# Pass the x-axis label as a keyword argument (the original `==` compared
# against an undefined name), and show the figure in a separate step because
# lineplot() returns None.
lineplot(x_data, y_data, x_label="x_label", y_label="y_label", title="title")
plt.show()
统计直方图(Histograms)
变量为连续值,会把连续值分为几个不重叠的区间,统计落入每个区间的个数或概率;每个区间称为一个 bin,区间个数即 n_bins。
# Number of equal-width intervals (bins) used by the example call.
n_bins = 10


def histogram(x_data, n_bins, cumulative=False, x_label="", y_label="", title=""):
    """Draw a histogram of ``x_data`` on a new figure.

    Nothing is returned; call ``plt.show()`` afterwards to display the
    figure.

    Args:
        x_data: values to be binned.
        n_bins: bin specification forwarded to ``Axes.hist``.
        cumulative: forwarded to ``Axes.hist``; when true each bar also
            includes the counts of the bins before it.
        x_label: label for the x axis.
        y_label: label for the y axis.
        title: title placed above the plot.
    """
    axes = plt.subplots()[1]
    axes.hist(x_data, bins=n_bins, cumulative=cumulative, color='#539caf')

    # Annotate the figure.
    axes.set_ylabel(y_label)
    axes.set_xlabel(x_label)
    axes.set_title(title)
# histogram() returns None, so draw the figure first and then display it
# instead of passing the result to plt.show().
histogram(x_data, n_bins)
plt.show()
条形统计图(Bar chart)
变量为离散值,一般是分类。
import matplotlib.pyplot as plt
# Bar chart of a discrete (categorical) variable: one bar per language.
fig = plt.figure()

# Axes rectangle given as [left, bottom, width, height] in figure fractions.
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

# Categories and the bar height for each of them.
langs = ['C', 'C++', 'Java', 'Python', 'PHP']
students = [23, 17, 35, 29, 12]

ax.bar(langs, height=students)
plt.show()
盒图(Box plot)
反映数据的分布与离散程度:盒子上边缘(上四分位数 Q3)以上是最大的 25% 数据,盒子上半部分(Q3 到中位数)是接下来的 25%,下半部分(中位数到下四分位数 Q1)又是 25%,下边缘(Q1)以下是最小的 25%。
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
# Reproducible sample: 200 draws from a normal distribution with mean 100
# and standard deviation 20.
np.random.seed(10)
data = np.random.normal(loc=100, scale=20, size=200)

# A 10 x 7 inch figure to hold the plot.
fig = plt.figure(figsize=(10, 7))

# Draw the box plot of the sample on the current figure.
plt.boxplot(data)

# Display the figure.
plt.show()