DataMining数据预处理

  • 读文件

>>> import pandas as pd
>>> from io import StringIO
>>> csv_data = \
... '''A,B,C,D
... 1.0,2.0,3.0,4.0
... 5.0,6.0,,8.0
... 10.0,11.0,12.0,'''
>>> # If you are using Python 2.7, you need
>>> # to convert the string to unicode:
>>> # csv_data = unicode(csv_data)
>>> df = pd.read_csv(StringIO(csv_data))
>>> print (df)

A B C D

0 1.0 2.0 3.0 4.0

1 5.0 6.0 NaN 8.0

2 10.0 11.0 12.0 NaN

缺失数据处理

  • 计算每列为空数量
>>> print(df.isnull().sum())

A 0

B 0

C 1

D 1

  • 删除有空值的行(axis stands for “坐标轴”)
  • 删除所有值为空的行
  • 删除特定列里出现空值的行 
  • 转移缺失值
  • 处理分类数据
  • 映射原始数据(新数据替换原始数据)
  •  编码
  • 给名词型数据编码One Hot Encoding 
  • 分离训练集和测试集(带原始数据获取)
>>> df_wine = pd.read_csv('https://archive.ics.uci.edu/'
... 'ml/machine-learning-databases/'
... 'wine/wine.data', header=None)
>>> df_wine.columns = ['Class label', 'Alcohol',
... 'Malic acid', 'Ash',
... 'Alcalinity of ash', 'Magnesium',
... 'Total phenols', 'Flavanoids',
... 'Nonflavanoid phenols',
... 'Proanthocyanins',
... 'Color intensity', 'Hue',
... 'OD280/OD315 of diluted wines',
... 'Proline']
>>> import numpy as np
>>> print('Class labels', np.unique(df_wine['Class label']))
Class labels [1 2 3]
>>> print(df_wine.head())

>>> from sklearn.model_selection import train_test_split
>>> X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
>>> X_train, X_test, y_train, y_test =\
... train_test_split(X, y,
... test_size=0.3,
... random_state=0,
... stratify=y)
  • 标准化数据(将其放入相同数量级下)
>>> from sklearn.preprocessing import MinMaxScaler
>>> mms = MinMaxScaler()
>>> X_train_norm = mms.fit_transform(X_train)
>>> X_test_norm = mms.transform(X_test)

图像(matplotlib)

散点图(scatter plot)

两变量(two variables) 

import matplotlib.pyplot as plt
import numpy as np

# Sample data: N random values per axis, drawn with np.random.rand
N = 100
x_data, y_data = (np.random.rand(N) for _ in range(2))

def scatterplot(x_data, y_data, x_label="", y_label="", title="", color="r", yscale_log=False):
    """Draw a scatter plot of ``y_data`` against ``x_data`` on a new figure.

    Args:
        x_data: Values for the horizontal axis.
        y_data: Values for the vertical axis.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Title shown above the axes.
        color: Color applied to every point.
        yscale_log: When truthy, the y axis uses a logarithmic scale.

    Returns:
        None. The figure is created through pyplot; call ``plt.show()``
        afterwards to display it.
    """
    # Create the plot object
    _, ax = plt.subplots()

    # Plot the data, set the size (s), color and transparency (alpha)
    # of the points
    ax.scatter(x_data, y_data, s=10, color=color, alpha=0.75)

    if yscale_log:
        ax.set_yscale('log')

    # Label the axes and provide a title
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)


# Build the figure first, then display it. scatterplot() returns None, and
# plt.show() must not receive that value as a positional argument.
scatterplot(x_data, y_data)
plt.show()

 三变量(three variables)

import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import  Axes3D
# Sample data: N random values for each of the three axes
N = 100
x_data, y_data, z_data = (np.random.rand(N) for _ in range(3))

def scatterplot(x_data, y_data, z_data, x_label="", y_label="", z_label="", title="", color="r", yscale_log=False):
    """Draw a 3D scatter plot of the points (x, y, z) on a new figure.

    Args:
        x_data: Values for the x axis.
        y_data: Values for the y axis.
        z_data: Values for the z axis.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        z_label: Text for the z-axis label.
        title: Title shown above the axes.
        color: Color applied to every point.
        yscale_log: When truthy, the y axis is switched to a log scale.

    Returns:
        None. Call ``plt.show()`` afterwards to display the figure.
    """
    # Create the figure and request a 3D axes from it. Constructing
    # Axes3D(figure) directly does not attach the axes to the figure on
    # recent Matplotlib releases, which leaves the figure empty.
    figure = plt.figure()
    ax3d = figure.add_subplot(111, projection='3d')

    # Plot the points in the requested color
    ax3d.scatter3D(x_data, y_data, z_data, color=color)

    if yscale_log:
        ax3d.set_yscale('log')

    # Label the axes and provide a title
    ax3d.set_title(title)
    ax3d.set_xlabel(x_label)
    ax3d.set_ylabel(y_label)
    ax3d.set_zlabel(z_label)


scatterplot(x_data, y_data, z_data, 'x', 'y', 'z', title='3 variables')
plt.show()

 

import matplotlib.pyplot as plt
import numpy as np
# Sample data: one shared x series and two y series, N random values each
N = 100
x_data, y1_data, y2_data = (np.random.rand(N) for _ in range(3))
def scatterplot(x_data, y1_data, y2_data, x_label="", y_label="", title="", color="r", yscale_log=False):
    """Draw two scatter series that share the same x values on one axes.

    Args:
        x_data: Values for the horizontal axis, used by both series.
        y1_data: Vertical values of the first series, drawn in ``color``.
        y2_data: Vertical values of the second series, always drawn in green.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Title shown above the axes.
        color: Color of the first series.
        yscale_log: When truthy, the y axis uses a logarithmic scale.

    Returns:
        None. Call ``plt.show()`` afterwards to display the figure.
    """
    # Create the plot object
    _, ax = plt.subplots()

    # Plot both series with the same point size (s) and transparency (alpha)
    ax.scatter(x_data, y1_data, s=10, color=color, alpha=0.75)
    ax.scatter(x_data, y2_data, s=10, color='g', alpha=0.75)

    if yscale_log:
        ax.set_yscale('log')

    # Label the axes and provide a title
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)


scatterplot(x_data, y1_data, y2_data)
plt.show()

线形图(line plot)

def lineplot(x_data, y_data, x_label="", y_label="", title=""):
    """Draw ``y_data`` against ``x_data`` as a single line on a new figure.

    Args:
        x_data: Values for the horizontal axis.
        y_data: Values for the vertical axis.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Title shown above the axes.

    Returns:
        None. Call ``plt.show()`` afterwards to display the figure.
    """
    # Create the plot object
    _, ax = plt.subplots()

    # Plot the line, set the linewidth (lw), color and
    # transparency (alpha) of the line
    ax.plot(x_data, y_data, lw=2, color='#539caf', alpha=1)

    # Label the axes and provide a title
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)


# x_label must be passed with a single "=": "x_label==..." is a comparison on
# an undefined name and raises NameError. The figure is built first and then
# shown, because lineplot() returns None.
lineplot(x_data, y_data, x_label="x_label", y_label="y_label", title="title")
plt.show()

统计直方图(Histograms)

 变量为连续值,会把连续值分为几个不重叠的区间,统计其个数或概率,区间个数称为bin

n_bins = 10


def histogram(x_data, n_bins, cumulative=False, x_label="", y_label="", title=""):
    """Draw a histogram of ``x_data`` on a new figure.

    Args:
        x_data: Values to be binned.
        n_bins: Bin specification forwarded to ``Axes.hist`` as ``bins``.
        cumulative: Forwarded to ``Axes.hist``; when true the histogram is
            cumulative.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Title shown above the axes.

    Returns:
        None. Call ``plt.show()`` afterwards to display the figure.
    """
    _, ax = plt.subplots()
    ax.hist(x_data, bins=n_bins, cumulative=cumulative, color='#539caf')
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)


# Build the figure first, then display it. histogram() returns None, and
# plt.show() must not receive that value as a positional argument.
histogram(x_data, n_bins)
plt.show()

条形统计图(Bar chart)

变量为离散值,一般是分类。

import matplotlib.pyplot as plt
# Categories and the value to draw for each of them
langs = ['C', 'C++', 'Java', 'Python', 'PHP']
students = [23, 17, 35, 29, 12]

# Axes placed at [left, bottom, width, height] in figure-relative units
fig = plt.figure()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

# One bar per language, with the student count as its height
ax.bar(x=langs, height=students)
plt.show()

 

 盒图(Box plot)

 反映方差,盒子上边缘之上是数据的25%,盒子上半部分是25%-50%,下半部分是50%-75%,下边缘以下是75%-100%。

# Import libraries
import matplotlib.pyplot as plt
import numpy as np
# Fix the seed so the sample is reproducible, then draw 200 values
# from a normal distribution with mean 100 and standard deviation 20
np.random.seed(10)
data = np.random.normal(loc=100, scale=20, size=200)

# Figure of 10 x 7 inches holding a single box plot of the sample
fig = plt.figure(figsize=(10, 7))
plt.boxplot(x=data)

# Display the figure
plt.show()

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值