DataMining数据预处理

最新推荐文章于 2024-04-29 22:27:21 发布

YK_Forever

最新推荐文章于 2024-04-29 22:27:21 发布

阅读量169

点赞数

分类专栏：笔记文章标签： python 数据挖掘 matplotlib

本文链接：https://blog.csdn.net/m0_50207094/article/details/119192355

版权

笔记专栏收录该内容

9 篇文章 1 订阅

订阅专栏

读文件

>>>import pandas as pd
>>> from io import StringIO
>>> csv_data = \
... '''A,B,C,D
... 1.0,2.0,3.0,4.0
... 5.0,6.0,,8.0
... 10.0,11.0,12.0,'''
>>> # If you are using Python 2.7, you need
>>> # to convert the string to unicode:
>>> # csv_data = unicode(csv_data)
>>> df = pd.read_csv(StringIO(csv_data))
>>> print (df)

A B C D

0 1.0 2.0 3.0 4.0

1 5.0 6.0 NaN 8.0

2 10.0 11.0 12.0 NaN

缺失数据处理

计算每列为空数量

 # Count the missing (null) entries in each column of df.
 print(df.isnull().sum())

A 0

B 0

C 1

D 1

删除有空值的行（axis 表示“轴”：axis=0 对应行，axis=1 对应列）

删除所有值为空的行
删除特定列里出现空值的行
填补缺失值（imputation）
处理分类数据
映射原始数据（新数据替换原始数据）
编码
给名词型数据编码One Hot Encoding

分离训练集和测试集（带原始数据获取）

>>> df_wine = pd.read_csv('https://archive.ics.uci.edu/'
'ml/machine-learning-databases/'
'wine/wine.data', header=None)
>>> df_wine.columns = ['Class label', 'Alcohol',
... 'Malic acid', 'Ash',
... 'Alcalinity of ash', 'Magnesium',
... 'Total phenols', 'Flavanoids',
... 'Nonflavanoid phenols',
... 'Proanthocyanins',
... 'Color intensity', 'Hue',
... 'OD280/OD315 of diluted wines',
... 'Proline']
>>> print('Class labels', np.unique(df_wine['Class label']))
Class labels [1 2 3]
>>> print(df_wine.head())

>>> from sklearn.model_selection import train_test_split
>>> X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
>>> X_train, X_test, y_train, y_test =\
... train_test_split(X, y,
... test_size=0.3,
... random_state=0,
... stratify=y)

归一化数据（将各特征缩放到相同的取值范围）

>>> from sklearn.preprocessing import MinMaxScaler
>>> mms = MinMaxScaler()
>>> X_train_norm = mms.fit_transform(X_train)
>>> X_test_norm = mms.transform(X_test)

图像(matplotlib)

散点图(scatter plot)

两变量(two variables)

import matplotlib.pyplot as plt
import numpy as np

# Create the data: two independent arrays of N random values each.
N = 100
x_data = np.random.rand(N)
y_data = np.random.rand(N)

def scatterplot(x_data, y_data, x_label="", y_label="", title="", color="r", yscale_log=False):
    """Draw a scatter plot of ``y_data`` against ``x_data`` on a new figure.

    The figure is only built here; call ``plt.show()`` afterwards to display it.

    Args:
        x_data: Values for the horizontal axis.
        y_data: Values for the vertical axis.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Text for the plot title.
        color: Color used for the points.
        yscale_log: If truthy, the y axis is switched to a logarithmic scale.

    Returns:
        None.
    """
    # Create the plot object
    _, ax = plt.subplots()

    # Plot the data, set the size (s), color and transparency (alpha)
    # of the points
    ax.scatter(x_data, y_data, s=10, color=color, alpha=0.75)

    if yscale_log:
        ax.set_yscale('log')

    # Label the axes and provide a title
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# Build the figure first, then display it. plt.show() is called without
# positional arguments because scatterplot() returns None.
scatterplot(x_data, y_data)
plt.show()

三变量(three variables)

import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import  Axes3D
# Create the data: three independent arrays of N random values each.
N = 100
x_data = np.random.rand(N)
y_data = np.random.rand(N)
z_data = np.random.rand(N)

def scatterplot(x_data, y_data, z_data, x_label="", y_label="", z_label="", title="", color="r", yscale_log=False):
    """Draw a 3D scatter plot of the three data series on a new figure.

    The figure is only built here; call ``plt.show()`` afterwards to display it.

    Args:
        x_data: Values for the x axis.
        y_data: Values for the y axis.
        z_data: Values for the z axis.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        z_label: Text for the z-axis label.
        title: Text for the plot title.
        color: Color used for the points.
        yscale_log: If truthy, the y axis is switched to a logarithmic scale.

    Returns:
        None.
    """
    # Create the plot object
    figure = plt.figure()
    # Create the 3D axes through the figure so they are registered on it;
    # a bare Axes3D(figure) is not attached automatically by recent matplotlib.
    ax3d = figure.add_subplot(projection='3d')
    # Plot the data and set the color of the points
    ax3d.scatter3D(x_data, y_data, z_data, color=color)
    if yscale_log:
        ax3d.set_yscale('log')
    # Label the axes and provide a title
    ax3d.set_title(title)
    ax3d.set_xlabel(x_label)
    ax3d.set_ylabel(y_label)
    ax3d.set_zlabel(z_label)
# Plot the three series with labelled axes, then display the figure.
scatterplot(
    x_data, y_data, z_data,
    x_label='x', y_label='y', z_label='z',
    title='3 variables',
)
plt.show()

import matplotlib.pyplot as plt
import numpy as np
# Create the data: one shared x array and two y series, N random values each.
N = 100
x_data = np.random.rand(N)
y1_data = np.random.rand(N)
y2_data = np.random.rand(N)
def scatterplot(x_data, y1_data, y2_data, x_label="", y_label="", title="", color="r", yscale_log=False):
    """Draw two y series against the same x values on one new set of axes.

    The figure is only built here; call ``plt.show()`` afterwards to display it.

    Args:
        x_data: Values for the horizontal axis, shared by both series.
        y1_data: First y series, drawn in ``color``.
        y2_data: Second y series, always drawn in green ('g').
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Text for the plot title.
        color: Color used for the first series.
        yscale_log: If truthy, the y axis is switched to a logarithmic scale.

    Returns:
        None.
    """
    # Create the plot object
    _, ax = plt.subplots()
    # Plot the data, set the size (s), color and transparency (alpha)
    # of the points
    ax.scatter(x_data, y1_data, s=10, color=color, alpha=0.75)
    ax.scatter(x_data, y2_data, s=10, color='g', alpha=0.75)
    if yscale_log:
        ax.set_yscale('log')
    # Label the axes and provide a title
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# Draw both y series against the shared x values, then display the figure.
scatterplot(x_data, y1_data, y2_data)
plt.show()

线形图(line plot)

def lineplot(x_data, y_data, x_label="", y_label="", title=""):
    """Plot ``y_data`` against ``x_data`` as a single line on a new figure.

    Only builds the figure; nothing is displayed and nothing is returned.
    """
    # plt.subplots() returns (figure, axes); only the axes are needed here.
    axes = plt.subplots()[1]

    # Draw the line with a fixed width (lw), color and full opacity (alpha).
    axes.plot(x_data, y_data, lw=2, color='#539caf', alpha=1)

    # Axis labels first, then the title.
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)
# Build the line plot with placeholder labels, then display it.
# lineplot() returns None, so plt.show() is called on its own.
lineplot(x_data, y_data, x_label="x_label", y_label="y_label", title="title")
plt.show()

统计直方图(Histograms)

变量为连续值时，直方图会把取值范围划分为若干个互不重叠的区间，并统计落入每个区间的样本个数或概率；每个区间称为一个 bin，区间的个数即 bin 的数量。

# Number of bins handed to histogram() as its n_bins argument.
n_bins = 10
def histogram(x_data, n_bins, cumulative=False, x_label="", y_label="", title=""):
    """Draw a histogram of ``x_data`` with ``n_bins`` bins on a new figure.

    ``cumulative`` is forwarded unchanged to ``Axes.hist``. Only builds the
    figure; nothing is displayed and nothing is returned.
    """
    # plt.subplots() returns (figure, axes); only the axes are needed here.
    axes = plt.subplots()[1]
    axes.hist(x_data, bins=n_bins, cumulative=cumulative, color='#539caf')
    # Title and axis labels.
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
# Build the histogram, then display it. histogram() returns None, so
# plt.show() is called without positional arguments.
histogram(x_data, n_bins)
plt.show()

条形统计图(Bar chart)

变量为离散值，一般是分类。

import matplotlib.pyplot as plt
# Categories and the bar height for each one.
langs = ['C', 'C++', 'Java', 'Python', 'PHP']
students = [23, 17, 35, 29, 12]

# Create a figure and place the axes by hand with an explicit rectangle.
fig = plt.figure()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

# One bar per category, then display the figure.
ax.bar(langs, students)
plt.show()

盒图(Box plot)

反映数据的分布与离散程度：盒子上边缘（上四分位数 Q3）以上约占数据的 25%，盒子上半部分（中位数到 Q3）约占 25%，下半部分（下四分位数 Q1 到中位数）约占 25%，下边缘（Q1）以下约占 25%。

# Import libraries
import matplotlib.pyplot as plt
import numpy as np
# Build the dataset: 200 normally distributed samples (mean 100, std 20),
# with a fixed seed so the same data is produced on every run.
np.random.seed(10)
data = np.random.normal(loc=100, scale=20, size=200)

# Open a 10x7 figure and draw the box plot on it.
fig = plt.figure(figsize=(10, 7))
plt.boxplot(data)

# Display the figure.
plt.show()

YK_Forever

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
DataMining数据预处理

读文件import pandas as pd>>> from io import StringIO>>> csv_data = \... '''A,B,C,D... 1.0,2.0,3.0,4.0... 5.0,6.0,,8.0... 10.0,11.0,12.0,'''>>> df = pd.read_csv(StringIO(csv_data))>>> print (df)A B C D0 1.0 2
复制链接

扫一扫