一、数据规范化
import pymysql
import pandas as pd
import numpy as np
# Load the `price` and `comment` columns of the `taob` table into a DataFrame.
# NOTE(review): the connection is not closed anywhere in the visible code;
# call conn.close() once no further queries are needed.
conn = pymysql.connect(host="",user="root",passwd="root",db="csdn")
sql = "select price,comment from taob"
data = pd.read_sql(sql, conn)

# Min-max normalization: (x - min) / (max - min), computed per column.
# A constant column has max == min and therefore produces NaN.
data2 = (data - data.min()) / (data.max() - data.min())

# Z-score standardization: (x - mean) / std, computed per column.
data3 = (data - data.mean()) / data.std()

# Decimal-scaling normalization: divide each column by 10**k, where k is the
# smallest integer such that max(|x|) / 10**k < 1.
# floor(log10(m)) + 1 is used instead of ceil(log10(m)) because ceil gives a
# scaled maximum of exactly 1.0 when m is an exact power of ten (e.g. 100).
max_abs = data.abs().max()
# A column whose maximum is 0 would yield log10(0) = -inf; treat it as 1 so
# the column is simply divided by 10 and stays all zeros.
k = np.floor(np.log10(max_abs.where(max_abs > 0, 1))) + 1
data4 = data / 10**k
二、离散化
连续数据离散化
# Equal-width discretization of the price column into three labelled bins.
data5 = data["price"].copy()
# Transposing a Series returns the Series itself; kept so `data6` stays defined.
data6 = data5.T
data7 = data6.values
k = 3
c1 = pd.cut(data7, k, labels=["便宜","适中","贵"])

# Custom-width (non-uniform) discretization.
# pd.cut requires strictly increasing bin edges, so only the fixed edges that
# lie below the observed maximum are kept before the maximum itself is
# appended as the last edge. Without this, a maximum price <= 2000 raises
# "bins must increase monotonically".
upper = data7.max()
k1 = [edge for edge in (0, 50, 100, 300, 500, 2000) if edge < upper] + [upper]
# include_lowest=True closes the first interval on the left, so a price equal
# to the first edge (0) is binned instead of becoming NaN.
c2 = pd.cut(data7, k1, include_lowest=True)

# Equal-frequency discretization
# One-dimensional clustering discretization