# Several commonly used data-preprocessing methods for machine-learning algorithms
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer
from sklearn.feature_selection import VarianceThreshold
# Feature extraction -- vectorizing dicts of features
def dictvev():
    """
    Dict feature extraction demo: one-hot encodes string-valued fields,
    passes numeric fields through unchanged.

    :return: None
    """
    # sparse=False -> dense numpy array instead of a scipy sparse matrix
    vec = DictVectorizer(sparse=False)  # renamed: `dict` shadowed the builtin
    data = vec.fit_transform([{'city': '北京', 'pos': '北方', 'temperature': 100},
                              {'city': '上海', 'pos': '东方', 'temperature': 60},
                              {'city': '重庆', 'pos': '南方', 'temperature': 70},
                              {'city': '深圳', 'pos': '南方', 'temperature': 30}])
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both old and new versions.
    try:
        print(vec.get_feature_names_out())
    except AttributeError:
        print(vec.get_feature_names())
    print(vec.inverse_transform(data))
    print(data)
    return None
dictvev()  # demo call: runs at import time
def countvec():
    """
    Text feature extraction demo: bag-of-words counts per document.

    Note: the default token pattern drops single-character tokens
    (e.g. 'a' never appears in the vocabulary).
    :return: None
    """
    cv = CountVectorizer()
    data = cv.fit_transform(['this is a test test', 'we have a test'])
    # Feature names are the vocabulary words in alphabetical order
    # (NOT by frequency). get_feature_names() was removed in sklearn 1.2.
    try:
        print(cv.get_feature_names_out())
    except AttributeError:
        print(cv.get_feature_names())
    print(data.toarray())
    return None
countvec()  # demo call: runs at import time
def cutword():
    """
    Segment three Chinese sentences with jieba and join the tokens with
    spaces, so CountVectorizer / TfidfVectorizer can tokenize them.

    :return: a tuple of three space-delimited strings
    """
    sentences = ('床前明月光,我要学python。',
                 '床前明月光,疑是地上霜。',
                 '生存或死亡,这是一个问题')
    # jieba.cut yields a generator of tokens; collapse each one into a string
    c1, c2, c3 = (' '.join(jieba.cut(s)) for s in sentences)
    return c1, c2, c3
# print(cutword())
def hanzivec():
    """
    Bag-of-words counts for Chinese text (pre-segmented by cutword()).

    :return: None
    """
    c1, c2, c3 = cutword()
    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback
    # for older versions. (Dead commented-out loop removed.)
    try:
        print(cv.get_feature_names_out())
    except AttributeError:
        print(cv.get_feature_names())
    print(data.toarray())
    return None
hanzivec()  # demo call: runs at import time
def tfidfvec():
    """
    TF-IDF features for Chinese text (pre-segmented by cutword()).

    TF  (term frequency): occurrences of a term / total terms in the document.
    IDF (inverse document frequency): log(total documents / documents
    containing the term).
    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2
    try:
        print(tf.get_feature_names_out())
    except AttributeError:
        print(tf.get_feature_names())
    print(data.toarray())
    return None
tfidfvec()  # demo call: runs at import time
def stand():
    """
    Standard-scaling demo: shift/scale each column to mean 0 and
    standard deviation 1.

    :return: None
    """
    sample = [[1., -1., 3.],
              [2., 4., 2.],
              [4., 6., -1.]]
    scaler = StandardScaler()
    print(scaler.fit_transform(sample))
    return None
# stand()
def mm():
    """
    Min-max normalization demo: rescale every column into the range [2, 3].

    :return: None
    """
    # renamed local so it no longer shadows the enclosing function `mm`
    scaler = MinMaxScaler(feature_range=(2, 3))
    data = scaler.fit_transform([[90, 2, 10, 40],
                                 [60, 4, 15, 45],
                                 [75, 3, 13, 46]])
    print(data)
    return None
# mm()
def var():
    """
    Feature-selection demo: drop columns whose variance is below 1.0.

    :return: None
    """
    # renamed local so it no longer shadows the enclosing function `var`
    selector = VarianceThreshold(threshold=1.0)
    data = selector.fit_transform([[0, 2, 0, 3],
                                   [0, 1, 4, 3],
                                   [0, 1, 1, 3]])
    print(data)
    return None
var()  # demo call: runs at import time