# coding=utf-8
"""
author: lei
function: feature extraction (特征抽取) demos using sklearn vectorizers and jieba.
"""
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

# English sample corpus for the CountVectorizer demo.
text_list = ["life is short, i like python", "life is too long, i dislike python"]
# NOTE(review): this is a *set* literal (unordered, deduplicated) and is never
# used below; a list was probably intended — kept as a set to preserve behavior.
text_list2 = {"人生苦短", "我用python", "人生漫长", "我不用python"}
def dictvec():
    """Dictionary feature extraction: one-hot encode categorical keys.

    Each dict becomes a row; string values (e.g. "city") are expanded into
    one column per category, numeric values (e.g. "temp") are kept as-is.

    :return: None
    """
    # sparse=False makes fit_transform return a dense ndarray instead of
    # a scipy sparse matrix.
    vectorizer = DictVectorizer(sparse=False)  # renamed: the original shadowed the builtin `dict`
    data = vectorizer.fit_transform([{"city": "上海", "temp": 60}, {"city": "深圳", "temp": 80}])
    # NOTE(review): get_feature_names() was removed in sklearn >= 1.2 in
    # favor of get_feature_names_out(); kept for this file's sklearn version.
    print(vectorizer.get_feature_names())  # list of feature names
    print(vectorizer.inverse_transform(data))  # recover the original dicts
    print(data)  # ndarray, one-hot encoded
    return None
def countvec():
    """Bag-of-words feature extraction on the English sample corpus.

    1. Collect the vocabulary over all documents (each word counted once).
    2. For each document, count occurrences of every vocabulary word.

    :return: None
    """
    cv = CountVectorizer()
    data = cv.fit_transform(text_list)
    print(data.toarray())  # convert the sparse matrix to a dense array
    print(cv.get_feature_names())  # the learned vocabulary, sorted
    return None
def cut_word():
    """Segment three Chinese sentences with jieba.

    :return: a 3-tuple of space-joined token strings, so that
        CountVectorizer/TfidfVectorizer (which split on whitespace)
        see individual words.
    """
    # jieba.cut returns a generator of tokens; join them with spaces.
    # BUG FIX: the original used "".join(list(c1)) — no separator, which
    # glues the tokens back into one unsegmented string — and
    # "".join(str(list(c2))) — which joins the *repr* of a list, yielding
    # text like "['我们', '看到', ...]". Both defeat the tokenization.
    c1 = " ".join(jieba.cut("今天很残酷,明天更残酷,后天更残酷"))
    c2 = " ".join(jieba.cut("我们看到的从很远星系来的光是在几百万年之前发出的"))
    c3 = " ".join(jieba.cut("如果只用一种方式了解事物,你不会真正理解它"))
    return c1, c2, c3
def hanzivec():
    """Bag-of-words feature extraction on Chinese text.

    Uses cut_word() to pre-segment the sentences, since CountVectorizer
    splits on whitespace and cannot segment Chinese by itself.

    :return: None
    """
    cv = CountVectorizer()
    data = cv.fit_transform(cut_word())
    print(data.toarray())  # dense term-count matrix
    print(cv.get_feature_names())  # learned Chinese vocabulary
    return None
def tfidfvec():
    """TF-IDF feature extraction on the segmented Chinese sentences.

    TF-IDF weighs each term by how frequent it is in a document and how
    rare it is across the corpus.

    :return: None
    """
    c1, c2, c3 = cut_word()
    print(c1, c2, c3)
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(tf.get_feature_names())  # learned vocabulary
    print(data.toarray())  # dense TF-IDF weight matrix
    return None
# BUG FIX: the original read `if name == ‘main’:` — the dunder underscores
# and ASCII quotes were lost; `name` is undefined at module level.
if __name__ == '__main__':
    # dictvec()
    # countvec()
    # cut_word()
    # hanzivec()
    tfidfvec()