有numpy
下面安装pandas;有点慢啊!;
使用import pandas 来检查是否安装成功。
下面安装 scikit-learn:pip3 install scikit-learn(注意 PyPI 包名为全小写 scikit-learn);也挺慢的。
import sklearn;用来检查是否安装成功。
跑的第一个程序;比较慢啊;能运行就好啦;
将sparse = False;关闭sparse矩阵;见上图和下图进行区分,上图为0的就不显示了,下图是不是0都会通过矩阵显示;
下载分词器pip install jieba
首先,AttributeError: 'module' object has no attribute 'cut' 这个报错的原因是存在名为 jieba.py 的文件(或其他以 jieba 命名的文件)。很多新人使用结巴分词时,直接把自己的练习脚本命名为 jieba.py;而官方教程代码里有 import jieba,这样 Python 会优先引用你自己的 jieba.py,而不是官方库,库里的 cut 方法自然找不到,于是报错。解决办法:把自己的文件改名即可。
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import jieba
def dictvec():
    """Demonstrate DictVectorizer: one-hot encode 'city', pass 'temperature' through.

    Prints the learned feature names, the inverse transform of the matrix,
    and the dense feature matrix itself.

    Returns:
        None
    """
    # Do not shadow the builtin `dict`; sparse=False returns a dense ndarray
    # so zero entries are shown explicitly in the printed matrix.
    vectorizer = DictVectorizer(sparse=False)
    data = vectorizer.fit_transform(
        [
            {"city": '北京', 'temperature': 100},
            {"city": '上海', 'temperature': 60},
            {"city": '深圳', 'temperature': 30},
        ]
    )
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement (available since 1.0).
    print(vectorizer.get_feature_names_out())
    print(vectorizer.inverse_transform(data))
    print(data)
    return None
def countvec():
    """Bag-of-words demo: count token occurrences in two English sentences.

    Prints the learned vocabulary and the dense count matrix.

    Returns:
        None
    """
    cv = CountVectorizer()
    data = cv.fit_transform(
        ["life is short,i like python", "life is too long,i dislike python"]
    )
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement (available since 1.0).
    print(cv.get_feature_names_out())
    print(data.toarray())
    return None
def cutword():
    """Segment three Chinese sentences with jieba.

    Returns:
        A 3-tuple of space-delimited token strings, ready to feed to
        CountVectorizer/TfidfVectorizer (which split on whitespace).
    """
    sentences = (
        "今天很残酷,明天更残酷,后天很美好",
        "我们看到的从很远星系",
        "如果只是用一种方式了解某个事物",
    )
    # jieba.cut yields a generator of tokens; join each sentence's tokens
    # into a single space-separated string.
    seg1, seg2, seg3 = (' '.join(jieba.cut(s)) for s in sentences)
    return seg1, seg2, seg3
def hanzivec():
    """Bag-of-words over pre-segmented Chinese text.

    Segments three sentences via cutword(), then prints the vocabulary and
    the dense count matrix.

    Returns:
        None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement (available since 1.0).
    print(cv.get_feature_names_out())
    print(data.toarray())
    return None
if __name__ == '__main__':
    # Uncomment the demo you want to run:
    # dictvec()
    # countvec()
    hanzivec()
07.tf-idf分析问题
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
def dictvec():
    """Demonstrate DictVectorizer: one-hot encode 'city', pass 'temperature' through.

    Prints the learned feature names, the inverse transform of the matrix,
    and the dense feature matrix itself.

    Returns:
        None
    """
    # Do not shadow the builtin `dict`; sparse=False returns a dense ndarray
    # so zero entries are shown explicitly in the printed matrix.
    vectorizer = DictVectorizer(sparse=False)
    data = vectorizer.fit_transform(
        [
            {"city": '北京', 'temperature': 100},
            {"city": '上海', 'temperature': 60},
            {"city": '深圳', 'temperature': 30},
        ]
    )
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement (available since 1.0).
    print(vectorizer.get_feature_names_out())
    print(vectorizer.inverse_transform(data))
    print(data)
    return None
def countvec():
    """Bag-of-words demo: count token occurrences in two English sentences.

    Prints the learned vocabulary and the dense count matrix.

    Returns:
        None
    """
    cv = CountVectorizer()
    data = cv.fit_transform(
        ["life is short,i like python", "life is too long,i dislike python"]
    )
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement (available since 1.0).
    print(cv.get_feature_names_out())
    print(data.toarray())
    return None
def cutword():
    """Segment three Chinese sentences with jieba.

    Returns:
        A 3-tuple of space-delimited token strings, ready to feed to
        CountVectorizer/TfidfVectorizer (which split on whitespace).
    """
    sentences = (
        "今天很残酷,明天更残酷,后天很美好",
        "我们看到的从很远星系",
        "如果只是用一种方式了解某个事物",
    )
    # jieba.cut yields a generator of tokens; join each sentence's tokens
    # into a single space-separated string.
    seg1, seg2, seg3 = (' '.join(jieba.cut(s)) for s in sentences)
    return seg1, seg2, seg3
def hanzivec():
    """Bag-of-words over pre-segmented Chinese text.

    Segments three sentences via cutword(), then prints the vocabulary and
    the dense count matrix.

    Returns:
        None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement (available since 1.0).
    print(cv.get_feature_names_out())
    print(data.toarray())
    return None
def tfidfvec():
    """TF-IDF weighting over pre-segmented Chinese text.

    Segments three sentences via cutword(), then prints the vocabulary and
    the dense TF-IDF matrix (term importance per document).

    Returns:
        None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement (available since 1.0).
    print(tf.get_feature_names_out())
    print(data.toarray())
    return None
if __name__ == '__main__':
    # Uncomment the demo you want to run:
    # dictvec()
    # countvec()
    # hanzivec()
    tfidfvec()
09.归一化以及标准化对比