from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import jieba
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
def cut_word():
    """Segment three demo Chinese sentences with jieba.

    :return: a 3-tuple of strings, each sentence's tokens joined by spaces
        (the whitespace-delimited form the sklearn vectorizers expect).
    """
    sentences = (
        "今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
        "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。",
    )
    # jieba.cut yields tokens lazily; ' '.join consumes the generator directly.
    segmented = tuple(' '.join(jieba.cut(text)) for text in sentences)
    return segmented
def my_count():
    """Demo of bag-of-words counting: segment the three sentences, fit a
    CountVectorizer, and print the vocabulary, the sparse matrix, and its
    dense form.

    :return: None (output goes to stdout).
    """
    c1, c2, c3 = cut_word()
    count = CountVectorizer()
    data = count.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(count.get_feature_names_out())
    print(data)          # sparse CSR representation
    print(data.toarray())  # dense document-term matrix
def my_count2():
    """Demo of TF-IDF weighting: segment the three sentences, fit a
    TfidfVectorizer, and print the vocabulary, the sparse matrix, and its
    dense form.

    :return: None (output goes to stdout).
    """
    c1, c2, c3 = cut_word()
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(tf.get_feature_names_out())
    print(data)          # sparse CSR representation
    print(data.toarray())  # dense TF-IDF matrix
def stand():
    """Standardization demo: scale a small 3x3 matrix to zero mean and unit
    variance per column, then print the result.

    :return: None (output goes to stdout).
    """
    sample = [
        [1., -1., 3.],
        [2., 4., 2.],
        [4., 6., -1.],
    ]
    scaler = StandardScaler()
    transformed = scaler.fit_transform(sample)
    print(transformed)
def im():
    """Imputation demo: replace NaN entries with the per-column mean using
    SimpleImputer, then print the imputed matrix.

    :return: None (output goes to stdout).
    """
    # Renamed from `im` — the original local variable shadowed this
    # function's own name, which is confusing and error-prone.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = imputer.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    print(data)
# Script entry point: runs only the imputation demo; the other demos are
# kept commented out so they can be switched in quickly while experimenting.
if __name__ == "__main__":
    # my_count2()
    # stand()
    im()
# 机器学习日志_20211022
# (Blog-footer text from the original article page, commented out so the file parses:
#  "最新推荐文章于 2024-04-29 14:06:03 发布")