本文爬取了有关科技、汽车、医学、国家这四个类别的近300篇新闻或者简介。
完整代码如下:
import pandas as pd
import codecs
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
if __name__ == "__main__":
    # Load the corpus: each line of the input file is one document, with
    # leading/trailing whitespace removed. The context manager guarantees
    # the file handle is closed once reading is finished.
    with open('聚类4类.txt', 'r', encoding='UTF-8') as corpus_file:
        corpus = [line.strip() for line in corpus_file]

    # Term-count matrix; with an integer min_df, terms appearing in fewer
    # than 10 documents are dropped from the vocabulary.
    vectorizer = CountVectorizer(min_df=10)
    # Re-weight the raw counts into TF-IDF scores.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.