from sklearn.datasets import fetch_20newsgroups
# Download the full 20-newsgroups corpus (both the train and test splits).
news = fetch_20newsgroups(subset='all')
# X: list of raw post texts; y: integer newsgroup labels (unused below).
X,y=news.data,news.target
from bs4 import BeautifulSoup
# Import the nltk and re toolkits.
import nltk,re
# Split one raw newsgroup post into a list of tokenized sentences.
def news_to_sentences(news):
    """Strip markup from a raw post and return its sentences as word lists.

    Parameters
    ----------
    news : str
        Raw text of one 20-newsgroups post (may contain HTML markup).

    Returns
    -------
    list[list[str]]
        One inner list of lowercase alphabetic tokens per sentence.
    """
    # Use an explicit parser ("html5lib") so bs4 does not emit the
    # "No parser was explicitly specified" warning.
    news_text = BeautifulSoup(news, "html5lib").get_text()
    # NOTE(review): loading the punkt tokenizer on every call is wasteful;
    # consider hoisting it to module level if this becomes a bottleneck.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(news_text)
    sentences = []
    for sent in raw_sentences:
        # Keep ASCII letters only, lowercase everything, split on whitespace.
        sentences.append(re.sub('[^a-zA-Z]', ' ', sent.lower().strip()).split())
    return sentences
# Collect the tokenized sentences from every post in the corpus.
sentences = []
for doc in X:
    sentences.extend(news_to_sentences(doc))
# Import word2vec from gensim.models (the second, duplicate import that
# appeared just before training has been removed).
from gensim.models import word2vec

# Word2Vec hyper-parameters.
num_features = 300     # dimensionality of each word vector
min_word_count = 20    # ignore words whose total frequency is below this
num_workers = 2        # worker threads for parallel training (multi-core)
context = 5            # context window size around each target word
downsampling = 1e-3    # threshold for down-sampling very frequent words

# Train the model on the tokenized corpus.
# NOTE(review): the `size` keyword was renamed to `vector_size` in
# gensim 4.x — this call targets gensim < 4; confirm the installed version.
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
# Precompute L2-normalized vectors, replacing the raw ones to save memory;
# the model can no longer be trained incrementally after this call.
# NOTE(review): init_sims() is deprecated in gensim 4.x — confirm version.
model.init_sims(replace=True)
# Sanity-check the embeddings by printing the nearest neighbours of two
# common words (in gensim 4.x this moved to model.wv.most_similar — verify).
print(model.most_similar('hello'))
print(model.most_similar('email'))
print('end')
BeautifulSoup(news).get_text() 函数调用会出现警告信息,
Warning (from warnings module):
File "D:\Python35\lib\site-packages\bs4\__init__.py", line 181
markup_type=markup_type))
UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html5lib"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
The code that caused this warning is on line 1 of the file <string>. To get rid of this warning, change code that looks like this:
BeautifulSoup(YOUR_MARKUP) to this: BeautifulSoup(YOUR_MARKUP, "html5lib")
加入html5lib参数,如下
BeautifulSoup(news,"html5lib").get_text() 输出结果如下:
>>> print(model.most_similar('email'))
[('mail', 0.7399873733520508), ('contact', 0.6850252151489258), ('address', 0.6711879968643188), ('sas', 0.6611512303352356), ('replies', 0.6424497365951538), ('mailed', 0.6364169716835022), ('request', 0.6355448961257935), ('compuserve', 0.6323468685150146), ('send', 0.6153897047042847), ('internet', 0.59690260887146)]
>>> print(model.most_similar('hello'))
[('hi', 0.8492101430892944), ('netters', 0.6953952312469482), ('pl', 0.6211292147636414), ('dear', 0.5891242027282715), ('nh', 0.5402401685714722), ('scotia', 0.5400180220603943), ('tin', 0.5357101559638977), ('elm', 0.5321102142333984), ('greetings', 0.5246435403823853), ('hanover', 0.5063780546188354)]
>>>