使用教程:
https://blog.csdn.net/qq_35273499/article/details/79098689
https://www.jianshu.com/p/d6a0aec6e9a1
https://blog.csdn.net/majinlei121/article/details/83210996
https://blog.csdn.net/weixin_30663391/article/details/98550139
代码实现:
import multiprocessing
import warnings
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# 加载包
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
def word2vec(rawdata='D:\\我的大学\\信息内容安全\\作业3\\out.txt',
             modelpath='D:\\我的大学\\信息内容安全\\作业3\\modeldata.model'):
    """Train a Word2Vec model on a line-per-sentence corpus and save it.

    Args:
        rawdata: Path of the corpus file, read through ``LineSentence``.
        modelpath: Path the trained model is saved to with ``model.save``.

    Both parameters default to the locations that were previously
    hard-coded, so calling ``word2vec()`` behaves as before.
    """
    print('Start...')
    # NOTE(review): ``size`` is the keyword used by gensim 3.x; gensim 4
    # renamed it to ``vector_size`` -- confirm against the installed version.
    model = Word2Vec(
        LineSentence(rawdata),
        size=400,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    model.save(modelpath)
    # To also export the vectors in text form, call
    # model.wv.save_word2vec_format(<path>, binary=False) here.
    print("Finished!")
def wordsimilarity(word='日本', topn=10,
                   modelpath='D:\\我的大学\\信息内容安全\\作业3\\modeldata.model'):
    """Print the words most similar to ``word`` according to a saved model.

    Args:
        word: Query word; defaults to the word that was previously hard-coded.
        topn: Number of neighbours to print.
        modelpath: Path of the model written by ``word2vec``.

    Prints one ``word,similarity`` line per neighbour, or a notice when the
    query word is not in the model's vocabulary. Returns ``None``.
    """
    model = Word2Vec.load(modelpath)
    try:
        # Query through the KeyedVectors (``model.wv``); calling
        # ``most_similar`` on the model object itself is deprecated in gensim.
        semi = model.wv.most_similar(word, topn=topn)
    except KeyError:
        print('The word not in vocabulary!')
        return
    for term in semi:
        # The original passed the format string and values as separate
        # print() arguments, which printed the literal '%s,%s'.
        print('%s,%s' % (term[0], term[1]))
if __name__ == '__main__':
    # Script entry point: train the model, then print neighbours of the
    # default query word.
    # The earlier pipeline steps dataprocess() and trans_seg() are left
    # disabled, as in the original; they are not defined in this listing.
    word2vec()
    wordsimilarity()
又做了一个基于网易云音乐歌单的 song2vec 实验,参考教程:
https://blog.csdn.net/Tong_T/article/details/80354512
https://blog.csdn.net/Tong_T/article/details/80366407
代码如下:
# -*- coding:utf-8 -*-
"""
爬虫爬取网易云音乐歌单的数据包保存成json文件
"""
import requests
import json
import os
import base64
import binascii
import urllib
import urllib.request
from Crypto.Cipher import AES
from bs4 import BeautifulSoup
class NetEaseAPI:
    """Small client for the music.163.com endpoints used to crawl playlists.

    It scrapes playlist ids from the discover pages and fetches playlist
    details from the ``weapi`` endpoint, whose form parameters must be
    encrypted (see ``get_playlist_detail``).
    """

    def __init__(self):
        # Headers attached to every urllib request built by this class.
        self.header = {
            'Host': 'music.163.com',
            'Origin': 'https://music.163.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
            'Accept': 'application/json, text/javascript',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        # The attributes below are initialised but not read by any method
        # of this class.
        self.cookies = {'appver': '1.5.2'}
        self.playlist_class_dict = {}
        self.session = requests.Session()

    def _http_request(self, method, action, query=None, urlencoded=None, callback=None, timeout=None):
        """Send a request via ``_raw_http_request`` and parse the body as JSON."""
        raw = self._raw_http_request(method, action, query, urlencoded, callback, timeout)
        return json.loads(raw)

    def _raw_http_request(self, method, action, query=None, urlencoded=None, callback=None, timeout=None):
        """Send an HTTP request and return the raw response body as bytes.

        Args:
            method: ``'GET'`` or ``'POST'``.
            action: Target URL.
            query: Mapping that is form-encoded as the POST body; ignored
                for GET.
            urlencoded: Unused; kept for interface compatibility.
            callback: Unused; kept for interface compatibility.
            timeout: Optional timeout in seconds passed to ``urlopen``.

        Raises:
            ValueError: If ``method`` is neither ``'GET'`` nor ``'POST'``.
        """
        if method == 'GET':
            data = None
        elif method == 'POST':
            data = urllib.parse.urlencode(query or {}).encode('utf-8')
        else:
            # Previously fell through to an unbound local variable.
            raise ValueError('Unsupported HTTP method: {0!r}'.format(method))
        request = urllib.request.Request(action, data=data, headers=self.header)
        # Only pass a timeout when one was given so urlopen's own default
        # applies otherwise.
        open_kwargs = {} if timeout is None else {'timeout': timeout}
        with urllib.request.urlopen(request, **open_kwargs) as response:
            return response.read()

    @staticmethod
    def _aes_encrypt(text, secKey):
        """AES-CBC encrypt ``text`` with ``secKey`` and return base64 text.

        The plaintext is padded to a multiple of 16 bytes, each pad byte
        holding the pad length. Padding is computed on the UTF-8 bytes so
        non-ASCII input is padded correctly as well.
        """
        data = text.encode('utf-8')
        pad = 16 - len(data) % 16
        data += bytes([pad]) * pad
        encryptor = AES.new(secKey.encode('utf-8'), AES.MODE_CBC, b'0102030405060708')
        return base64.b64encode(encryptor.encrypt(data)).decode('utf-8')

    @staticmethod
    def _rsa_encrypt(text, pubKey, modulus):
        """Return ``reversed(text) ** pubKey mod modulus`` as 256 hex digits.

        ``pubKey`` and ``modulus`` are hexadecimal strings; the result is
        left-padded with zeros to 256 characters.
        """
        reversed_text = text[::-1]
        message = int(binascii.hexlify(reversed_text.encode('utf-8')), 16)
        rs = pow(message, int(pubKey, 16), int(modulus, 16))
        return format(rs, 'x').zfill(256)

    @staticmethod
    def _create_secret_key(size):
        """Return up to 16 hex characters derived from ``size`` random bytes.

        Each byte contributes one or two hex digits (no zero padding), so
        the result is exactly 16 characters whenever ``size >= 16``.
        """
        return ''.join(format(byte, 'x') for byte in os.urandom(size))[:16]

    def get_playlist_id(self, action):
        """Fetch a playlist listing page and return the playlist ids on it.

        Args:
            action: URL of the listing page.

        Returns:
            A list of strings, one per ``a.msk`` link found in the page.
        """
        request = urllib.request.Request(action, headers=self.header)
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        anchors = soup.select('ul#m-pl-container li div a.msk')
        # Drop the first 13 characters of each href -- presumably the
        # '/playlist?id=' prefix, leaving the bare id (verify against the page).
        return [anchor['href'][13:] for anchor in anchors]

    @staticmethod
    def encode(s):
        """Return the code points of ``s`` as space-separated binary strings."""
        return ' '.join([bin(ord(c)).replace('0b', '') for c in s])

    def get_playlist_detail(self, id):
        """Request the detail JSON of one playlist from the weapi endpoint.

        The JSON payload is AES-encrypted twice (first with a fixed nonce,
        then with a freshly generated key) and the generated key is sent
        alongside it, encrypted with ``_rsa_encrypt``.

        Args:
            id: Playlist id as returned by ``get_playlist_id``.

        Returns:
            The decoded JSON response.
        """
        payload = json.dumps({
            'id': id,
            'limit': '100',
            'total': 'true'
        })
        nonce = '0CoJUm6Qyw8W8jud'
        pubKey = '010001'
        modulus = ('00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7'
                   'b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280'
                   '104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932'
                   '575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b'
                   '3ece0462db0a22b8e7')
        secKey = self._create_secret_key(16)
        encText = self._aes_encrypt(self._aes_encrypt(payload, nonce), secKey)
        encSecKey = self._rsa_encrypt(secKey, pubKey, modulus)
        data = {
            'params': encText,
            'encSecKey': encSecKey
        }
        action = 'http://music.163.com/weapi/v3/playlist/detail'
        return self._http_request('POST', action, data)
if __name__ == '__main__':
    # Crawl the "hot" playlist listing pages and dump every playlist's
    # detail JSON to d:/python/<index>.json, numbering files from 1.
    nn = NetEaseAPI()
    index = 1
    for flag in range(1, 38):
        if flag > 1:
            # Listing pages hold 35 playlists each; later pages are
            # addressed by offset.
            page = (flag - 1) * 35
            url = 'http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=' + str(
                page)
        else:
            url = 'http://music.163.com/discover/playlist'
        playlist_id = nn.get_playlist_id(url)
        for item_id in playlist_id:
            playlist_detail = nn.get_playlist_detail(item_id)
            with open('d:/python/{0}.json'.format(index), 'w', encoding='utf-8') as file_obj:
                json.dump(playlist_detail, file_obj, ensure_ascii=False)
            index += 1
            print("写入json文件:", item_id)
# -*- coding:utf-8-*-
"""
对网易云所有歌单爬虫的json文件进行数据预处理成csv文件
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
def parse_playlist_item(data_dir="d:/python", output_path="d:/python/test1.csv", start=1, stop=1292):
    """Append one ``playlist_id,song_id,pop,publishTime`` line per track.

    Reads ``<data_dir>/<i>.json`` for every ``i`` in ``range(start, stop)``
    and appends to ``output_path``. The columns play the role of
    userid / itemid / rating / timestamp. The defaults reproduce the
    previously hard-coded paths and range.

    A file whose JSON lacks the expected fields has its index printed and
    is skipped; lines already written for earlier tracks of that playlist
    are kept.
    """
    # ``with`` guarantees the output file is closed even if reading fails.
    with open(output_path, 'a', encoding='utf8') as file:
        for i in range(start, stop):
            with open("{0}/{1}.json".format(data_dir, i), 'r', encoding='UTF-8') as load_f:
                load_dict = json.load(load_f)
            try:
                for item in load_dict['playlist']['tracks']:
                    # playlist id, song id, score, datetime
                    line_result = [load_dict['playlist']['id'], item['id'], item['pop'], item['publishTime']]
                    file.write(','.join(str(v) for v in line_result) + '\n')
            except (KeyError, TypeError, IndexError):
                # Only malformed-structure errors are treated as "skip this file".
                print(i)
                continue
def parse_playlist_id_to_name(data_dir="d:/python", output_path="d:/python/test1.csv", start=1, stop=1292):
    """Append one ``playlist_id,playlist_name`` CSV row per playlist file.

    Reads ``<data_dir>/<i>.json`` for every ``i`` in ``range(start, stop)``
    and appends to ``output_path``. The defaults reproduce the previously
    hard-coded paths and range.

    Rows are written with ``csv.writer`` so that names containing commas
    or quotes are quoted instead of silently producing extra columns.
    A file whose JSON lacks the expected fields has its index printed and
    is skipped.
    """
    import csv  # local import: this script section only imports json

    with open(output_path, 'a', encoding='utf8', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        for i in range(start, stop):
            with open("{0}/{1}.json".format(data_dir, i), 'r', encoding='UTF-8') as load_f:
                load_dict = json.load(load_f)
            try:
                playlist = load_dict['playlist']
                # str() keeps the same text the original wrote for every value.
                row = [str(playlist['id']), str(playlist['name'])]
            except (KeyError, TypeError, IndexError):
                print(i)
                continue
            writer.writerow(row)
def parse_song_id_to_name(data_dir="d:/python", output_path="d:/python/test1.csv", start=1, stop=1292):
    """Append one ``song_id,"<song name>-<first artist name>"`` row per track.

    Reads ``<data_dir>/<i>.json`` for every ``i`` in ``range(start, stop)``
    and appends to ``output_path``. The defaults reproduce the previously
    hard-coded paths and range.

    Rows are written with ``csv.writer`` so that names containing commas
    or quotes are quoted instead of silently producing extra columns.
    A file whose JSON lacks the expected fields has its index printed and
    the rest of that playlist is skipped; rows already written are kept.
    """
    import csv  # local import: this script section only imports json

    with open(output_path, 'a', encoding='utf8', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        for i in range(start, stop):
            # Progress output; the original passed "%d" and i as separate
            # print() arguments, which printed the literal "%d".
            print("%d" % i)
            with open("{0}/{1}.json".format(data_dir, i), 'r', encoding='UTF-8') as load_f:
                load_dict = json.load(load_f)
            try:
                for item in load_dict['playlist']['tracks']:
                    # song id, "song name-artist name"
                    label = item['name'] + '-' + item['ar'][0]['name']
                    writer.writerow([str(item['id']), label])
            except (KeyError, TypeError, IndexError):
                print(i)
                continue
# Run the three exporters in order. Called without arguments, each one
# appends to d:/python/test1.csv, so their rows end up in the same file.
for export_step in (parse_playlist_item, parse_playlist_id_to_name, parse_song_id_to_name):
    export_step()
import json
from random import shuffle
import multiprocessing
import gensim
import csv
def train_song2vec(data_dir="d:/python", model_path='songVec.model', start=1, stop=1292):
    """Train a Word2Vec model over playlist song-id sequences and save it.

    Args:
        data_dir: Directory holding the crawled ``<i>.json`` playlist files.
        model_path: Where the trained model is saved.
        start: First file index (inclusive).
        stop: Last file index (exclusive), as in ``range``.

    The defaults reproduce the previously hard-coded paths and range.
    Playlist files without the expected ``playlist``/``tracks`` structure
    have their index printed and are skipped, matching how the CSV
    exporters treat the same files.
    """
    songlist_sequence = []
    # Read the raw NetEase Cloud Music playlist dumps.
    for i in range(start, stop):
        with open("{0}/{1}.json".format(data_dir, i), 'r', encoding='UTF-8') as load_f:
            load_dict = json.load(load_f)
        try:
            parse_songlist_get_sequence(load_dict, songlist_sequence)
        except (KeyError, TypeError):
            print(i)
    # Train with one worker per CPU core.
    cores = multiprocessing.cpu_count()
    print('Using all {cores} cores'.format(cores=cores))
    print('Training word2vec model...')
    # NOTE(review): ``size`` is the keyword used by gensim 3.x; gensim 4
    # renamed it to ``vector_size`` -- confirm against the installed version.
    model = gensim.models.Word2Vec(sentences=songlist_sequence, size=150, min_count=3, window=7, workers=cores)
    print('Save model..')
    model.save(model_path)
def parse_songlist_get_sequence(load_dict, songlist_sequence):
    """Append shuffled song-id sequences for one playlist.

    Args:
        load_dict: Decoded playlist JSON; tracks are read from
            ``load_dict['playlist']['tracks']``.
        songlist_sequence: Output list, extended in place. For a playlist
            with ``n`` usable tracks, ``n`` independently shuffled copies
            of its song-id list (ids as strings) are appended.

    A track missing ``id``, ``name``, a first artist name or ``pop`` is
    reported with 'song format error' and left out.

    Raises:
        KeyError, TypeError: If ``load_dict`` has no ``playlist``/``tracks``.
    """
    song_sequence = []
    for item in load_dict['playlist']['tracks']:
        try:
            song_id = item['id']
            # Touch the other fields only to keep the original rule that a
            # track needs all of them to be accepted.
            _required = (item['name'], item['ar'][0]['name'], item['pop'])
        except (KeyError, IndexError, TypeError):
            print('song format error')
            continue
        song_sequence.append(str(song_id))
    for _ in range(len(song_sequence)):
        shuffle(song_sequence)
        # Append a copy: shuffle() works in place, so appending the same
        # list object would make every stored sequence identical.
        songlist_sequence.append(list(song_sequence))
def song_data_preprocessing(csv_path='d:/python/test1.csv'):
    """Build id->name and name->id lookup tables from a two-column CSV.

    Args:
        csv_path: CSV file whose first column is the song id and whose
            second column is the song name; defaults to the previously
            hard-coded path.

    Returns:
        ``(id_name_dic, name_id_dic)``: a dict mapping id to name and a
        dict mapping name to id. Later rows overwrite earlier ones that
        share a key. Rows with fewer than two columns (for example blank
        lines) are ignored.
    """
    id_name_dic = {}
    name_id_dic = {}
    # ``with`` closes the file; newline='' is what the csv module expects.
    with open(csv_path, encoding='utf-8', newline='') as csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 2:
                continue
            id_name_dic[row[0]] = row[1]
            name_id_dic[row[1]] = row[0]
    return id_name_dic, name_id_dic
train_song2vec()
# Load from the same relative path train_song2vec() saves to; the original
# loaded 'D:/python/songVec.model', which only matched when the script was
# run from D:/python.
model_str = 'songVec.model'
# Load the trained word2vec model.
model = gensim.models.Word2Vec.load(model_str)
id_name_dic, name_id_dic = song_data_preprocessing()
# Sample a handful of song ids to inspect.
song_id_list = list(id_name_dic.keys())[4000:5000:200]
for song_id in song_id_list:
    try:
        # Query through the KeyedVectors (``model.wv``); calling
        # ``most_similar`` on the model object itself is deprecated in gensim.
        result_song_list = model.wv.most_similar(song_id)
    except KeyError:
        # Ids that are not in the model's vocabulary cannot be queried
        # (training uses min_count=3, so infrequent ids may be missing).
        print(song_id, id_name_dic[song_id], 'not in vocabulary')
        continue
    print(song_id, id_name_dic[song_id])
    print('\n相似歌曲和相似度分别为:')
    for song in result_song_list:
        # Fall back to the raw id when the CSV has no name for a neighbour.
        print('\t' + id_name_dic.get(song[0], song[0]), song[1])
    print('\n')