Python实现基于皮尔逊相关系数(Pearson correlation coefficient)的协同过滤电影推荐。
- 爬虫获取用户数据
"""
爬取豆瓣某影视的评分前100个用户,将他们的影评信息抓取下来作为movie.json
为了保证数据的可靠性,选择豆瓣电影top250 No.1的【肖申克的救赎】,热门影评的前100人作为数据
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import json
import urllib
import requests
# Phase 1 of the crawler: walk the comment pages of one Douban subject,
# collect every commenter's id and profile URL, then index them by id in
# ``final_data`` for the review-crawling phase that follows.

# Parallel lists: people_names[k] is the id extracted from people_urls[k].
people_names = []
people_urls = []

# Captures the text between the leftmost "e/" and the last "/" of a URL.
# Presumably profile URLs look like ".../people/<id>/", which makes the
# capture the user id -- confirm against the live markup.
r = re.compile(r'e/(.+)/')

# Browser-like headers sent with every request.
# NOTE(review): the Referer names subject 26100958 while the pages fetched
# below belong to subject 27010768 -- confirm this mismatch is intended.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3724.8 Safari/537.36',
    'Referer': 'https://movie.douban.com/subject/26100958/comments',
    'Connection': 'keep-alive'}

# Paging of the comment listing: _PAGE_COUNT requests of _PAGE_SIZE comments.
# NOTE(review): the file header talks about the first 100 reviewers, but
# 10 pages x 20 comments can yield up to 200 entries -- confirm which is meant.
_PAGE_COUNT = 10
_PAGE_SIZE = 20
_COMMENTS_URL = ("https://movie.douban.com/subject/27010768/comments?"
                 "start={start}&limit={limit}&sort=new_score&status=P&percent_type=")

# Matches only a leading "www." host label (with or without a scheme), so
# that rewriting the host can never touch a "www" inside the user id.
_WWW_HOST = re.compile(r'^((?:https?:)?//)www\.')

print("爬取用户中 ...")
for page in range(_PAGE_COUNT):
    url = _COMMENTS_URL.format(start=page * _PAGE_SIZE, limit=_PAGE_SIZE)
    req = urllib.request.Request(url=url, headers=headers)
    # Use a context manager so the HTTP response is closed once it is read.
    with urllib.request.urlopen(req) as response:
        data = response.read().decode('utf-8')
    bs = BeautifulSoup(data, 'html.parser')
    comments = bs.find_all("div", {"class": "comment"})
    for comment in comments:
        # The commenter's profile link is taken from the second <a> of the
        # comment node; skip nodes that do not have one instead of crashing
        # the whole crawl with an IndexError/KeyError.
        anchors = comment.find_all("a")
        if len(anchors) < 2:
            continue
        href = anchors[1].get("href")
        if not href:
            continue
        # Point the profile URL at the "movie" host (host label only).
        people_url = _WWW_HOST.sub(r'\1movie.', href)
        match = r.search(people_url)
        if match is None:
            # No id can be extracted from this link; nothing to key it by.
            continue
        people_names.append(match.group(1))
        people_urls.append(people_url)
print("爬取用户完成")

# Index the collected reviewers by id. A reviewer that appears on several
# pages ends up with a single entry (the last URL seen wins).
final_data = {}
for name, people_url in zip(people_names, people_urls):
    final_data.setdefault(name, {})["people_url"] = people_url
print("爬取用户影评中...")
user_count = 1
for people_name in final_data:
print("正在爬取第" + str(user_count) + "位用户" + people_name + "的影评信息")
user_count += 1
for i in range(0, 6):
comment_url_suffix = ("collect?start=" + str(i * 15) +