简介
协同过滤算法包括基于用户和基于物品的协同过滤算法。
基于用户协同过滤: “像你一样的人也喜欢……”
基于物品协同过滤: “喜欢这个东西的人也喜欢……”
设计路线
采集豆瓣电影,包含电影的:分类、名称、评论
使用结巴(jieba)分词去除无用的词语,提取前10个关键词
余弦相似度算法得到推荐电影
项目文件说明
static 静态文件存放的位置
templates 前端展示的模板文件
app_service.py app应用
Cosine_similarity_algorithm.py 根据协同过滤算法原则,使用sklearn和jieba对评论数据进行计算,得出两部电影之间的余弦相似度
tools.py 工具包
词性分析.py jieba和snownlp算法对词性进行分析
豆瓣电影.py 爬虫爬取豆瓣电影的评论
代码展示
import requests, json, time
from lxml import etree
from tools import MysqlHandler
def send_req(url, timeout=10, retry_delay=3, max_retries=None):
    """Fetch *url* with browser-like XHR headers, retrying on request errors.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed. Without it a stalled connection would block
            the crawl indefinitely.
        retry_delay: Seconds to sleep after every attempt, successful or
            not, so the target site is never hit in a tight loop.
        max_retries: Number of retries allowed after the first failed
            attempt. ``None`` (the default) retries forever.

    Returns:
        The ``requests`` response object of the first attempt that did not
        raise. The HTTP status code is not inspected.

    Raises:
        requests.exceptions.RequestException: Only when ``max_retries`` is
            set and has been exhausted.
    """
    # SECURITY(review): the Cookie below looks like a captured logged-in
    # browser session that is committed to source. Consider loading it from
    # configuration or the environment instead.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'll="108288"; bid=rUlJ5A0XyaU; _vwo_uuid_v2=D70D298AF5A6CE68F5BC22CD7F6FA63B0|a0aa43ad626e7854b141806ceace3414; __gads=ID=4bf99911ad9bd539-22b50cde7fd20097:T=1651234594:RT=1651234594:S=ALNI_MaYKtkxqgytT4pEKF_LrPXZAd6Fxw; __gpi=UID=00000515339f3259:T=1651240879:RT=1651240879:S=ALNI_MaVxQLi4y_jEiNNdS4GY5CB6Fs_Aw; dbcl2="186630761:46y82UsjGhg"; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=agt0c58AeealgkzeFaahGiSmb88cOKYh; ck=wcIE; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1651295250%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKY9yXJflB80uadUvuIrXbNpAmLFQYL8B-q7BQsXjbafMDT5qmbxnTAzmoXQ6jncs%26wd%3D%26eqid%3D9ad620790004b2d700000003626ca72d%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.741971608.1651234552.1651287858.1651295250.5; __utmb=30149280.0.10.1651295250; __utmc=30149280; __utma=223695111.213794521.1651234552.1651287858.1651295250.5; __utmb=223695111.0.10.1651295250; __utmc=223695111; _pk_id.100001.4cf6=165cba3717947788.1651234553.5.1651295273.1651288930.',
        'Referer': 'https://movie.douban.com/explore',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    failures = 0
    while True:
        try:
            resp = requests.get(url=url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Back off before retrying; previously a failed request was
            # retried immediately, spinning while the network was down.
            time.sleep(retry_delay)
            continue
        # Throttle after a successful request as well.
        time.sleep(retry_delay)
        return resp
def send_comment_req(url, timeout=10, retry_delay=3, max_retries=None):
    """Fetch a review page at *url* with browser-like navigation headers.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed. Without it a stalled connection would block
            the crawl indefinitely.
        retry_delay: Seconds to sleep after every attempt, successful or
            not, so the target site is never hit in a tight loop.
        max_retries: Number of retries allowed after the first failed
            attempt. ``None`` (the default) retries forever.

    Returns:
        The ``requests`` response object of the first attempt that did not
        raise. The HTTP status code is not inspected.

    Raises:
        requests.exceptions.RequestException: Only when ``max_retries`` is
            set and has been exhausted.
    """
    # SECURITY(review): the Cookie below looks like a captured logged-in
    # browser session that is committed to source. Consider loading it from
    # configuration or the environment instead.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'll="108288"; bid=rUlJ5A0XyaU; _vwo_uuid_v2=D70D298AF5A6CE68F5BC22CD7F6FA63B0|a0aa43ad626e7854b141806ceace3414; __gads=ID=4bf99911ad9bd539-22b50cde7fd20097:T=1651234594:RT=1651234594:S=ALNI_MaYKtkxqgytT4pEKF_LrPXZAd6Fxw; __gpi=UID=00000515339f3259:T=1651240879:RT=1651240879:S=ALNI_MaVxQLi4y_jEiNNdS4GY5CB6Fs_Aw; dbcl2="186630761:46y82UsjGhg"; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=agt0c58AeealgkzeFaahGiSmb88cOKYh; ck=wcIE; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1651295250%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKY9yXJflB80uadUvuIrXbNpAmLFQYL8B-q7BQsXjbafMDT5qmbxnTAzmoXQ6jncs%26wd%3D%26eqid%3D9ad620790004b2d700000003626ca72d%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.741971608.1651234552.1651287858.1651295250.5; __utmb=30149280.0.10.1651295250; __utmc=30149280; __utma=223695111.213794521.1651234552.1651287858.1651295250.5; __utmb=223695111.0.10.1651295250; __utmc=223695111; _pk_id.100001.4cf6=165cba3717947788.1651234553.5.1651300708.1651288930.',
        'Referer': 'https://movie.douban.com/subject/26752088/reviews?start=20',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    failures = 0
    while True:
        try:
            # The original passed an empty dict as the request body, which
            # sends nothing; a plain GET is equivalent.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Back off before retrying; previously a failed request was
            # retried immediately, spinning while the network was down.
            time.sleep(retry_delay)
            continue
        # Throttle after a successful request as well.
        time.sleep(retry_delay)
        return response
def parse_data(response):
    """Lazily yield one ``{'movie_id', 'title'}`` dict per listed subject.

    Args:
        response: Object whose ``json()`` method returns a mapping with a
            ``'subjects'`` list; every entry must provide ``'id'`` and
            ``'title'`` keys.

    Yields:
        dict: ``{'movie_id': <entry id>, 'title': <entry title>}`` in the
        order the entries appear in the payload.
    """
    subjects = response.json()['subjects']
    yield from (
        {'movie_id': subject['id'], 'title': subject['title']}
        for subject in subjects
    )
def parse_data_comment(text, reviews_per_page=20):
    """Yield the non-empty review snippets found in a reviews page.

    Args:
        text: HTML source of the page. Empty or ``None`` input yields
            nothing instead of failing inside the parser.
        reviews_per_page: Highest entry position to look at. The caller
            pages through reviews in steps of 20, so the default covers a
            whole page; the former ``range(1, 20)`` stopped at entry 19 and
            could never read a 20th one.

    Yields:
        str: The text of each entry with surrounding whitespace and literal
        ``'()'`` substrings removed; entries that end up empty are skipped.
    """
    if not text:
        return
    html = etree.HTML(text)
    if html is None:
        # NOTE(review): guards against the parser handing back no document
        # for unusable markup - confirm against the lxml version in use.
        return
    for position in range(1, reviews_per_page + 1):
        # Absolute path tied to the page layout; position selects the
        # N-th entry inside the list container.
        pieces = html.xpath(f"/html/body/div[3]/div[1]/div/div[1]/div[1]/div[{position}]/div/div/div[1]/div/text()")
        comment = ''.join(pieces).strip()
        # Removing '()' can expose new surrounding whitespace, hence the
        # second strip.
        comment = comment.replace('()', '').strip()
        if comment:
            yield comment
def _escape_sql_literal(value):
    """Escape *value* for use inside a single-quoted SQL string literal."""
    # Double backslashes first so the quote handling below cannot be undone
    # by a trailing backslash in the data; quotes are escaped by doubling.
    return str(value).replace('\\', '\\\\').replace("'", "''")


def main():
    """Crawl the listed movies and store each new movie's reviews.

    For every listing page the movies are parsed from the JSON response.
    A movie whose title is already present in ``douban_movie`` is skipped;
    otherwise five review pages are fetched, the extracted comments are
    joined with newlines and inserted together with the title.

    Insert failures are printed and do not stop the crawl.
    """
    mysql_handler = MysqlHandler()
    # Nine listing pages of 20 movies each (page_start = 0, 20, ..., 160).
    start_urls = [
        'https://movie.douban.com/j/search_subjects?type=movie'
        '&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend'
        f'&page_limit=20&page_start={page_start}'
        for page_start in range(0, 180, 20)
    ]
    for url in start_urls:
        print('url:{}'.format(url))
        resp = send_req(url)
        for m in parse_data(resp):
            print('m: {}'.format(m))
            # SECURITY(review): titles and comments are scraped text. The
            # handler is only seen accepting complete SQL strings, so the
            # values are escaped here; prefer parameterized queries if
            # MysqlHandler supports them.
            title_sql = _escape_sql_literal(m['title'])
            select_sql = f"select * from douban_movie where title='{title_sql}';"
            select_res = mysql_handler.select_data(select_sql)
            print('mysql中的查询结果是:{}'.format(select_res))
            if select_res:
                print('mysql中已经有数据了')
                continue
            print('msyql中还没有数据')

            comments_list = []
            for page in range(5):
                comment_url = f"https://movie.douban.com/subject/{m['movie_id']}/reviews?start={page * 20}"
                print(f'comment_url:{comment_url}')
                resp2 = send_comment_req(comment_url)
                comments_list.extend(parse_data_comment(resp2.text))
            for i, comment in enumerate(comments_list, start=1):
                print('i: {}, comment: {}'.format(i, comment))

            comments_str = '\n'.join(comments_list)
            if not comments_str:
                continue
            # Apostrophes used to be deleted from the comments to keep the
            # statement valid; escaping keeps the text intact and also
            # covers titles, which were previously inserted unescaped.
            comments_sql = _escape_sql_literal(comments_str)
            insert_sql = f"INSERT INTO douban_movie(title,comments) VALUES('{title_sql}','{comments_sql}');"
            print("insert_sql: {}".format(insert_sql))
            try:
                mysql_handler.insert_data(insert_sql)
            except Exception as e:
                # Best effort: report the failure and move on to the next movie.
                print(e)
# Run the crawl only when executed as a script, so importing this module
# (for example to reuse the parsers) does not start network and database work.
if __name__ == "__main__":
    main()
数据展示
效果展示
豆瓣电影推荐
参考
https://www.writebug.com/git/Badguy/film-recommend/src/branch/master/src
https://blog.csdn.net/acdreamers/article/details/44672305
https://blog.csdn.net/elecjack/article/details/50913776
https://www.zkxjob.com/10330