基于python协同过滤算法的电影推荐

简介

协同过滤算法包括基于用户和基于物品的协同过滤算法。
基于用户的协同过滤: “像你一样的人也喜欢……”
基于物品的协同过滤: “喜欢这个东西的人也喜欢……”

设计路线

采集豆瓣电影,包含电影的:分类、名称、评论
使用结巴(jieba)分词去除无用的词语,提取每部电影评论中的前10个关键字
使用余弦相似度算法计算电影之间的相似度,得到推荐电影

项目文件说明

static 静态文件存放的位置
templates 前端展示的模板文件
app_service.py app应用
Cosine_similarity_algorithm.py 根据协同过滤算法的原理,使用sklearn和jieba对评论数据进行处理,计算得出两部电影的余弦相似度
tools.py 工具包
词性分析.py 使用jieba和snownlp对评论文本进行词性分析
豆瓣电影.py 爬虫爬取豆瓣电影的评论

代码展示
import requests, json, time
from lxml import etree
from tools import MysqlHandler

def send_req(url, timeout=10, delay=3, max_retries=None):
    """GET *url* with browser-like XHR headers, retrying on network errors.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed. Without it a stalled connection would block
            the crawler indefinitely.
        delay: Seconds to sleep after every attempt (successful or not) so
            the target site is not hammered.
        max_retries: Maximum number of retries after the first failure.
            ``None`` (the default) retries until a response is received.

    Returns:
        The ``requests.Response`` of the first attempt that did not raise.
        The HTTP status code is not inspected here.

    Raises:
        requests.RequestException: Only when ``max_retries`` is set and has
            been exhausted.
    """
    # NOTE(review): the Cookie below is a hard-coded browser session; it
    # should be loaded from configuration rather than kept in source.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'll="108288"; bid=rUlJ5A0XyaU; _vwo_uuid_v2=D70D298AF5A6CE68F5BC22CD7F6FA63B0|a0aa43ad626e7854b141806ceace3414; __gads=ID=4bf99911ad9bd539-22b50cde7fd20097:T=1651234594:RT=1651234594:S=ALNI_MaYKtkxqgytT4pEKF_LrPXZAd6Fxw; __gpi=UID=00000515339f3259:T=1651240879:RT=1651240879:S=ALNI_MaVxQLi4y_jEiNNdS4GY5CB6Fs_Aw; dbcl2="186630761:46y82UsjGhg"; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=agt0c58AeealgkzeFaahGiSmb88cOKYh; ck=wcIE; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1651295250%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKY9yXJflB80uadUvuIrXbNpAmLFQYL8B-q7BQsXjbafMDT5qmbxnTAzmoXQ6jncs%26wd%3D%26eqid%3D9ad620790004b2d700000003626ca72d%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.741971608.1651234552.1651287858.1651295250.5; __utmb=30149280.0.10.1651295250; __utmc=30149280; __utma=223695111.213794521.1651234552.1651287858.1651295250.5; __utmb=223695111.0.10.1651295250; __utmc=223695111; _pk_id.100001.4cf6=165cba3717947788.1651234553.5.1651295273.1651288930.',
        'Referer': 'https://movie.douban.com/explore',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    failures = 0
    while True:
        try:
            resp = requests.get(url=url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Only transport-level failures are retried; programming errors
            # propagate instead of spinning in the loop forever.
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Back off before retrying so a dead network does not cause a
            # tight busy loop.
            time.sleep(delay)
        else:
            # Throttle between successful requests.
            time.sleep(delay)
            return resp

def send_comment_req(url, timeout=10, delay=3, max_retries=None):
    """GET a review page with browser-like navigation headers, retrying on network errors.

    Args:
        url: Address of the review page to fetch.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed. Without it a stalled connection would block
            the crawler indefinitely.
        delay: Seconds to sleep after every attempt (successful or not) so
            the target site is not hammered.
        max_retries: Maximum number of retries after the first failure.
            ``None`` (the default) retries until a response is received.

    Returns:
        The ``requests.Response`` of the first attempt that did not raise.
        The HTTP status code is not inspected here.

    Raises:
        requests.RequestException: Only when ``max_retries`` is set and has
            been exhausted.
    """
    # NOTE(review): the Cookie below is a hard-coded browser session; it
    # should be loaded from configuration rather than kept in source.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'll="108288"; bid=rUlJ5A0XyaU; _vwo_uuid_v2=D70D298AF5A6CE68F5BC22CD7F6FA63B0|a0aa43ad626e7854b141806ceace3414; __gads=ID=4bf99911ad9bd539-22b50cde7fd20097:T=1651234594:RT=1651234594:S=ALNI_MaYKtkxqgytT4pEKF_LrPXZAd6Fxw; __gpi=UID=00000515339f3259:T=1651240879:RT=1651240879:S=ALNI_MaVxQLi4y_jEiNNdS4GY5CB6Fs_Aw; dbcl2="186630761:46y82UsjGhg"; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1651287858.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=agt0c58AeealgkzeFaahGiSmb88cOKYh; ck=wcIE; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1651295250%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKY9yXJflB80uadUvuIrXbNpAmLFQYL8B-q7BQsXjbafMDT5qmbxnTAzmoXQ6jncs%26wd%3D%26eqid%3D9ad620790004b2d700000003626ca72d%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.741971608.1651234552.1651287858.1651295250.5; __utmb=30149280.0.10.1651295250; __utmc=30149280; __utma=223695111.213794521.1651234552.1651287858.1651295250.5; __utmb=223695111.0.10.1651295250; __utmc=223695111; _pk_id.100001.4cf6=165cba3717947788.1651234553.5.1651300708.1651288930.',
        'Referer': 'https://movie.douban.com/subject/26752088/reviews?start=20',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    failures = 0
    while True:
        try:
            # A GET carries no body, so the former empty payload is dropped.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Only transport-level failures are retried; programming errors
            # propagate instead of spinning in the loop forever.
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Back off before retrying so a dead network does not cause a
            # tight busy loop.
            time.sleep(delay)
        else:
            # Throttle between successful requests.
            time.sleep(delay)
            return response

def parse_data(response):
    """Lazily yield a ``{'movie_id', 'title'}`` dict for each listed movie.

    Args:
        response: Object exposing ``json()`` whose decoded payload holds a
            ``'subjects'`` list; every entry must provide ``'id'`` and
            ``'title'``.

    Yields:
        dict: ``{'movie_id': <entry id>, 'title': <entry title>}`` in the
        order the entries appear in the payload.

    Raises:
        KeyError: If ``'subjects'``, ``'id'`` or ``'title'`` is missing.
    """
    subjects = response.json()['subjects']
    yield from (
        {'movie_id': entry['id'], 'title': entry['title']}
        for entry in subjects
    )

def parse_data_comment(text, page_size=20):
    """Yield the non-empty review snippets found in a review-list HTML page.

    Args:
        text: HTML source of one review page.
        page_size: Number of review slots to probe. Defaults to 20, which
            matches the ``start`` step of 20 used when paging through
            reviews; the previous ``range(1, 20)`` stopped at slot 19 and
            never looked at the last one.

    Yields:
        str: The stripped text of each slot, with any literal ``()``
        removed. Slots that are absent or empty are skipped.
    """
    html = etree.HTML(text)
    if html is None:
        # NOTE(review): lxml appears to return None for markup without any
        # element (e.g. blank input) - treat that as "no comments".
        return
    # XPath positions are 1-based, hence the inclusive upper bound.
    for j in range(1, page_size + 1):
        # NOTE(review): this absolute path is tied to the page layout at the
        # time of writing; a missing node simply produces an empty list.
        fragments = html.xpath(f"/html/body/div[3]/div[1]/div/div[1]/div[1]/div[{j}]/div/div/div[1]/div/text()")
        comment = ''.join(fragments).strip().replace('()', '').strip()
        if comment:
            yield comment

def _escape_sql_literal(value):
    """Backslash-escape backslashes and both quote characters for use inside a quoted SQL string literal."""
    return (
        str(value)
        .replace('\\', '\\\\')
        .replace("'", "\\'")
        .replace('"', '\\"')
    )


def main():
    """Crawl high-rated Douban movies and store each title with its reviews.

    For each of the nine listing pages (``page_start`` 0..160 in steps of
    20) the movies are fetched via ``send_req``/``parse_data``. A movie
    whose title already yields a truthy result from
    ``MysqlHandler.select_data`` is skipped. Otherwise five review pages are
    fetched via ``send_comment_req``/``parse_data_comment``, and the reviews
    are joined with newlines and inserted into ``douban_movie`` together
    with the title.

    Values are escaped before being interpolated into SQL, because a title
    containing a quote or backslash previously produced a broken (and
    injectable) statement.

    NOTE(review): if ``MysqlHandler`` offers parameterized queries, prefer
    them over string-built SQL. The escaping here assumes MySQL's default
    backslash-escape behaviour - confirm against the server's ``sql_mode``.
    """
    mysql_handler = MysqlHandler()
    start_urls = [
        'https://movie.douban.com/j/search_subjects?type=movie'
        '&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend'
        f'&page_limit=20&page_start={page_start}'
        for page_start in range(0, 180, 20)
    ]
    for url in start_urls:
        print('url:{}'.format(url))
        resp = send_req(url)
        for m in parse_data(resp):
            print('m: {}'.format(m))
            safe_title = _escape_sql_literal(m['title'])
            select_sql = 'select * from douban_movie where title="%s";' % safe_title
            select_res = mysql_handler.select_data(select_sql)
            print('mysql中的查询结果是:{}'.format(select_res))
            if select_res:
                print('mysql中已经有数据了')
                continue
            print('msyql中还没有数据')

            # Collect the reviews of the first five pages (20 per page).
            comments_list = []
            for page in range(5):
                comment_url = f"https://movie.douban.com/subject/{m['movie_id']}/reviews?start={page * 20}"
                print(f'comment_url:{comment_url}')
                resp2 = send_comment_req(comment_url)
                comments_list.extend(parse_data_comment(resp2.text))
            for i, comment in enumerate(comments_list, 1):
                print('i: {}, comment: {}'.format(i, comment))

            # Single quotes are still dropped from the stored text, as
            # before; the remaining special characters are escaped.
            comments_str = '\n'.join(comments_list).replace('\'', '')
            if not comments_str:
                continue
            safe_comments = _escape_sql_literal(comments_str)
            insert_sql = f"INSERT INTO douban_movie(title,comments) VALUES('{safe_title}','{safe_comments}');"
            print("insert_sql: {}".format(insert_sql))
            try:
                mysql_handler.insert_data(insert_sql)
            except Exception as e:
                # Best effort: one failed insert must not abort the crawl.
                print(e)

# Start the crawl only when this file is executed as a script, so importing
# the module (e.g. to reuse the parsers) does not trigger network and DB I/O.
if __name__ == '__main__':
    main()
数据展示

在这里插入图片描述

效果展示

豆瓣电影推荐

参考

https://www.writebug.com/git/Badguy/film-recommend/src/branch/master/src
https://blog.csdn.net/acdreamers/article/details/44672305
https://blog.csdn.net/elecjack/article/details/50913776
https://www.zkxjob.com/10330

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

v(z_xiansheng88)

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值