Python爬虫爬取豆瓣电影Top250

最新推荐文章于 2024-06-25 13:52:07 发布

风离

最新推荐文章于 2024-06-25 13:52:07 发布

阅读量1.8k

点赞数 5

分类专栏： Python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/qq_49186423/article/details/118497537

版权

Python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

该博客介绍了如何使用Python爬虫获取豆瓣Top250电影的数据，包括电影名称、别名、链接、图片、演员、评分等信息，并将数据存储到Excel表格中。通过正则表达式匹配网页内容，然后遍历解析结果，最终将所有信息整合并保存。

摘要由CSDN通过智能技术生成

爬虫爬取豆瓣Top250

文章目录

- 爬虫爬取豆瓣Top250

本文学习自B站UP主 https://www.bilibili.com/video/BV12E411A7ZQ 大家可以去三连一波~~

完整代码

# coding=utf-8
import xlrd
from bs4 import BeautifulSoup  # 网页解析 获取数据
import re  # 正则表达式
import urllib.request, urllib.error  # 指定url 获取网页数据
import requests
import xlwt  # 操作execl
import sqlite3  # 进行sqlite数据库操作 (存数据库)
from xlutils.copy import copy


# Fields collected per movie: title + alternate title + alias + movie link + imgSrc + cast/crew + rating + number of ratings + one-line quote

# 爬取网页获取数据
# Field patterns, compiled once at import time instead of on every getData() call.
# Captures are non-greedy so a second tag on the same line cannot be swallowed.
_FIND_LINK = re.compile(r'<a href="(.*?)">')
_FIND_TITLE = re.compile(r'<span class="title">(.*?)</span>', re.S)
_FIND_OTHER = re.compile(r'<span class="other">(.*?)</span>', re.S)
_FIND_SRC = re.compile(r'<img.*?src="(.*?)"', re.S)  # re.S lets '.' match newlines too
_FIND_MASTER = re.compile(r'<p class="">(.*?)</p>', re.S)
_FIND_AVG = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
_FIND_PERSONS = re.compile(r'<span>(\d*)人评价')
_FIND_DETAIL = re.compile(r'<span class="inq">(.*?)</span>')


def _firstMatch(pattern, text, default=""):
    """Return the first capture of *pattern* in *text*, or *default* when absent."""
    found = pattern.search(text)
    return found.group(1) if found else default


def _parseItem(item):
    """Turn the HTML text of one ``div.item`` into the 9-field row used by getData."""
    titles = _FIND_TITLE.findall(item)
    row = [titles[0] if titles else ""]
    if len(titles) == 2:
        # Second title: drop non-breaking spaces and the "/" separator.
        row.append(titles[1].replace('\xa0', '').replace("/", ""))
    else:
        # Keep the column count constant when there is no second title.
        row.append(" ")

    # Alias
    row.append(_firstMatch(_FIND_OTHER, item).replace('\xa0', '').replace("/", '-').replace(' ', ''))
    # Movie link
    row.append(_firstMatch(_FIND_LINK, item))
    # Image URL
    row.append(_firstMatch(_FIND_SRC, item))
    # Cast / crew paragraph, with markup remnants and whitespace stripped
    row.append(
        _firstMatch(_FIND_MASTER, item)
        .replace("...<br/>", "").replace("\n", "").replace(" ", "").replace("\xa0", ""))
    # Rating
    row.append(_firstMatch(_FIND_AVG, item))
    # Number of ratings
    row.append(_firstMatch(_FIND_PERSONS, item))
    # One-line quote (not every movie has one)
    quote = _firstMatch(_FIND_DETAIL, item, None)
    row.append(quote.replace("。", "") if quote is not None else "暂无评论")
    return row


def getData(url, proxies=None, timeout=10):
    """Download one Top250 page and extract a row of fields for each movie.

    Args:
        url: Full page URL to fetch.
        proxies: Proxy mapping passed to ``requests.get``. ``None`` keeps the
            original hard-coded local proxy on 127.0.0.1:10809.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one entry per ``div.item`` found in the page. Each entry is
        a list of 9 strings: title, second title (" " if absent), alias, link,
        image URL, cast/crew, rating, number of ratings, quote. A field that
        cannot be found is stored as "" (the quote falls back to "暂无评论").
        On a request failure the error is printed and an empty list is
        returned, so callers always receive a list.
    """
    header = {  # Browser-like header sent to the Douban server
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122  Safari / 537.36"
    }
    if proxies is None:
        proxies = {'http': 'http://127.0.0.1:10809', 'https': 'http://127.0.0.1:10809'}

    try:
        req = requests.get(url=url, headers=header, proxies=proxies, timeout=timeout)
        req.raise_for_status()  # report HTTP error statuses instead of parsing an error page
    except requests.exceptions.RequestException as e:
        # requests raises its own exception hierarchy, not urllib.error.URLError.
        print(e)
        return []

    bs = BeautifulSoup(req.text, "html.parser")
    # Narrow the search to the per-movie containers, then regex over each one's HTML text.
    return [_parseItem(str(item)) for item in bs.find_all("div", class_="item")]

# title + 别名 + 电影链接 + imgSrc + 演员表 + 评价星级 + 评价人数  +短评语
def getAll(url, savepath="豆瓣电影Top250.xls", pages=10):
    """Scrape consecutive Top250 pages and write every movie row to an .xls file.

    Args:
        url: Base URL ending in ``start=``; the page offset is appended to it.
        savepath: Path of the workbook to write.
        pages: Number of pages to fetch; the offset advances by 25 per page.
    """
    excel = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = excel.add_sheet("豆瓣电影top250", cell_overwrite_ok=True)
    col = ("电影名", "别名1", "别名2", "电影链接", "图片链接", "演员表", "评分", "评论人数", "短评语")
    # Header row: iterate over col itself so the count can never drift from the tuple.
    for index, name in enumerate(col):
        sheet.write(0, index, name)

    # 10 pages * 25 movies per page = 250 movies with the default arguments.
    arr = []
    for i in range(pages):
        dataList = getData(url + str(i * 25))
        # Only accept a list of rows; anything else (e.g. a raw HTML string
        # returned on an error path) would otherwise be iterated char by char.
        if isinstance(dataList, list):
            arr.extend(dataList)
    print(len(arr))

    # Data rows start at row 1, directly below the header.
    for rowIndex, data in enumerate(arr, start=1):
        for colIndex, value in enumerate(data):
            sheet.write(rowIndex, colIndex, value)
    excel.save(savepath)










def getHtml(url):
    """Fetch *url* through a urllib opener and return the body decoded as UTF-8.

    Undecodable bytes are dropped (``errors='ignore'``). Errors raised by
    urllib while opening the URL propagate to the caller.
    """
    header = {  # Browser-like header sent to the Douban server
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122  Safari / 537.36"
    }
    # NOTE(review): only the 'http' scheme is mapped to the proxy here, and the
    # port (10810) differs from the 10809 used in getData — confirm this is intended.
    proxy_handler = urllib.request.ProxyHandler({'http': '127.0.0.1:10810'})
    opener = urllib.request.build_opener(proxy_handler)
    # OpenerDirector.open() accepts neither url= nor headers=; headers must be
    # attached to a Request object.
    req = urllib.request.Request(url, headers=header)
    with opener.open(req) as r:  # close the response once it has been read
        html = r.read().decode('utf-8', 'ignore')
    return html


def main():
    """Entry point: scrape the Douban Top250 list and save it via getAll()."""
    # The page offset (0, 25, 50, ...) is appended to this base URL by getAll().
    url = "https://movie.douban.com/top250?start="
    getAll(url)

第一步获取整个网页并以html来解析

模拟真实请求

header = {  # Browser-like request header sent to the Douban server
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122  Safari / 537.36"
    }

获取整个页面的html内容

        proxy = {'http': 'http://127.0.0.1:10809','https': 'http://127.0.0.1:10809'}  # Proxy configuration (local proxy on port 10809)
        req = requests.get(url=url, headers=header,proxies=proxy)     #  Send the request and receive the response
        html = req.text    #  Response body as text
        # print(html)
        bs = BeautifulSoup(html, "html.parser")  # Parse the text as HTML

正则表达式来匹配关键词

缩小查找范围并用正则表达式匹配对应字段

  # Step one: narrow the search range
        a = bs.find_all("div", class_="item")    # Every div tag with class="item"


        findLink = re.compile(r'<a href="(.*)">')  # Raw string avoids escape processing; single quotes let the pattern contain double quotes
        findTitle = re.compile(r'<span class="title">(.*?)</span>', re.S)
        findOther = re.compile(r'<span class="other">(.*?)</span>', re.S)
        findSrc = re.compile(r'<img.*?src="(.*?)"', re.S)  # re.S lets '.' match every character, newlines included
        findMaster = re.compile(r'<p class="">(.*?)</p>', re.S)
        findAvg = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
        findPersons = re.compile(r'<span>(\d*)人评价')
        findDetail = re.compile(r'<span class="inq">(.*?)</span>')

所有信息写入列表中

匹配并写入列表中 url 根据页数动态变化

dataList = []    # Collects one list of fields per movie
        for item in a:   #  a is the narrowed-down set of div.item tags
            print(type(item))  # Debug output: show the type of the current item
            item = str(item)  #  Convert to str, because re.findall(pattern, string) needs a string
            #exit()
            data = []
            title = re.findall(findTitle, item)  # Find the title spans
            if len(title) == 2:  # A second title exists, so store both
                ctitle = title[0]
                data.append(ctitle)
                otitle = title[1].replace(u'\xa0', u'').replace("/", "")  # Strip non-breaking spaces and the "/" separator
                data.append(otitle)
            else:  # Otherwise store the one title plus a blank cell so every row has the same length
                data.append(title[0])
                data.append(" ")

            # Alias: [0] takes the first match out of the findall list, giving a str rather than a list
            data.append((re.findall(findOther, item))[0].replace(u'\xa0', u'').replace("/", '-').replace(' ', ''))
            # Movie link
            data.append(re.findall(findLink, item)[0])
            # Image URL
            data.append(re.findall(findSrc, item)[0])
            # Cast / crew paragraph, with markup remnants and whitespace removed
            data.append(
                re.findall(findMaster, item)[0].replace("...<br/>", "").replace("\n", "").replace(" ", "").replace(
                    "\xa0", ""))
            # Rating
            data.append(re.findall(findAvg, item)[0])
            # Number of ratings
            data.append(re.findall(findPersons, item)[0])
            # One-line quote; falls back to a placeholder when the span is absent
            if len(re.findall(findDetail, item)) != 0:
                data.append(re.findall(findDetail, item)[0].replace("。", ""))
            else:
                data.append("暂无评论")
            dataList.append(data)  # Add this movie's fields to dataList
            # print(data)
        # print(dataList)
        return dataList

存入Excel中

最后存入Excel中就大功告成啦~

# title + 别名 + 电影链接 + imgSrc + 演员表 + 评价星级 + 评价人数  +短评语
def getAll(url, savepath="豆瓣电影Top250.xls", pages=10):
    """Scrape consecutive Top250 pages and write every movie row to an .xls file.

    Args:
        url: Base URL ending in ``start=``; the page offset is appended to it.
        savepath: Path of the workbook to write.
        pages: Number of pages to fetch; the offset advances by 25 per page.
    """
    excel = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = excel.add_sheet("豆瓣电影top250", cell_overwrite_ok=True)
    col = ("电影名", "别名1", "别名2", "电影链接", "图片链接", "演员表", "评分", "评论人数", "短评语")
    # Header row: iterate over col itself so the count can never drift from the tuple.
    for index, name in enumerate(col):
        sheet.write(0, index, name)

    # 10 pages * 25 movies per page = 250 movies with the default arguments.
    arr = []
    for i in range(pages):
        dataList = getData(url + str(i * 25))
        # Only accept a list of rows; anything else (e.g. a raw HTML string
        # returned on an error path) would otherwise be iterated char by char.
        if isinstance(dataList, list):
            arr.extend(dataList)
    print(len(arr))

    # Data rows start at row 1, directly below the header.
    for rowIndex, data in enumerate(arr, start=1):
        for colIndex, value in enumerate(data):
            sheet.write(rowIndex, colIndex, value)
    excel.save(savepath)