# Douban Movies Top 250 scraper
# 1. Scrape the index (listing) pages
import csv
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup as bs
movie_list = []
# get_multi_pages(pagenum) is the driver that pages through the listing.
def get_multi_pages(pagenum):
    """
    Main driver: iterate over the listing pages, fetching and parsing each.

    :param pagenum: number of listing pages to process (25 movies per page)
    :return: None
    """
    # Build each page's URL from the 25-per-page rank offset.
    for movie_index in range(0, pagenum * 25, 25):
        # movie_index is the rank offset used as the url's "start" parameter.
        url = 'https://movie.douban.com/top250?start=' + str(movie_index) + '&filter='
        # print('翻页用的url:', url)  # debug
        print(f'正在抓取第{movie_index // 25 + 1}页信息,请稍待......')
        # Phase 1: download the page into a local file (requests + open).
        # BUG FIX: the line below was bare prose (a syntax error); it is a
        # comment describing get_page(url), which fetches and caches a page.
        # get_page(url)  # re-enable to (re)download; commented out once pages are cached
        # Phase 2: parse the cached file and extract movie info (bs4 + re).
        # Cached file name pattern: index/top_<page>.html
        file_name = 'index/top_' + str(movie_index // 25 + 1) + '.html'
        get_movie_info(file_name)
# get_page(url) downloads one listing page and caches it on disk.
def get_page(url):
    """
    Fetch a listing page and write its HTML to index/top_<page>.html.

    :param url: listing-page URL containing a "start=<offset>" parameter
    :return: None
    """
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    # Fetch the page and force utf-8 decoding.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    # Recover the rank offset from the URL and derive the 1-based page number.
    start_value = url.split('=')[1].split('&')[0]
    page_number = int(start_value) // 25 + 1
    target = 'index/top_' + str(page_number) + '.html'
    print(f'正在写入{target},请稍待......')
    # Cache the fetched HTML locally for later parsing.
    with open(target, 'w', encoding='utf-8') as cache_file:
        cache_file.write(response.text)
# Phase 3: parse a cached listing page and extract each movie's index info.
def get_movie_info(file_name):
    """
    Open a cached listing page and append one (rank, title, rating,
    vote count, quote, detail link) tuple per movie to the global movie_list.

    :param file_name: path of the cached HTML file (index/top_<page>.html)
    :return: None
    """
    with open(file_name, 'r', encoding='utf-8') as db:
        str_page = db.read()
    soup_page = bs(str_page, 'html.parser')  # parse into a soup object
    try:
        tagOL = soup_page.find('ol', class_='grid_view')
    except Exception as e:
        print(e)
        tagOL = ' '
    # Extract every <li> inside the <ol> -- one per movie.
    # find_all works by tag name alone; use find when matching attributes.
    try:
        topLIs = tagOL.find_all('li')
    except Exception as e:
        print(e)
        topLIs = []
    print(len(topLIs))
    # Walk each movie's <li> and pull out its fields.
    for movie in topLIs:
        # a. rank (the <em> element)
        try:
            topem = movie.em.string
        except Exception as e:
            print(e)
            topem = ''
        print('影片的id:', topem)
        # b. title  <span class="title">肖申克的救赎</span>
        try:
            topname = movie.find('span', class_='title').string
        except Exception as e:
            print(e)
            topname = ''
        try:
            topname2 = movie.img['alt']
        except Exception as e:
            print(e)
            topname2 = ''
        print('影片名称:', topname2)
        # c. rating
        try:
            toprating = movie.find('span', class_='rating_num').string
        except Exception as e:
            print(e)
            toprating = ''
        print('影片评分:', toprating)
        # d. number of reviewers (slice off the trailing "人评价" suffix)
        try:
            top_people4 = movie.find('span', property="v:best").find_next_sibling().string[:-3]
        except Exception as e:
            print(e)
            top_people4 = ''
        print('4:', top_people4)
        # e. short quote
        try:
            top_duanping = movie.find('span', class_='inq').string
        except Exception as e:
            print(e)
            top_duanping = ''
        print('短评:', top_duanping)
        # f. detail-page link
        try:
            topherf = movie.a['href']
        except Exception as e:
            print(e)
            topherf = ''
        print('详情页链接:', topherf)
        print('-' * 100)
        # BUG FIX: the original appended the undefined name `top_people`
        # (NameError); the vote-count variable is top_people4.
        movie_list.append((topem, topname, toprating, top_people4, top_duanping, topherf))
def save2csv():
    """
    Write the accumulated index rows in movie_list to top250fullindex.csv.

    :return: None
    """
    # BUG FIX: the file was named 'top2550fullindex.csv', but read_index()
    # opens 'top250fullindex.csv' -- the typo broke phase 2. Use one name.
    # newline='' stops the csv module emitting blank rows on Windows.
    with open('top250fullindex.csv', 'w', encoding='utf-8', newline='') as db:
        index_writer = csv.writer(db)
        index_writer.writerow(['排名', '名称', '评分', '评论人数', '短评', '详情页链接'])
        index_writer.writerows(movie_list)
# 2. Scrape each movie's detail page
# read_index loads the index CSV back and builds the three column lists.
def read_index():
    """
    Read top250fullindex.csv and return the rank, title and link columns.

    :return: (ranks, names, links) -- three lists of strings, header excluded
    """
    with open('top250fullindex.csv', 'r', encoding='utf-8') as csv1:
        # Materialize the reader so the rows can be sliced after the file closes.
        rows = list(csv.reader(csv1))
    body = rows[1:]  # skip the header row
    # Column 0 = rank, column 1 = name, last column = detail-page link.
    ranks = [row[0] for row in body]
    names = [row[1] for row in body]
    links = [row[-1] for row in body]
    return ranks, names, links
# get_full_page downloads each movie's detail page and caches it locally.
def get_full_page(count=50):
    """
    Fetch detail pages for the first *count* movies in the index CSV and
    save each as top250full/<rank>_<name>.html.

    :param count: number of detail pages to fetch (default 50, the original
                  hard-coded limit, so existing callers are unaffected)
    :return: None
    """
    # read_index returns the rank, name and link lists in parallel.
    movie_ranks, movie_names, movie_links = read_index()
    print('影片排名:', movie_ranks)
    print('影片名称:', movie_names)
    print('影片链接:', movie_links)
    print('-' * 100)
    # Create the cache folder under the current directory if needed.
    os.makedirs('top250full', exist_ok=True)
    for i in range(0, count):
        print(movie_ranks[i], movie_names[i], movie_links[i])
        # Build the cache file name: top250full/<rank>_<name>.html
        full_name = 'top250full/' + movie_ranks[i] + '_' + movie_names[i] + '.html'
        print(f"正在抓取第{full_name}请稍后....")
        # NOTE(review): webtools is a project-local helper not visible here,
        # and "get_wbe_page" looks like a typo of "get_web_page" -- confirm.
        full_page = webtools.get_wbe_page(movie_links[i], decode=True, encoding='utf-8')
        with open(full_name, 'w', encoding='utf-8') as fdb:
            fdb.write(full_page)
        # Random 1-3 s pause per page to avoid an IP ban from the server.
        time.sleep(random.randint(1, 3))
# get_full_data opens each cached detail page and extracts the full record.
def get_full_data():
    """
    Parse cached detail pages in top250full/ (first 100, in rank order) and
    append one 13-field tuple per movie to the global movie_list.

    :return: None
    """
    # BUG FIX: `root` and `files` were never defined (NameError on the sort
    # below); list the cache folder that get_full_page() writes into.
    root = 'top250full'
    files = os.listdir(root)
    # File names look like "<rank>_<title>.html"; sort numerically by rank.
    # sort's key runs each list element i through the lambda and orders by
    # the result.
    files.sort(key=lambda i: int(i[:4].replace('-', '_').split('_')[0]))
    print(files)  # debug: should now be in rank order
    # Open each file in turn and extract its fields.
    for file in files[:100]:
        with open(root + '/' + file, 'r', encoding='utf-8') as mf:
            print(f'正在提取{file}的信息,请稍待......')
            print('-' * 100)
            str_page = mf.read()
        soup_page = bs(str_page, 'html.parser')
        try:
            movie_content = soup_page.find('div', id='content')
        except Exception as e:
            print(e)
            movie_content = ''
        print(movie_content)
        # a. rank
        try:
            movie_rank = movie_content.find('span', class_="top250-no").string
        except Exception as e:
            print(e)
            movie_rank = ''
        print('影片排名:', movie_rank)
        # b. title -- page <title> with the trailing 5-char suffix stripped
        try:
            movie_name = soup_page.title.string[:-5].strip()
        except Exception as e:
            print(e)
            movie_name = ''
        print('影片名称:', movie_name)
        # c. release year (strip the surrounding parentheses)
        try:
            movie_year = movie_content.find('span', class_="year").string[1:-1]
        except Exception as e:
            print(e)
            movie_year = ''
        print('上映年份:', movie_year)
        # d. cast & crew -- the "attrs" spans inside div#info
        try:
            movie_info = movie_content.find('div', id='info')
        except Exception as e:
            print(e)
            movie_info = ''
        try:
            movie_attrs = movie_info.find_all('span', class_='attrs')
        except Exception as e:
            print(e)
            movie_attrs = ''
        # director
        try:
            movie_dir = movie_attrs[0].text
        except Exception as e:
            print(e)
            movie_dir = ''
        print('导演:', movie_dir)
        # screenwriter
        try:
            movie_script = movie_attrs[1].text
        except Exception as e:
            print(e)
            movie_script = ''
        print('编剧:', movie_script)
        # actors
        try:
            movie_actor = movie_attrs[2].text
        except Exception as e:
            print(e)
            movie_actor = ''
        print('演员:', movie_actor)
        # e. genres, joined with '/' via each tag's .string
        try:
            tag_movie_types = movie_info.find_all('span', property='v:genre')
            movie_types = '/'.join([tag.string for tag in tag_movie_types])
        except Exception as e:
            print(e)
            movie_types = ''
        print('影片类型:', movie_types)
        # f. country/region  <span class="pl">制片国家/地区:</span> 美国<br/>
        # The value is a bare text node after the label, so a regex over the
        # raw HTML is simpler than navigating with bs4.
        try:
            pn_movie_area = re.compile(r'<span class="pl">制片国家/地区:</span> (.*?)<br/>')
            movie_area = re.findall(pn_movie_area, str(movie_info))[0].strip()
        except Exception as e:
            print(e)
            movie_area = ''
        print('制片国家/地区:', movie_area)
        # g. language -- the text node immediately after the "语言:" label
        try:
            movie_lange = movie_info.find('span', string='语言:').next_sibling.strip()
        except Exception as e:
            print(e)
            movie_lange = ''
        print('语言:', movie_lange)
        # h. runtime
        try:
            movie_runtime = movie_info.find('span', property='v:runtime').string
        except Exception as e:
            print(e)
            movie_runtime = ''
        print('片长:', movie_runtime)
        # i. rating
        try:
            movie_rating = movie_content.find('strong', class_="ll rating_num").string
        except Exception as e:
            print(e)
            movie_rating = ''
        print('影片评分:', movie_rating)
        # j. vote count
        try:
            movie_people = movie_content.find('span', property="v:votes").string
        except Exception as e:
            print(e)
            movie_people = ''
        print('评论人数:', movie_people)
        # k. summary, with whitespace runs collapsed by the original pattern
        try:
            movie_summary = re.sub(r'\s +', ' ', movie_content.find('span', property="v:summary").text.strip())
        except Exception as e:
            print(e)
            movie_summary = ''
        print('简介:', movie_summary)
        # BUG FIX: the original appended the undefined name `movie_area2`
        # (NameError); the country/region variable is movie_area.
        movie_list.append((movie_rank, movie_name, movie_year, movie_dir, movie_script,
                           movie_actor, movie_types, movie_area, movie_lange,
                           movie_runtime, movie_rating, movie_people, movie_summary))
def save_data():
    """
    Write the accumulated detail rows in movie_list to top250full.csv.

    :return: None
    """
    # BUG FIX: added newline='' (as save2csv already does) so the csv module
    # does not emit blank rows between records on Windows.
    with open('top250full.csv', 'w', encoding='utf-8', newline='') as file:
        full_writer = csv.writer(file)
        full_writer.writerow(['排名', '名称', '上映年份', '导演', '编剧', '演员', '影片类型',
                              '制片国家/地区', '语言', '片长', '评分', '评论人数', '简介'])
        full_writer.writerows(movie_list)