Scraping the detail pages of all Douban Top 250 movies with Python

Douban Movies Top 250

1. Scraping the movie index pages

import re
import requests
import csv
import os
import time
import random
from bs4 import BeautifulSoup as bs

movie_list = []

# get_multi_pages(pagenum) is the main function: it handles pagination and
# drives the per-page scraping and extraction
def get_multi_pages(pagenum):
    """
    Main function: loops over the pages, scraping each one and extracting its data.
    :param pagenum: number of pages to scrape (and to parse)
    :return: None
    """
    # Build the url of each page inside the loop
    for movie_index in range(0, pagenum * 25, 25):
        # movie_index is the rank offset of the first movie on the page,
        # i.e. the value of the `start` query parameter
        url = 'https://movie.douban.com/top250?start=' + str(movie_index) + '&filter='
        # print('pagination url:', url)  # for debugging
        print(f'Scraping page {movie_index // 25 + 1}, please wait......')
        # 2. Scrape each page and write it to a local file; this must happen
        #    inside the loop. Done by the helper get_page: requests + with open.
        get_page(url)  # once all pages are downloaded, this call can be commented out

        # 3. Read the local file back and extract each movie's data with bs + re,
        #    done by the other helper, get_movie_info.
        # Build the file name to read from: index/top_1.html
        file_name = 'index/top_' + str(movie_index // 25 + 1) + '.html'
        get_movie_info(file_name)
# get_page(url) scrapes one page and writes it to a local file
def get_page(url):
    """
    Scrape a page and write it to a file.
    :param url: the url to scrape
    :return: None
    """
    db_headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    # Fetch the page
    db_response = requests.get(url, headers=db_headers)
    db_response.encoding = 'utf-8'

    # Recover the page number from the `start` parameter in the url
    movie_index = url.split('=')[1].split('&')[0]
    page_num = int(movie_index) // 25 + 1

    os.makedirs('index', exist_ok=True)  # make sure the target folder exists
    file_name = 'index/top_' + str(page_num) + '.html'
    print(f'Writing {file_name}, please wait......')

    # Write the scraped page to a text file
    with open(file_name, 'w', encoding='utf-8') as tpf:
        tpf.write(db_response.text)
# 3: read the local file back and extract the data:
def get_movie_info(file_name):
    with open(file_name, 'r', encoding='utf-8') as db:
        str_page = db.read()
        soup_page = bs(str_page, 'html.parser')  # turn the text into a soup object

        # The <ol class="grid_view"> holds the 25 movies of the page
        try:
            tagOL = soup_page.find('ol', class_='grid_view')
        except Exception as e:
            print(e)
            tagOL = None

        # Extract all the <li> tags inside the <ol>
        try:
            topLIs = tagOL.find_all('li')  # use find_all when only the tag matters; use find when attributes must be checked
        except Exception as e:
            print(e)
            topLIs = []
        print(len(topLIs))
        # Loop over each movie's <li> tag and extract its data
        for movie in topLIs:
            # a. rank, held in the <em> tag
            try:
                topem = movie.em.string
            except Exception as e:
                print(e)
                topem = ''
            print('Movie rank:', topem)
            # b. movie title: <span class="title">肖申克的救赎</span>
            try:
                topname = movie.find('span', class_='title').string
            except Exception as e:
                print(e)
                topname = ''
            try:
                topname2 = movie.img['alt']  # the poster's alt text also holds the title
            except Exception as e:
                print(e)
                topname2 = ''
            print('Movie title:', topname2)
            # c. rating
            try:
                toprating = movie.find('span', class_='rating_num').string
            except Exception as e:
                print(e)
                toprating = ''
            print('Movie rating:', toprating)
            # d. number of raters; [:-3] strips the trailing '人评价'
            try:
                top_people4 = movie.find('span', property="v:best").find_next_sibling().string[:-3]
            except Exception as e:
                print(e)
                top_people4 = ''
            print('Number of raters:', top_people4)

            # e. one-line review
            try:
                top_duanping = movie.find('span', class_='inq').string
            except Exception as e:
                print(e)
                top_duanping = ''
            print('One-line review:', top_duanping)
            # f. detail page link
            try:
                topherf = movie.a['href']
            except Exception as e:
                print(e)
                topherf = ''
            print('Detail page link:', topherf)
            print('-' * 100)
            # Append everything extracted to the global container
            movie_list.append((topem, topname, toprating, top_people4, top_duanping, topherf))
def save2csv():
    # Open the target csv file; read_index() below expects this exact name
    with open('top250fullindex.csv', 'w', encoding='utf-8', newline='') as db:
        index_writer = csv.writer(db)
        index_writer.writerow(['Rank', 'Title', 'Rating', 'Raters', 'One-line review', 'Detail link'])
        index_writer.writerows(movie_list)
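
The post never shows how this first phase is launched. A minimal driver, assuming the usual entry-point guard, might look like this: ten pages of 25 movies each covers the full Top 250.

if __name__ == '__main__':
    get_multi_pages(10)  # 10 pages x 25 movies = the full Top 250
    save2csv()
    print(f'{len(movie_list)} movies written to top250fullindex.csv')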

2. Scraping the movie detail pages

# read_index reads the csv file back and builds the containers
def read_index():
    # Open the csv file written by phase 1
    with open('top250fullindex.csv', 'r', encoding='utf-8') as csv1:
        index_reader = list(csv.reader(csv1))  # turn the reader's iterator into a list for easier handling
        # Slice off the header row, then pull out the rank, title and
        # detail-link columns to build three containers
        ranks = [item[0] for item in index_reader][1:]  # first column, header dropped
        names = [item[1] for item in index_reader][1:]
        links = [item[-1] for item in index_reader][1:]
        return ranks, names, links
# get_full_page scrapes the detail pages and writes them to files
def get_full_page():
    # Call read_index to get the ranks, titles and links of all movies at once
    movie_ranks, movie_names, movie_links = read_index()
    print('Movie ranks:', movie_ranks)
    print('Movie titles:', movie_names)
    print('Movie links:', movie_links)
    print('-' * 100)
    # Create a folder under the current directory to hold the detail pages
    os.makedirs('top250full', exist_ok=True)
    for i in range(0, 50):  # first 50 movies here; use range(len(movie_links)) for all 250
        print(movie_ranks[i], movie_names[i], movie_links[i])
        # Build the file name to save to
        full_name = 'top250full/' + movie_ranks[i] + '_' + movie_names[i] + '.html'
        print(f'Scraping {full_name}, please wait....')
        # webtools is the author's own helper module; see the sketch below
        full_page = webtools.get_wbe_page(movie_links[i], decode=True, encoding='utf-8')
        with open(full_name, 'w', encoding='utf-8') as fdb:
            fdb.write(full_page)
        # Pause a random 1~3 seconds between pages so the server doesn't notice and ban the IP
        time.sleep(random.randint(1, 3))
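
The webtools module used above is the author's own helper and is never shown in the post. Judging from the call, get_wbe_page(url, decode=True, encoding='utf-8') fetches a url and returns the decoded text; a minimal sketch of a compatible stand-in, assuming it simply wraps requests, could be saved as webtools.py next to the script (with import webtools added at the top):

# webtools.py -- hypothetical stand-in for the author's helper module
import requests

def get_wbe_page(url, decode=True, encoding='utf-8'):
    """Fetch a url and return its body, decoded if requested (assumed behaviour)."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if decode:
        response.encoding = encoding
        return response.text   # decoded str
    return response.content   # raw bytes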
# get_full_data opens the movie detail-page files and extracts the data
def get_full_data():
    root = 'top250full'       # the folder written by get_full_page
    files = os.listdir(root)  # file names look like 1_肖申克的救赎.html
    files.sort(key=lambda i: int(i[:4].replace('-', '_').split('_')[0]))
    # sort's key mechanism: each list element is passed to the function as i,
    # and the list is ordered by the function's return values
    print(files)  # for debugging; the files are now in rank order
    # Loop over the file names, open each file and extract the data
    for file in files[:100]:  # capped at 100 files; use files[:] for everything downloaded

        with open(root + '/' + file, 'r', encoding='utf-8') as mf:
            print(f'Extracting data from {file}, please wait......')
            print('-' * 100)
            str_page = mf.read()
            soup_page = bs(str_page, 'html.parser')

            # The <div id="content"> holds everything we need
            try:
                movie_content = soup_page.find('div', id='content')
            except Exception as e:
                print(e)
                movie_content = ''
            # print(movie_content)  # for debugging
            # a. rank
            try:
                movie_rank = movie_content.find('span', class_="top250-no").string
            except Exception as e:
                print(e)
                movie_rank = ''
            print('Movie rank:', movie_rank)
            # b. movie title; [:-5] drops the ' (豆瓣)' suffix of the page title, strip() the surrounding whitespace
            try:
                movie_name = soup_page.title.string[:-5].strip()
            except Exception as e:
                print(e)
                movie_name = ''
            print('Movie title:', movie_name)
            # c. release year; [1:-1] drops the parentheses around the year
            try:
                movie_year = movie_content.find('span', class_="year").string[1:-1]
            except Exception as e:
                print(e)
                movie_year = ''
            print('Release year:', movie_year)
            # d. cast and crew, all inside <div id="info">
            try:
                movie_info = movie_content.find('div', id='info')
            except Exception as e:
                print(e)
                movie_info = ''

            try:
                movie_attrs = movie_info.find_all('span', class_='attrs')
            except Exception as e:
                print(e)
                movie_attrs = []

            # director
            try:
                movie_dir = movie_attrs[0].text
            except Exception as e:
                print(e)
                movie_dir = ''
            print('Director:', movie_dir)
            # screenwriter
            try:
                movie_script = movie_attrs[1].text
            except Exception as e:
                print(e)
                movie_script = ''
            print('Screenwriter:', movie_script)
            # cast
            try:
                movie_actor = movie_attrs[2].text
            except Exception as e:
                print(e)
                movie_actor = ''
            print('Cast:', movie_actor)
       
            # e. genres
            try:
                tag_movie_types = movie_info.find_all('span', property='v:genre')
                movie_types = '/'.join([tag.string for tag in tag_movie_types])  # the tag's string property
            except Exception as e:
                print(e)
                movie_types = ''
            print('Genres:', movie_types)
            # f. country/region: <span class="pl">制片国家/地区:</span> 美国<br/>
            # Extracting this with bs alone is clumsy, so fall back to re; the pattern
            # keeps the Chinese label because that is what the page actually contains
            try:
                pn_movie_area = re.compile(r'<span class="pl">制片国家/地区:</span> (.*?)<br/>')
                movie_area = re.findall(pn_movie_area, str(movie_info))[0].strip()
            except Exception as e:
                print(e)
                movie_area = ''
            print('Country/region:', movie_area)
           
            # g. language
            # Find the '语言:' label span, then take the text node right after it
            try:
                movie_lange = movie_info.find('span', string='语言:').next_sibling.strip()
            except Exception as e:
                print(e)
                movie_lange = ''
            print('Language:', movie_lange)
            # h. runtime
            try:
                movie_runtime = movie_info.find('span', property='v:runtime').string
            except Exception as e:
                print(e)
                movie_runtime = ''
            print('Runtime:', movie_runtime)
            # i. rating
            try:
                movie_rating = movie_content.find('strong', class_="ll rating_num").string
            except Exception as e:
                print(e)
                movie_rating = ''
            print('Movie rating:', movie_rating)
            # j. number of raters
            try:
                movie_people = movie_content.find('span', property="v:votes").string
            except Exception as e:
                print(e)
                movie_people = ''
            print('Number of raters:', movie_people)
            # k. summary; collapse runs of whitespace into single spaces
            try:
                movie_summary = re.sub(r'\s+', ' ', movie_content.find('span', property="v:summary").text.strip())
            except Exception as e:
                print(e)
                movie_summary = ''
            print('Summary:', movie_summary)
            movie_list.append((movie_rank, movie_name, movie_year, movie_dir, movie_script, movie_actor, movie_types,
                               movie_area, movie_lange, movie_runtime, movie_rating, movie_people, movie_summary))

def save_data():
    # Open the target file
    with open('top250full.csv', 'w', encoding='utf-8', newline='') as file:
        full_writer = csv.writer(file)
        full_writer.writerow(['Rank', 'Title', 'Year', 'Director', 'Screenwriter', 'Cast', 'Genres',
                              'Country/region', 'Language', 'Runtime', 'Rating', 'Raters', 'Summary'])
        full_writer.writerows(movie_list)
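
As with phase 1, the entry point is not shown. Since movie_list is the same global container used in both phases, phase 2 is presumably run as its own script; under that assumption, the three steps run in order:

if __name__ == '__main__':
    get_full_page()   # download the detail pages (comment out once done)
    get_full_data()   # parse the downloaded pages into movie_list
    save_data()       # write everything to top250full.csv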
