Scraping the original high-resolution posters of the Douban Top 250 movies with a Python script

I tested the code myself and it works; how fast it runs mostly depends on your network speed. I'll simply paste the code below, so feel free to leave a comment if you run into problems. For a walkthrough of the approach, see the [video introduction]. The only third-party dependencies are `requests` and `beautifulsoup4`.

```python
# -*- coding: utf-8 -*-

import os

import bs4
import requests as req


def get_content(page_url):  # fetch the list pages
    url = page_url
    r = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                      'AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/63.0.3239.132 Mobile Safari/537.36',
        # cookie copied from a logged-in Douban browser session
        'cookie': 'bid=QB1DolHTrHk; douban-fav-remind=1; ll="108288"; yadk_uid=VonAsOI0yj6dLjA0uGeBA3Bm2eNCNOTw; _vwo_uuid_v2=D458B58B2BDC40664ADD985D2BBBD6465|baa92a0224e8ed5529abe508121bfb94; __gads=ID=db538611950a59d9:T=1580548123:S=ALNI_MYFBTvsfFyou-MeX4br3oJ1CSSX1A; __utmz=223695111.1582988839.6.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E6%9C%B1%E8%BF%AA; __utmz=30149280.1583041381.8.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=microsoft%20remote%20desktop%20for%20mac; push_noty_num=0; push_doumail_num=0; __utmv=30149280.8277; ct=y; dbcl2="82771268:DgQVL/PAHOM"; ck=NIH4; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583303993%2C%22https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26ch%3D%26tn%3Dbaidu%26bar%3D%26wd%3D%25E6%259C%25B1%25E8%25BF%25AA%26oq%3DJudy.2019.%26rsv_pq%3Dc7ca6b9b000c927d%26rsv_t%3Ddf18ju54V9KyOQDdTUOYo4FWELLR2X6OQoXHcJhqO5e5WCUi5CPibeNGq%252Fc%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_dl%3Dtb%26inputT%3D6494%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.363329858.1579408383.1583285998.1583303993.15; __utmb=30149280.0.10.1583303993; __utmc=30149280; __utma=223695111.1275774826.1580548095.1583285998.1583303993.13; __utmb=223695111.0.10.1583303993; __utmc=223695111; _pk_id.100001.4cf6=3174f2cb4a0368b8.1580548095.11.1583304000.1583286152.'
    }
    for each in url:
        r.append(req.get(each, headers=headers))
    return r


def how_many_pages():  # how many list pages to crawl
    pages_url = []
    pages = input('How many pages do you want to scrape? (total: 10): ')
    for i in range(0, int(pages)):
        pages_url.append('https://movie.douban.com/top250?start=' + str(25 * i) + '&filter=')
    return pages_url


def analysis_content(html):  # parse the list pages
    temp = html
    homeIds = {}  # movie name -> subject id collected from the list pages
    # temp is a list holding the response for each requested page
    for each in temp:
        soup = bs4.BeautifulSoup(each.text, 'html.parser')  # note: parse each.text
        div_all = soup.find_all('div', class_='pic')
        for item in div_all:
            to = str(item.a['href']).split('/')
            one_div = to[len(to) - 2]
            movice_name = item.img['alt']
            homeIds[movice_name] = one_div
    return homeIds
```
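
For reference, this is how the subject id falls out of the `href` split in `analysis_content()`; the URL and id below are made up for illustration:

```python
# Hypothetical link taken from a list-page <div class="pic"> entry.
href = 'https://movie.douban.com/subject/1234567/'
to = href.split('/')      # ['https:', '', 'movie.douban.com', 'subject', '1234567', '']
print(to[len(to) - 2])    # -> '1234567', stored in homeIds under the movie's name
```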

The next two functions fetch each movie's poster page and pick out the posters that are large enough:

```python
def movice_detail(ids):  # fetch each movie's poster page
    r = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.117 Safari/537.36'
    }
    for value in ids.values():
        url = "https://movie.douban.com/subject/" + value + "/photos?type=R"
        r.append(req.get(url, headers=headers))
    return r


def analysis_movice_detail(html):  # parse each movie's poster page
    t = html
    conformIds = {}  # poster ids that meet the size requirement
    for each in t:
        flag = 0
        soup = bs4.BeautifulSoup(each.text, 'html.parser')  # note: parse each.text
        ul = soup.find('ul', class_='poster-col3 clearfix')
        h1 = soup.find('h1')
        if ul is None:  # skip pages without a poster list
            continue
        lis = ul.find_all('li')
        # the <li> entries on this movie's poster page
        for a in lis:
            real_id = a['data-id']
            div = a.find('div', class_='prop').text.split('x')
            # keep the poster if both width and height exceed 800 px
            if int(div[0].split()[0]) > 800 and int(div[1].split()[0]) > 800:
                flag = flag + 1
                conformIds[h1.text.split()[0] + str(flag)] = real_id
            if flag == 5:  # keep at most 5 posters per movie
                break
    return conformIds
```
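
To make the size check concrete, here is the same parsing applied to a made-up `prop` value (the dimensions are invented; real values come from the poster page):

```python
# Hypothetical text of a <div class="prop"> element: width x height in pixels.
prop_text = '2000 x 3000'
div = prop_text.split('x')             # ['2000 ', ' 3000']
width = int(div[0].split()[0])         # 2000
height = int(div[1].split()[0])        # 3000
print(width > 800 and height > 800)    # -> True, so this poster id would be kept
```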

Finally, the posters are downloaded and the whole pipeline is wired together in `main()`:

```python
def image_save(conformIds):
    folder = '豆瓣top250'
    g = 1
    try:  # avoid clashing with an existing folder of the same name
        os.mkdir(folder)
    except FileExistsError:
        folder = '豆瓣top250(' + str(g) + ')'
        g += 1
        os.mkdir(folder)
    os.chdir(folder)
    for item in conformIds.items():
        with open(item[0] + '.jpg', 'wb') as f:
            headers = {
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/79.0.3945.117 Safari/537.36',
                'referer': 'https://movie.douban.com/photos/photo/' + item[1]
            }
            img_html = req.get("https://img3.doubanio.com/view/photo/raw/public/p" + item[1], headers=headers)
            f.write(img_html.content)


def main():
    page = how_many_pages()
    html = get_content(page)
    movies = analysis_content(html)
    a = movice_detail(movies)  # fetch each movie's poster page
    conform_ids = analysis_movice_detail(a)
    image_save(conform_ids)


if __name__ == "__main__":
    main()
```
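
If you only want to verify the raw-image URL pattern the script relies on, a minimal standalone sketch might look like the following; the photo id is a placeholder, and the headers mirror those in `image_save()`:

```python
# Standalone sketch: download a single raw poster by photo id.
# The photo id below is made up; substitute one collected by analysis_movice_detail().
import requests as req

photo_id = '1234567890'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/79.0.3945.117 Safari/537.36',
    'referer': 'https://movie.douban.com/photos/photo/' + photo_id,
}
resp = req.get('https://img3.doubanio.com/view/photo/raw/public/p' + photo_id, headers=headers)
if resp.status_code == 200:
    with open(photo_id + '.jpg', 'wb') as f:
        f.write(resp.content)
else:
    print('download failed with status', resp.status_code)
```

To run the full script, execute it with Python 3, enter a page count from 1 to 10 at the prompt, and the posters will be saved into a 豆瓣top250 folder under the current working directory.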