# Python scraper for the Douban Top 250 movie list (fetch, parse, save posters + CSV)

import requests

from lxml import etree

import pandas as pd

import os



MOVIES = []

IMGURLS = []



def get_html(url):
    """Fetch *url* and return its decoded HTML text, or None on failure.

    A browser-like User-Agent is required because Douban rejects the
    default requests UA. Returns None (instead of raising) when the
    request fails or the server answers with a non-200 status, so the
    caller can skip a bad page.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    try:
        # timeout keeps the scraper from hanging forever on a dead connection
        html = requests.get(url, headers=headers, timeout=10)
        html.encoding = html.apparent_encoding
        if html.status_code == 200:
            print('成功获取源代码')
            return html.text
        print('获取源代码失败:%s' % html.status_code)
    except Exception as e:
        # NOTE: the original fell through to `return html.text` here, which
        # raised NameError because `html` was never bound when the request
        # itself raised; return None explicitly instead.
        print('获取源代码失败:%s' % e)

    return None



def parse_html(html):
    """Parse one Top-250 list page and return (movies, imgurls).

    movies  -- list of dicts with keys: name, director_actor, info,
               rating_score, rating_num, introduce. `introduce` is None
               when the entry has no one-line quote.
    imgurls -- poster image URLs, in the same order as *movies*.
    """
    movies = []
    imgurls = []
    tree = etree.HTML(html)
    lis = tree.xpath("//ol[@class = 'grid_view']/li")

    for li in lis:
        # first <span class="title"> holds the Chinese title
        name = li.xpath(".//a/span[@class='title'][1]/text()")[0]
        # collapse whitespace/newlines out of the director/actor line
        director_actor = "".join(li.xpath(".//div[@class='bd']/p/text()[1]")[0].replace(' ', '').replace('\n', '').replace('/', '').split())
        info = "".join(li.xpath(".//div[@class='bd']/p/text()[2]")[0].replace(' ', '').replace('\n', '').split())
        rating_score = li.xpath(".//span[@class='rating_num']/text()")[0]
        # 4th span under div.star carries the "NNN人评价" rating count
        rating_num = li.xpath(".//div[@class='star']/span[4]/text()")[0]
        introduce = li.xpath(".//p[@class='quote']/span/text()")

        # single dict literal; the original duplicated it across an if/else
        # just to vary the `introduce` field
        movies.append({
            'name': name,
            'director_actor': director_actor,
            'info': info,
            'rating_score': rating_score,
            'rating_num': rating_num,
            'introduce': introduce[0] if introduce else None,
        })
        imgurls.append(li.xpath(".//img/@src")[0])

    return movies, imgurls



def download_img(url, movie):
    """Download one poster image to G:\\爬虫数据\\movieposter\\<name>.jpg.

    Uses absolute paths throughout. The original relied on os.chdir and
    a *relative* os.mkdir('movieposter'), which created the directory
    under whatever the current working directory happened to be, so the
    later chdir into G:\\爬虫数据\\movieposter could fail on first call.
    """
    poster_dir = os.path.join(r'G:\爬虫数据', 'movieposter')
    # exist_ok replaces the hand-rolled os.listdir membership check
    os.makedirs(poster_dir, exist_ok=True)

    img = requests.get(url).content

    # NOTE(review): movie names containing characters illegal on Windows
    # (e.g. ':') would still fail here — confirm against the actual data.
    target = os.path.join(poster_dir, movie['name'] + '.jpg')
    with open(target, 'wb') as f:
        print('正在下载 : %s' % url)
        f.write(img)







if __name__ == '__main__':

    # The Top 250 list is paginated 25 movies at a time (start=0,25,...,225).
    for page in range(10):
        url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='

        html = get_html(url)
        if not html:
            # skip pages whose download failed rather than crash below
            continue
        # parse each page once; the original called parse_html(html) twice
        # per page just to pick each element of the returned tuple
        movies, imgurls = parse_html(html)

        MOVIES.extend(movies)
        IMGURLS.extend(imgurls)

    # iterate over what was actually collected instead of a hard-coded
    # range(250), which raised IndexError whenever any page was missed
    for imgurl, movie in zip(IMGURLS, MOVIES):
        download_img(imgurl, movie)

    os.chdir(r'G:\爬虫数据')
    moviedata = pd.DataFrame(MOVIES)
    moviedata.to_csv('movie.csv')
    print('电影信息成功保存到本地')

# (removed: non-code blog-page residue — screenshot placeholders and CSDN
#  paywall/reward-widget text that was accidentally captured with the article)