Python使用requests模块请求网址,使用lxml模块中的etree解析并抓取数据,并使用time模块进行延时。
爬取的页面为:
运行结果如下图所示:
python代码如下:
# _*_ coding:utf _*_
# 邮箱:3195841740@qq.com
# 人员:21292
# 日期:2020/3/8 11:05
# 工具:PyCharm
import requests
from lxml import etree
import re
import time
# Request headers shared by every HTTP call: a Douban session cookie plus a
# desktop-browser User-Agent.
# NOTE(review): the hard-coded cookie will expire; refresh it (and the UA if
# needed) when requests start failing or redirecting.
headers = {
'Cookie': 'll="118375"; bid=LweMDRu6xy0; __utmz=30149280.1582607485.1.1.utmcsr=sogou.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __utmz=223695111.1583572638.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __yadk_uid=sIlSb4fUzktAAB7ns01bryACK9TG0Ytt; _vwo_uuid_v2=D4D430B7FCD55769AFD16F4AB7B8A5907|ae49228565fb206135f49f584eb2c78e; __gads=ID=a3adab5ce8eafc57:T=1583573105:S=ALNI_MYInfQ1FlG09Ho82DR2aEpSSXRC_Q; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583668047%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.502674428.1582607485.1583572636.1583668047.3; __utmb=30149280.0.10.1583668047; __utmc=30149280; __utma=223695111.2100654023.1583572638.1583572638.1583668047.2; __utmb=223695111.0.10.1583668047; __utmc=223695111; _pk_id.100001.4cf6=96f806704894c344.1583572638.2.1583668060.1583573242.',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
def print_movies(movies):
    """Pretty-print one movie's fields to stdout between two star rules.

    The '主演' (cast) value is a list and gets one line per actor, with only
    the first actor carrying the key label.  Every printed field is preceded
    by a 0.5 s pause, which throttles console output.
    """
    print('*' * 100)
    for key in movies:
        time.sleep(0.5)
        if key != '主演':
            print(key, ':', movies[key])
            continue
        for idx, actor in enumerate(movies['主演']):
            time.sleep(0.5)
            if idx == 0:
                print(key, ':', actor)
            else:
                print(' ', actor)
    print("*" * 100)
def HTML_spider_detial(url):
    """Fetch *url* with the shared headers (redirects disabled) and return
    the body parsed into an lxml HTML element tree."""
    page = requests.get(url, headers=headers, allow_redirects=False)
    return etree.HTML(page.content.decode('utf-8'))
def get_detial_urls(url):
    """Scrape one Top-250 list page: collect every movie-detail link and
    crawl each one, pausing 0.5 s between requests."""
    html = HTML_spider_detial(url)
    for detail_link in html.xpath('//div[@class = "info"]/div[1]//@href'):
        print('正在请求网址:', detail_link, '中....................')
        spider_detials_url(detail_link)
        time.sleep(0.5)  # polite delay between detail-page requests
def spider_detials_url(url):
    """Crawl a single movie detail page at *url* and print its fields.

    Extracts name, year, director, rating, cast, language, release date,
    runtime and synopsis into a dict keyed by Chinese labels, then hands
    the dict to print_movies().
    """
    # Fresh dict for this movie's fields.
    movies = {}
    html = HTML_spider_detial(url)
    # Title: first <span> under the page's content header.
    movie_name = html.xpath('//div[@id = "content"]/h1/span/text()')[0]
    movies['电影名称'] = movie_name
    # Year: second header span, shown as "(YYYY)" — strip the parentheses.
    movie_year = html.xpath('//div[@id = "content"]/h1/span/text()')[1]
    movie_year = re.findall(r'[(](.*?)[)]',movie_year)[0]
    movies['年份'] = movie_year
    # Director (first linked name in the info box).
    movie_drector = html.xpath('//div[@id = "info"]/span[1]/span[2]/a/text()')[0]
    movies['导演'] = movie_drector
    # Douban rating.
    movie_value = html.xpath('//div[@class ="rating_self clearfix" ]/strong/text()')[0]
    movies['豆瓣评分'] = movie_value
    # Cast: list of actor names (may contain several entries).
    movie_actors = html.xpath('//div[@id = "info"]/span[3]/span[2]//a/text()')
    movies['主演'] = movie_actors
    # Walk every text node of the info box and pick values by their labels.
    infos = html.xpath('//div[@id = "info"]//text()')
    for index in range(0,len(infos),1):
        if infos[index] == "语言:":
            movie_language = infos[index+1]
            movies['语言'] = movie_language
        elif infos[index] == '上映日期:':
            # NOTE(review): offset of +2 here (vs +1 for language) —
            # presumably skips an intermediate text node; confirm against
            # the live page markup.
            movie_time = infos[index+2]
            movies["上映时间"] = movie_time
        elif infos[index] == "片长:":
            # Same +2 offset as the release date above.
            movie_long = infos[index+2]
            movies['片长'] = movie_long
    # Synopsis, whitespace-trimmed.
    movie_simple = html.xpath('//span[@property="v:summary"]/text()')[0].strip()
    movies['电影简介'] = movie_simple
    print_movies(movies)
if __name__ == '__main__':
    # Douban's Top 250 is paginated 25 movies per page: start=0,25,...,225.
    for offset in range(0, 250, 25):
        page_url = 'https://movie.douban.com/top250?start=' + str(offset) + '&filter='
        get_detial_urls(page_url)
修改代码,将数据储存到excel中
运行如下:
详细代码如下:
# _*_ coding:utf _*_
# 邮箱:3195841740@qq.com
# 人员:21292
# 日期:2020/3/8 11:05
# 工具:PyCharm
import requests
from lxml import etree
import re
import time
import openpyxl
# Request headers for every crawl; if scraping starts failing, refresh these
# two values (the session cookie expires and the User-Agent may get blocked).
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
'Cookie': 'll="118375"; bid=Gnw8x-tUTyQ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583715518%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=dac2da16aa651d16.1582123086.16.1583716968.1583677611.; __yadk_uid=skuIGmPsoBorvw32ahEZf6sqfam16Rtj; __utma=30149280.1846912402.1582123089.1583715518.1583728695.18; __utmz=30149280.1583677417.16.10.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1046665669.1582123089.1583677417.1583715518.16; __utmz=223695111.1583677417.15.9.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=e685b433defaf33d:T=1582374422:S=ALNI_MbPZ69DTsUEApb-_etqVfoNXEAO5g; ct=y; _vwo_uuid_v2=D630691FC9560003A3D895CB985C1B204|b22c83cc5ece5c99126b175f0bd35663; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18932; __utmc=30149280; __utmc=223695111; dbcl2="189323804:LgqGltbQVkw"; ck=p11c; __utmb=30149280.6.9.1583728745418; __utmt=1'
}
# Write the scraped data to an Excel workbook.
def save_detials_movies(movies_list, count=None):
    """Write every scraped movie to '豆瓣电影.xlsx', one row per movie.

    The workbook is rebuilt from scratch on every call, so calling this
    after each newly scraped movie keeps the file current.

    Args:
        movies_list: list of movie dicts as produced by spider_detials_url.
        count: unused; kept only for backward compatibility (the original
            signature carried a mutable-default call counter here).

    Bug fixes vs. the original: the old loop indexed movies_list by the
    Excel row number (starting at 2), which silently skipped the first two
    movies and never wrote the last two; rows and list indices are now
    tracked independently.  Optional fields ('语言', '上映时间', '片长')
    that a page may lack no longer raise KeyError.
    """
    file = openpyxl.Workbook()
    sheet = file.active
    sheet.title = "电影数据"
    # Header row in columns A..I.
    titles = ('电影名称', '年份', '导演', '豆瓣评分', '主演',
              '语言', '上映时间', '片长', '电影简介')
    for col, title in zip('ABCDEFGHI', titles):
        sheet[col + '1'] = title
    # Data rows start at Excel row 2; movies_list index runs from 0.
    for row, movie in enumerate(movies_list, start=2):
        sheet['A' + str(row)] = movie.get('电影名称', '')
        sheet['B' + str(row)] = movie.get('年份', '')
        sheet['C' + str(row)] = movie.get('导演', '')
        sheet['D' + str(row)] = movie.get('豆瓣评分', '')
        # '主演' is a list of actor names; store them space-separated.
        sheet['E' + str(row)] = ' '.join(movie.get('主演', []))
        sheet['F' + str(row)] = movie.get('语言', '')
        sheet['G' + str(row)] = movie.get('上映时间', '')
        sheet['H' + str(row)] = movie.get('片长', '')
        sheet['I' + str(row)] = movie.get('电影简介', '')
    # Persist the workbook.
    file.save('豆瓣电影.xlsx')
# Print one movie dict to the console.
def print_movies(movies):
    """Dump one movie's fields to stdout between two rules of asterisks.

    The '主演' (cast) entry is a list and is printed one actor per line,
    with the key label only on the first line.  A 0.5 s pause precedes
    every printed field to throttle output.
    """
    print('*' * 100)
    for key in movies:
        time.sleep(0.5)
        if key != '主演':
            print(key, ':', movies[key])
            continue
        for idx, actor in enumerate(movies['主演']):
            time.sleep(0.5)
            if idx == 0:
                print(key, ':', actor)
            else:
                print(' ', actor)
    print("*" * 100)
# Fetch and parse a URL, returning the parsed tree.
def HTML_spider_detial(url):
    """GET *url* with the shared headers (no redirects) and return the body
    parsed as an lxml HTML element tree."""
    resp = requests.get(url, headers=headers, allow_redirects=False)
    return etree.HTML(resp.content.decode('utf-8'))
# Extract each movie's URL from a list page and crawl it.
def get_detial_urls(url, count=[1]):
    """Scrape one Top-250 list page and crawl every movie link on it.

    NOTE: *count* deliberately uses a mutable default so the request
    counter persists across calls — do not "fix" it to None.
    """
    html = HTML_spider_detial(url)
    links = html.xpath('//div[@class = "info"]/div[1]//@href')
    for link in links:
        print('正在请求第' + str(count[0]) + '个网址:', link, '中....................')
        spider_detials_url(link)
        count[0] += 1
        time.sleep(0.5)  # polite delay between detail-page requests
# Module-level accumulator: every scraped movie dict is appended here.
movies_list = []
# Scrape one movie's data into a dict, then persist the running list.
def spider_detials_url(url):
    """Crawl a single movie detail page at *url*.

    Extracts name, year, director, rating, cast, language, release date,
    runtime and synopsis into a dict keyed by Chinese labels, appends it
    to the module-level movies_list, and rewrites the Excel file with
    everything collected so far.
    """
    # Fresh dict for this movie's fields.
    movies = {}
    html = HTML_spider_detial(url)
    # Title: first <span> under the page's content header.
    movie_name = html.xpath('//div[@id = "content"]/h1/span/text()')[0]
    movies['电影名称'] = movie_name
    # Year: second header span, shown as "(YYYY)" — strip the parentheses.
    movie_year = html.xpath('//div[@id = "content"]/h1/span/text()')[1]
    movie_year = re.findall(r'[(](.*?)[)]',movie_year)[0]
    movies['年份'] = movie_year
    # Director (first linked name in the info box).
    movie_drector = html.xpath('//div[@id = "info"]/span[1]/span[2]/a/text()')[0]
    movies['导演'] = movie_drector
    # Douban rating.
    movie_value = html.xpath('//div[@class ="rating_self clearfix" ]/strong/text()')[0]
    movies['豆瓣评分'] = movie_value
    # Cast: list of actor names.
    movie_actors = html.xpath('//div[@id = "info"]/span[3]/span[2]//a/text()')
    movies['主演'] = movie_actors
    # Walk every text node of the info box and pick values by label.
    infos = html.xpath('//div[@id = "info"]//text()')
    for index in range(0,len(infos),1):
        # Language: the node right after the "语言:" label.
        if infos[index] == "语言:":
            movie_language = infos[index+1]
            movies['语言'] = movie_language
        # Release date.
        # NOTE(review): offset of +2 here (vs +1 for language) — presumably
        # skips an intermediate text node; confirm against live markup.
        elif infos[index] == '上映日期:':
            movie_time = infos[index+2]
            movies["上映时间"] = movie_time
        # Runtime, same +2 offset as the release date.
        elif infos[index] == "片长:":
            movie_long = infos[index+2]
            movies['片长'] = movie_long
    # Synopsis, whitespace-trimmed.
    movie_simple = html.xpath('//span[@property="v:summary"]/text()')[0].strip()
    movies['电影简介'] = movie_simple
    #print_movies(movies)
    movies_list.append(movies)
    # Rewrite the Excel file with everything collected so far.
    save_detials_movies(movies_list)
# Crawler entry point.
def start():
    """Walk all ten Top-250 list pages (25 movies per page, start=0..225)."""
    for offset in range(0, 250, 25):
        page_url = 'https://movie.douban.com/top250?start=' + str(offset) + '&filter='
        get_detial_urls(page_url)
if __name__ == '__main__':
    start()
可以抓取250条电影信息。