2021-08-19

Scraping the latest Douban Top 250 movies: how to split apart the director, cast, year, region, and genre that all share a single p tag

  • 1. Set up the request headers
def request_douban(url):
    # Fetch one page and return its HTML, or None on failure.
    try:
        # Douban rejects the default python-requests User-Agent, so send a
        # browser-like one instead.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

Douban's anti-scraping checks are fairly strict these days, so the request has to carry a browser-like User-Agent header, otherwise the server refuses to return the page.
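
As a quick sanity check (my own addition, assuming network access), you can call the function once and confirm that HTML actually comes back:

html = request_douban('https://movie.douban.com/top250?start=0&filter=')
print(html is not None)  # expect True; None means the request failed or was blocked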

  • 2. Create the Excel header row
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('豆瓣电影top250')
# These labels line up, column by column, with the worksheet.write calls
# in save_to_excel below.
worksheet.write(0, 0, '排名')
worksheet.write(0, 1, '电影名')
worksheet.write(0, 2, '导演')
worksheet.write(0, 3, '主演')
worksheet.write(0, 4, '时间')
worksheet.write(0, 5, '地点')
worksheet.write(0, 6, '分类')
worksheet.write(0, 7, '评分')
worksheet.write(0, 8, '影评')
worksheet.write(0, 9, '图片')
  • 3. Use a global variable n to track which worksheet row to write next (save_to_excel is called once per page, so the counter has to persist across calls)
n = 1
  • 4. Use BeautifulSoup to extract each movie's fields and write them to the Excel sheet
def save_to_excel(soup):
    global n
    items = soup.find(class_='grid_view').find_all('li')
    for item in items:
        item_index = item.find(class_='item').find('em').string
        item_pic = item.find(class_='pic').find('img').get('src')
        item_title = item.find(class_='title').string
        item_info = parse_info(item)
        item_director = item_info[0]
        item_actor = item_info[1]
        item_year = item_info[2]
        item_spot = item_info[3]
        item_sort = item_info[4]
        item_score = item.find(class_='star').find(class_='rating_num').string
        # Not every movie has a one-line quote; default to '' so a missing
        # quote neither crashes nor inherits the previous movie's value.
        item_inq = ''
        if item.find(class_='inq') is not None:
            item_inq = item.find(class_='inq').string

        print('|'.join([item_index, item_title, item_director, item_actor,
                        item_year, item_spot, item_sort, item_score,
                        item_inq, item_pic]))

        worksheet.write(n, 0, item_index)
        worksheet.write(n, 1, item_title)
        worksheet.write(n, 2, item_director)
        worksheet.write(n, 3, item_actor)
        worksheet.write(n, 4, item_year)
        worksheet.write(n, 5, item_spot)
        worksheet.write(n, 6, item_sort)
        worksheet.write(n, 7, item_score)
        worksheet.write(n, 8, item_inq)
        worksheet.write(n, 9, item_pic)

        n += 1

Note: several of these fields live inside one and the same tag, so they have to be pulled apart. That is the job of the parse_info function, the core of the whole program; as Douban's anti-scraping measures keep improving, separating the fields has become rather fiddly.
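
For reference, the raw text that parse_info receives looks roughly like this (reconstructed from Douban's list-page markup, so treat the exact whitespace as illustrative; \xa0 is a non-breaking space):

    导演: 弗兰克·德拉邦特 Frank Darabont\xa0\xa0\xa0主演: 蒂姆·罗宾斯 Tim Robbins
     1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情

Director and cast sit on one line separated by runs of \xa0, and year / region / genres follow on the next line, separated by '\xa0/\xa0'.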

def parse_info(item):
    # Full text of the <p> tag: director/cast on one line, then
    # year / region / genres on the next.
    list_info = item.find(class_='bd').find('p').text
    # Fields are separated by runs of non-breaking spaces (\xa0). Splitting
    # on \xa0 also yields '' and '/' fragments, so keep only pieces longer
    # than one character.
    str1 = list_info.strip(' \n').split('\xa0')
    str2 = [temp for temp in str1 if len(temp) > 1]
    item_director = ''
    item_actor = ''
    item_year = ''
    if len(str2) == 3:
        # Only three pieces survive: the movie lists no cast, so the year
        # is glued to the director. Split whichever piece is longer, since
        # that is the one carrying the line break before the year.
        if len(str2[0]) > len(str2[1]):
            str3 = str2[0].split('\n ')
        else:
            str3 = str2[1].split('\n ')
        item_director = str3[0]
        item_year = str3[1].strip()
    if len(str2) > 3:
        # Normal case: piece 0 is the director, piece 1 is the cast with
        # the year glued on after a line break.
        str4 = str2[1].split('\n ')
        item_director = str2[0]
        item_actor = str4[0]
        item_year = str4[1].strip()
    # The last two pieces are always the region and the genres.
    item_spot = str2[-2]
    item_sort = str2[-1]
    return [item_director, item_actor, item_year, item_spot, item_sort]

The director/cast/year split is the tricky part. Some movies list no cast at all, in which case the year is glued to the director; in the usual case the year is glued to the cast. The two situations are told apart by the length of the piece list: a length of 3 means the cast is missing (with two sub-cases for which piece carries the glued-on year), while anything longer is the normal case. Finally all five fields are packed into one list and returned as the result.
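
To convince yourself the splitting works, here is a quick test (my own addition; the HTML is a hypothetical snippet modeled on Douban's markup, with less indentation than the real pages, and assumes BeautifulSoup is imported as in the full script):

html = ('<div class="bd"><p>'
        '导演: 弗兰克·德拉邦特\xa0\xa0\xa0主演: 蒂姆·罗宾斯<br>\n'
        ' 1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情'
        '</p></div>')
item = BeautifulSoup(html, 'lxml')
print(parse_info(item))
# ['导演: 弗兰克·德拉邦特', '主演: 蒂姆·罗宾斯', '1994', '美国', '犯罪 剧情']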

  • 5. With the functions above in place, crawling one page takes just a few lines
def main(page):
    # Each page shows 25 movies; the offset is passed via start=page*25.
    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    html = request_douban(url)
    if html is None:
        return  # the request failed or was blocked; skip this page
    soup = BeautifulSoup(html, 'lxml')
    save_to_excel(soup)
  • 6. Finally, set the number of pages to crawl and save the workbook, and we are done!
if __name__ == "__main__":
    for i in range(0, 10):  # 10 pages x 25 movies = the full top 250
        main(i)
    workbook.save('豆瓣最受欢迎的250部电影.xls')
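
One optional tweak (my own addition, not part of the original script): Douban may throttle a burst of back-to-back requests, so pausing briefly between pages is the safer way to run the loop:

import time

if __name__ == "__main__":
    for i in range(0, 10):
        main(i)
        time.sleep(1)  # pause between pages to stay under the rate limit
    workbook.save('豆瓣最受欢迎的250部电影.xls')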

The process took some real effort, but luckily the result turned out well, hehe.
Here is the complete code block:

import requests
from bs4 import BeautifulSoup
import xlwt


def request_douban(url):
    # Fetch one page and return its HTML, or None on failure.
    try:
        # Douban rejects the default python-requests User-Agent, so send a
        # browser-like one instead.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('豆瓣电影top250')
# These labels line up, column by column, with the worksheet.write calls
# in save_to_excel below.
worksheet.write(0, 0, '排名')
worksheet.write(0, 1, '电影名')
worksheet.write(0, 2, '导演')
worksheet.write(0, 3, '主演')
worksheet.write(0, 4, '时间')
worksheet.write(0, 5, '地点')
worksheet.write(0, 6, '分类')
worksheet.write(0, 7, '评分')
worksheet.write(0, 8, '影评')
worksheet.write(0, 9, '图片')

n = 1


def save_to_excel(soup):
    global n
    items = soup.find(class_='grid_view').find_all('li')
    for item in items:
        item_index = item.find(class_='item').find('em').string
        item_pic = item.find(class_='pic').find('img').get('src')
        item_title = item.find(class_='title').string
        item_info = parse_info(item)
        item_director = item_info[0]
        item_actor = item_info[1]
        item_year = item_info[2]
        item_spot = item_info[3]
        item_sort = item_info[4]
        item_score = item.find(class_='star').find(class_='rating_num').string
        # Not every movie has a one-line quote; default to '' so a missing
        # quote neither crashes nor inherits the previous movie's value.
        item_inq = ''
        if item.find(class_='inq') is not None:
            item_inq = item.find(class_='inq').string

        print('|'.join([item_index, item_title, item_director, item_actor,
                        item_year, item_spot, item_sort, item_score,
                        item_inq, item_pic]))

        worksheet.write(n, 0, item_index)
        worksheet.write(n, 1, item_title)
        worksheet.write(n, 2, item_director)
        worksheet.write(n, 3, item_actor)
        worksheet.write(n, 4, item_year)
        worksheet.write(n, 5, item_spot)
        worksheet.write(n, 6, item_sort)
        worksheet.write(n, 7, item_score)
        worksheet.write(n, 8, item_inq)
        worksheet.write(n, 9, item_pic)

        n += 1


def parse_info(item):
    # Full text of the <p> tag: director/cast on one line, then
    # year / region / genres on the next.
    list_info = item.find(class_='bd').find('p').text
    # Fields are separated by runs of non-breaking spaces (\xa0). Splitting
    # on \xa0 also yields '' and '/' fragments, so keep only pieces longer
    # than one character.
    str1 = list_info.strip(' \n').split('\xa0')
    str2 = [temp for temp in str1 if len(temp) > 1]
    item_director = ''
    item_actor = ''
    item_year = ''
    if len(str2) == 3:
        # Only three pieces survive: the movie lists no cast, so the year
        # is glued to the director. Split whichever piece is longer, since
        # that is the one carrying the line break before the year.
        if len(str2[0]) > len(str2[1]):
            str3 = str2[0].split('\n ')
        else:
            str3 = str2[1].split('\n ')
        item_director = str3[0]
        item_year = str3[1].strip()
    if len(str2) > 3:
        # Normal case: piece 0 is the director, piece 1 is the cast with
        # the year glued on after a line break.
        str4 = str2[1].split('\n ')
        item_director = str2[0]
        item_actor = str4[0]
        item_year = str4[1].strip()
    # The last two pieces are always the region and the genres.
    item_spot = str2[-2]
    item_sort = str2[-1]
    return [item_director, item_actor, item_year, item_spot, item_sort]


def main(page):
    # Each page shows 25 movies; the offset is passed via start=page*25.
    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    html = request_douban(url)
    if html is None:
        return  # the request failed or was blocked; skip this page
    soup = BeautifulSoup(html, 'lxml')
    save_to_excel(soup)


if __name__ == "__main__":
    for i in range(0, 10):  # 10 pages x 25 movies = the full top 250
        main(i)
    workbook.save('豆瓣最受欢迎的250部电影.xls')