豆瓣电影Top250爬取

豆瓣电影Top250爬取

爬取时间:2020年8月6日
开发工具:PyCharm
技术路线:requests-bs4-re-xlwt
Url:https://movie.douban.com/top250
作者:YRH

如需转载,请标明出处

# -*- coding: utf-8 -*-
# Author : YRH
# Date : 
# Project : 
# Tool : PyCharm

import requests
from bs4 import BeautifulSoup
import re
import xlwt

# Request headers passed to every requests.get call in getHtml; the
# user-agent value is a desktop Chrome 84 string.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}


#爬取网页
def getHtml(Url, timeout=10):
    """Download a page and return its text, or ``None`` if the request fails.

    Args:
        Url: Address to fetch; the module-level ``headers`` are sent with it.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded page text with the characters U+00EE, U+00F4 and U+00FB
        removed, or ``None`` when the request fails or the server answers
        with an HTTP error status.
    """
    try:
        rep = requests.get(Url, headers=headers, timeout=timeout)
        rep.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "page unavailable";
        # programming errors and Ctrl-C are no longer swallowed.
        print("提取网页失败")
        return None

    # Use the encoding detected from the body rather than the one declared
    # in the response headers.
    rep.encoding = rep.apparent_encoding
    # Per the original author's note: a few characters in the page could not
    # be handled later on, so they are dropped before the text is returned.
    return rep.text.replace(u'\xee', u'').replace(u'\xf4', u'').replace(u'\xfb', u'')

#解析数据
def parser(Html, info):
    """Extract one record per movie entry from a page and append it to ``info``.

    Args:
        Html: Page markup. ``None`` or an empty string (what a failed
            download yields) is ignored instead of raising.
        info: List that is extended in place with one
            ``[title, director, actor, time, site, plot, grade, comment]``
            list for every ``<li>`` found after the ``ol.grid_view`` element.
            A field that cannot be extracted is stored as ``" "``.

    Returns:
        None. Results are delivered through ``info``.
    """
    # A failed download must not abort the whole crawl.
    if not Html:
        return

    # NOTE(review): the '<br>...</br>' and '\d{4}/.../' patterns below
    # presumably rely on how the 'xml' tree builder serializes the page --
    # confirm before switching to another parser.
    soup = BeautifulSoup(Html, 'xml')
    grid = soup.find("ol", class_="grid_view")
    if grid is None:
        # Page without the ranking list (e.g. an error or block page).
        return

    # A missing tag shows up as AttributeError (None.attr) and a missing
    # match as IndexError; anything else is a real bug and should surface.
    missing = (AttributeError, IndexError)

    for li in grid.find_all_next("li"):
        # Movie title: first <span class="title"> inside the entry.
        try:
            title = li.find_all("span", class_='title')[0].string
        except missing:
            title = " "

        # Director, lead actor, year, location and the last "/"-separated
        # field all live in the entry's first <p>; they are pulled out of
        # its serialized text with regular expressions.
        # (The original passed a misspelled ``clsaa_=""`` filter, which
        # matched any <p>; the plain lookup states that explicitly.)
        data = li.find("p")
        text = str(data)

        # Director
        try:
            director = re.findall(r'.*?导演: (.*?) .*?', text)[0]
        except missing:
            director = " "

        # Lead actor
        try:
            actor = re.findall(r'.*?主演: (.*?)/.*?', text)[0]
        except missing:
            actor = " "

        # Release year: first run of four digits.
        try:
            time = re.findall(r'(\d{4})', text)[0]
        except missing:
            time = " "

        # Location: the field between the first two "/" after the year.
        try:
            site = re.findall(r'.*?(\d{4})/(.*?)/', text)[0][1]
        except missing:
            site = " "

        # "Plot" column: last "/"-separated field of the <br> section, with
        # spaces and the literal word "剧情" removed.
        try:
            plot = re.findall(r'<br>(.*?)</br>', text, re.S)
            plot = re.subn(" ", "", plot[0])[0]
            plot = plot.split("/")
            plot = plot[len(plot) - 1].replace("剧情","")
        except missing:
            plot = " "

        # Rating
        try:
            grade = li.find("span", class_='rating_num').string
        except missing:
            grade = " "

        # Number of ratings: fourth <span> from the star block onwards.
        try:
            comment = li.find("div", class_='star').find_all_next("span")[3].string
        except missing:
            comment = " "

        info.append([title, director, actor, time, site, plot, grade, comment])

def save(data):
    """Write the collected records to an .xls workbook in the working directory.

    Args:
        data: Iterable of rows; each row is a sequence of cell values in the
            same order as the header written on the first line.
    """
    print("save.....")
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("sheet1")

    # Row 0 holds the column captions; write(row, column, value).
    captions = ["电影名","导演","主演","出版时间","地点","剧情","评分","评论人数"]
    for col, caption in enumerate(captions):
        sheet.write(0, col, caption)

    # Records start on row 1, one spreadsheet row per record.
    for row, record in enumerate(data, start=1):
        print("成功抓取第"+str(row)+"部")
        for col, value in enumerate(record):
            sheet.write(row, col, value)

    book.save("豆瓣电影排名top250.xls")


if __name__ == '__main__':
    # Walk the listing through its ?start= offsets (0, 25, ..., 225),
    # collect every page's records into one list, then write them out.
    info = []
    for start in range(0, 226, 25):
        url = "https://movie.douban.com/top250" + "?start=" + str(start)
        html = getHtml(url)
        parser(html, info)
    save(info)
  • 3
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值