爬取豆瓣top250上的前十页的电影,评分,标题,并且转化成excel的格式保存


# https://movie.douban.com/top250?start=0&filter= 第一页
# https://movie.douban.com/top250?start=25&filter= 第二页
# https://movie.douban.com/top250?start=50&filter= 第三页

# (page-1)*25
# a = 1
# {}.format()
import requests
from lxml import etree
import csv # 多种形式导入导出数据的方式 excel打开

doubanUrl = 'https://movie.douban.com/top250?start={}&filter='

# 获取网页源码

def getSource(url):
    """Fetch the HTML source of one Douban top250 list page.

    Args:
        url: Full URL of the list page to download.

    Returns:
        The decoded HTML text of the response.
    """
    # Anti-scraping measure: send a browser User-Agent so Douban serves the page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    # timeout keeps the crawler from hanging indefinitely on a stalled connection
    response = requests.get(url, headers=headers, timeout=10)

    # Force UTF-8 so Chinese titles decode correctly regardless of what
    # charset requests guesses from the response headers.
    response.encoding = 'utf-8'

    return response.text

# 爬取 标题 评分 详情页的地址 引言
def getEveryItem(source):
    """Extract title, rating, detail-page URL and quote for each movie.

    Args:
        source: HTML text of one top250 list page.

    Returns:
        A list of dicts with keys 'title', 'url', 'star', 'quote'.
    """
    html_element = etree.HTML(source)

    # Each movie's information lives in a <div class="info"> node.
    movieItemList = html_element.xpath('//div[@class="info"]')

    # Result shape: [{'title': ..., 'star': ..., ...}, {...}, ...]
    movieList = []

    for each_movie in movieItemList:
        # Holds title / rating / detail URL / quote for one movie.
        movieDict = {}

        title = each_movie.xpath('div[@class="hd"]/a/span[@class="title"]/text()')  # main title(s)
        otherTitle = each_movie.xpath('div[@class="hd"]/a/span[@class="other"]/text()')  # alternate title
        link = each_movie.xpath('div[@class="hd"]/a/@href')[0]  # detail-page URL
        star = each_movie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
        quote = each_movie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')  # tagline, may be absent

        # Some movies have no tagline; fall back to an empty string.
        quote = quote[0] if quote else ''

        movieDict['title'] = ''.join(title + otherTitle)
        movieDict['url'] = link
        movieDict['star'] = star
        movieDict['quote'] = quote

        movieList.append(movieDict)

    # Fix: the original printed the (growing) list inside the loop, producing
    # O(n^2) debug output; print the page's movies once after collecting.
    print(movieList)

    return movieList


# 写入数据

def writeData(movieList):
    """Save the scraped movies to doubanmv.csv in an Excel-friendly encoding.

    Args:
        movieList: List of dicts with keys 'title', 'star', 'quote', 'url'.
    """
    # utf-8-sig writes a BOM so Excel on Windows detects UTF-8 and renders
    # Chinese text correctly (plain utf-8 shows mojibake there).
    # newline='' lets the csv module manage line endings itself.
    with open('doubanmv.csv', 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()  # header row
        writer.writerows(movieList)

if __name__ == '__main__':
    # Collect all ten list pages (25 movies per page), then write once.
    all_movies = []

    for page in range(10):
        # Offset for this page: page 0 -> start=0, page 1 -> start=25, ...
        page_url = doubanUrl.format(page * 25)

        # Download the page's raw HTML.
        page_html = getSource(page_url)

        # Parse it and accumulate the rows.
        all_movies.extend(getEveryItem(page_html))

    # Persist everything to CSV.
    writeData(all_movies)
   

然后以 Excel 可打开的 CSV 格式保存,运行后的结果如下:

(此处为结果截图,图片未能显示)

import requests
import csv
from lxml import  etree
# 1.获取网页源代码
#用来把html文件转化成xml对象 html_element = etree.HTML(html)
# 2.查找网页数据
# 3.以csv的格式来保存
class A:
    """Scraper for Douban Top250: title, tagline, rating and credit line to CSV."""

    def __init__(self):
        # CSV column order; 'writer' holds the raw director/actor credit line.
        self.header = ('title', 'forward', 'point', 'writer')
        self.url = 'https://movie.douban.com/top250?start={}&filter='
        # Browser User-Agent so Douban does not reject the request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57'}

    def getsource(self, url):
        """Download one list page and return its decoded HTML text."""
        # timeout prevents the crawl from hanging forever on a stalled connection
        response = requests.get(url, headers=self.headers, timeout=10)
        html = response.content.decode('utf-8')
        return html

    def getlocal(self, html):
        """Parse one page's HTML into a list of movie dicts."""
        html_element = etree.HTML(html)

        # One <div class="info"> node per movie.
        movielist = html_element.xpath("//div[@class='info']")

        list1 = []
        for x in movielist:
            dict1 = {}
            title = x.xpath("div[@class='hd']/a/span[@class='title']/text()")[0]
            forward = x.xpath("div[@class='bd']/p/span[@class='inq']/text()")  # tagline, may be absent
            point = x.xpath("div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()")[0]
            writer = x.xpath("div[@class='bd']/p[@class]/text()")[0]  # raw credit line (keeps whitespace)

            # Movies without a tagline get an empty string.
            forward = forward[0] if forward else ''

            dict1['title'] = title
            dict1['forward'] = forward
            dict1['point'] = point
            dict1['writer'] = writer

            list1.append(dict1)

        return list1

    def writer(self, list1):
        """Write all collected rows to top250.csv (Excel-friendly encoding)."""
        # utf-8-sig adds a BOM so Excel on Windows detects UTF-8 and renders
        # Chinese text correctly (plain utf-8 shows mojibake there).
        with open('top250.csv', 'w', encoding='utf-8-sig', newline='') as f:
            Dwriter = csv.DictWriter(f, self.header)
            Dwriter.writeheader()
            Dwriter.writerows(list1)

    def main(self):
        """Crawl all ten list pages and save the combined result."""
        list2 = []
        for i in range(10):
            url = self.url.format(i * 25)  # offset of page i
            rem = self.getsource(url)
            rep = self.getlocal(rem)
            list2 = list2 + rep
        # Write once after all pages have been collected.
        self.writer(list2)

if __name__ == '__main__':
    # Entry point: build the scraper and run the full ten-page crawl.
    A().main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

最低调的奢华

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值