Python 捕捉数据

douban(豆瓣)数据捕获

import requests
from lxml import etree
from openpyxl import Workbook
#etree 模块提供了一组功能,用于解析、构建和操作XML和HTML文档,提供了比标准库更高效的XML处理方法。
def get_douban_movies(url, timeout=10):
    """Fetch one Douban Top 250 listing page and extract its movie entries.

    Each ``<div class="item">`` element is parsed on its own, so an entry
    that lacks a quote cannot shift the quotes of the entries after it.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(rank, name, score, quote)`` tuples in page order, where
        ``rank`` is an int, ``score`` is a float, and ``quote`` is ``''``
        when the entry has none.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on a block/error page instead of quietly parsing it.
    response.raise_for_status()

    html = etree.HTML(response.text)
    if html is None:
        # lxml yields None when the document is empty.
        return []

    movies = []
    for item in html.xpath('//div[@class="item"]'):
        ranks = item.xpath('.//em/text()')
        names = item.xpath('.//span[@class="title"][1]/text()')
        scores = item.xpath('.//span[@class="rating_num"]/text()')
        if not (ranks and names and scores):
            # Not a complete movie entry; skip rather than guess.
            continue
        quotes = item.xpath('.//p//span/text()')
        movies.append((
            int(ranks[0]),                 # rank text -> integer
            names[0],
            float(scores[0]),              # rating text -> float
            quotes[0] if quotes else '',   # some entries have no quote
        ))
    return movies

def save_to_excel(movies, filename="豆瓣电影250数据爬取表.xlsx"):
    """Write movie rows to an Excel workbook.

    Args:
        movies: Iterable of row sequences, each appended as one worksheet
            row beneath the header.
        filename: Path of the ``.xlsx`` file to write. Defaults to the
            file name the script has always used.
    """
    wb = Workbook()
    try:
        # A new Workbook starts with one sheet; use it as the target.
        ws = wb.active
        ws.append(['排名','电影名称','评分','经典语句'])  # header row
        for movie in movies:
            ws.append(movie)
        wb.save(filename)
    finally:
        # Close the workbook even if appending or saving fails.
        wb.close()
    #关闭网站

if __name__ == '__main__':
    # Download every page of the Top 250 list and export the rows to Excel.
    base_url = "https://movie.douban.com/top250"
    data = []
    # The page offsets are 0, 25, ..., 225 (25 entries per request).
    # The stop value is 250, not 251: start=250 is past the last entry and
    # would only cost an extra request.
    for i in range(0, 250, 25):
        url = f"{base_url}?start={i}"
        data += get_douban_movies(url)
    print(data)
    save_to_excel(data)

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值