requests: Scraping Data with Class Methods

This post presents a Python class named Spider1 that uses the lxml library to parse HTML and the requests library to send requests. It scrapes each movie's title, country of origin, runtime, release date, and cover image URL from a movie listing page, pulls a synopsis from each detail page, and saves the data to a CSV file.
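The full script is below. get_html sends the request (the ssr3 demo site at scrape.center sits behind HTTP Basic Auth, hence auth=("admin", "admin")), get_content extracts each movie's fields from the list page with XPath, follows the detail link for the synopsis, and downloads the cover image, save_content writes the rows to CSV, and run drives pagination.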
from lxml import etree
import requests
import time
import csv
import os


class Spider1:
    # Send a request and return the decoded response body.
    # ssr3.scrape.center sits behind HTTP Basic Auth with admin/admin credentials.
    def get_html(self, url):
        response1 = requests.get(url, headers=self.headers, auth=("admin", "admin"))
        time.sleep(2)  # pause between requests to go easy on the demo site
        return response1.content.decode()

    # Parse the list page and extract the fields for each movie.
    def get_content(self, html_str):
        html = etree.HTML(html_str)
        div_list = html.xpath('//div[@class="el-col el-col-18 el-col-offset-3"]/div')
        move_infos = []
        for li in div_list:
            name = li.xpath('.//div/div/div[2]/a/h2/text()')[0]
            addr = li.xpath('.//div[@class="el-row"]/div[2]/div[2]/span[1]/text()')[0]
            time_long = li.xpath('.//div[@class="el-row"]/div[2]/div[2]/span[3]/text()')[0]
            # Some entries have no release date, so guard the lookup.
            time_agen_nodes = li.xpath('.//div[@class="el-row"]/div[2]/div[3]/span/text()')
            time_agen = time_agen_nodes[0] if time_agen_nodes else None
            img_url = li.xpath('.//div[@class="el-row"]/div[1]/a/img/@src')[0]

            # Detail page: ssr1.scrape.center serves the same data without Basic Auth.
            detail_url = 'https://ssr1.scrape.center' + li.xpath('./div/div/div[1]/a/@href')[0]
            responses1 = requests.get(detail_url, headers=self.headers)
            detail_html = etree.HTML(responses1.content.decode())  # separate tree; don't shadow the list page
            datas = detail_html.xpath('//div[@class="el-card__body"]/div/div[2]/div[4]/p/text()')[0]
            move_info = [name, addr, time_long, time_agen, img_url, datas]
            move_infos.append(move_info)

            # Cover image: download and save it under ./dyimg/
            filepath = r'./dyimg/' + name + ".png"
            response2 = requests.get(img_url, headers=self.headers)
            with open(filepath, "wb") as f:
                f.write(response2.content)

        return move_infos

    # Write one CSV row per movie.
    def save_content(self, move_infos):
        for i in move_infos:
            self.w.writerow(i)

    def run(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.0.10191 SLBChan/105'}
        os.makedirs('./dyimg', exist_ok=True)  # the image folder must exist before writing
        # Six header fields, one per value collected for each movie.
        move_title = ["Title", "Country", "Runtime", "Release Date", "Cover URL", "Synopsis"]
        with open("moves.csv", "w", encoding="utf-8", newline="") as f:
            self.w = csv.writer(f)
            self.w.writerow(move_title)
            for i in range(1, 3):  # list pages 1 and 2
                url = f"https://ssr3.scrape.center/page/{i}"
                print(url)
                html_str = self.get_html(url)
                movies = self.get_content(html_str)
                self.save_content(movies)


if __name__ == '__main__':
    st = Spider1()  # instantiate the spider
    st.run()
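One optional hardening, sketched below under assumptions not in the original code: requests.get opens a fresh connection for every call and waits indefinitely by default. A shared requests.Session reuses connections, and a timeout plus raise_for_status() turns slow or failed responses into visible errors instead of garbage HTML handed to the parser. The fetch_html helper here is illustrative, not part of the Spider1 class.

import requests


def fetch_html(session, url):
    # Reuse the session's pooled connection; fail fast on HTTP errors.
    response = session.get(url, timeout=10)
    response.raise_for_status()  # a 4xx/5xx body would otherwise be parsed as a page
    return response.text


session = requests.Session()
session.auth = ("admin", "admin")  # ssr3.scrape.center demo credentials
session.headers.update({"User-Agent": "Mozilla/5.0"})
print(fetch_html(session, "https://ssr3.scrape.center/page/1")[:200])

With a session configured this way, the per-call headers= and auth= arguments in get_html become unnecessary, since every request made through the session carries them automatically.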
