# 爬取彼岸图网图片 — scrape wallpaper images from pic.netbian.com

import os
import requests_html
import threading

from lxml import etree


class Spider:
    """Scrape wallpaper images from pic.netbian.com and save them locally.

    For each list page, every thumbnail's detail page is fetched and the
    full-size image is downloaded to images/彼岸图网/<page>/<n>.jpg.
    """

    def __init__(self):
        # List-page URL template; {} is filled with the page number.
        self.url = 'https://pic.netbian.com/index_{}.html'
        # Site root, used to resolve the relative hrefs/srcs found in pages.
        # (Replaces the fragile self.url.replace('/index_{}.html', link) trick.)
        self.base = 'https://pic.netbian.com'
        self.session = requests_html.HTMLSession()
        # NOTE(review): hard-coded HTTP proxy — verify it is still reachable.
        self.proxies = {
            "http": "203.78.142.125:8088"
        }

    def get_html(self, start_page=11, end_page=50):
        """Fetch list pages start_page..end_page and parse each in a thread.

        The original started and immediately joined every thread, which made
        the run fully sequential. Threads are now all started first and only
        joined after the loop, so page parsing actually overlaps.

        :param start_page: first list page to fetch (inclusive, default 11)
        :param end_page: last list page to fetch (inclusive, default 50)
        """
        threads = []
        for page in range(start_page, end_page + 1):
            url = self.url.format(page)
            response = self.session.get(url=url, proxies=self.proxies)
            t = threading.Thread(target=self.parse, args=(response.text, page))
            t.start()
            threads.append(t)
        # Wait for all page workers to finish before returning.
        for t in threads:
            t.join()

    def parse(self, response, page):
        """Parse one list page's HTML and download every linked image.

        :param response: HTML text of the list page
        :param page: page number, used for the output directory and logging
        """
        tree = etree.HTML(response)
        lis = tree.xpath('//*[@id="main"]/div[3]/ul/li')

        # Create the per-page output directory once, not once per image.
        path = 'images/彼岸图网/' + str(page) + '/'
        os.makedirs(path, exist_ok=True)

        for count, li in enumerate(lis, start=1):
            # Relative link to the image's detail page; some <li> entries
            # (e.g. ads) have no link — skip them instead of crashing.
            hrefs = li.xpath('./a/@href')
            if not hrefs:
                continue
            img_url = self.base + hrefs[0]  # 第几页的页面地址
            response1 = self.session.get(url=img_url, proxies=self.proxies)
            tree1 = etree.HTML(response1.text)
            srcs = tree1.xpath('//*[@id="img"]/img/@src')
            if not srcs:
                continue
            src1 = self.base + srcs[0]  # 下载图片的地址

            print(f'开始下载第{page}页,第{count}张图片')

            with open(path + str(count) + '.jpg', 'wb') as f:
                f.write(self.session.get(url=src1, proxies=self.proxies).content)

            print(f'第{page}页,第{count}张图片下载完成')




if __name__ == '__main__':
    # Script entry point: run the scraper over its default page range.
    Spider().get_html()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值