python 爬取图片封装代码

最新推荐文章于 2024-08-17 17:18:53 发布

q小七

最新推荐文章于 2024-08-17 17:18:53 发布

阅读量896

点赞数

本文链接：https://blog.csdn.net/qhy521qhy/article/details/85273611

版权

本文介绍了如何使用Python的requests、lxml和urllib库来爬取天堂网的图片。首先，通过调用蘑菇代理服务获取付费代理IP，并将IP和端口组装成requests可用的代理格式。接着，定义了一个IvskySpider类，用于执行爬虫任务。在主函数中，实例化该类并运行爬虫，实现图片的下载。

摘要由CSDN通过智能技术生成

import json
import os
import re
from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree

# Paid proxy API endpoint (Mogu proxy service).
# NOTE(review): the appKey credential is embedded in this URL; consider
# loading it from configuration or the environment instead of source code.
MOGU_PROXY_URL = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=fc1a46b572d54ca0a12f375eceb3b5e8&count=20&expiryDate=0&format=1&newLine=2'
# Free proxy endpoint; 192.168.x.x is a private (LAN-only) address.
FREE_PROXY_URL = 'http://192.168.221.221:5010/get/'
# Home page of the target site.
TIANTANG_INDEX_URL = 'http://www.ivsky.com/'
# Timeout used when fetching proxies (presumably seconds - TODO confirm).
GET_PROXY_TIMEOUT = 2

def get_mogu_proxies():
    """
    Request paid proxies from the Mogu proxy API.

    Fetches MOGU_PROXY_URL, reads the ``msg`` list of the JSON response and
    converts every entry into the mapping form accepted by the ``proxies``
    argument of ``requests``.

    :return: list of ``{'https': 'https://<ip>:<port>'}`` dicts; an empty
        list when the request fails, the status code is not 200, or the
        payload cannot be parsed.
    """
    try:
        resp = requests.get(MOGU_PROXY_URL, timeout=GET_PROXY_TIMEOUT)
    except requests.RequestException as e:
        # No response object exists on this path, so only the error is shown.
        print("获取代理失败", e)
        return []

    if resp.status_code != 200:
        print("获取代理失败", resp.status_code)
        return []

    try:
        raw_proxies = json.loads(resp.text)['msg']
    except (ValueError, KeyError, TypeError) as e:
        print("获取代理失败", e)
        return []

    if not isinstance(raw_proxies, list):
        # presumably the API puts an error string in 'msg' on failure - TODO confirm
        print("获取代理失败", raw_proxies)
        return []

    # Assemble the format expected by the requests `proxies` parameter.
    res_proxies = []
    for proxy in raw_proxies:
        proxy_type = 'https'
        # %s formatting works whether the port arrives as a string or an int.
        proxy_url = 'https://%s:%s' % (proxy['ip'], proxy['port'])
        res_proxies.append({proxy_type: proxy_url})
    return res_proxies

class IvskySpider(object):
    """
    Crawler that downloads album images from www.ivsky.com listing pages.

    ``run`` fetches a listing page, records the link to the next listing
    page, then visits every album linked from the current page and saves
    the images it finds into a directory named after the album title.
    """

    # Site root used to resolve relative links found in the pages.
    BASE_URL = 'http://www.ivsky.com'
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # Upper bound on the number of listing pages visited by run().
    MAX_PAGES = 9

    def __init__(self):
        """Set the start URL and request headers, then fetch the start page."""
        self.url = 'http://www.ivsky.com/tupian/ziranfengguang/index_2.html'
        self.html = None      # parsed tree of the current listing page
        self.url_get = ''     # URL of the album detail page being processed
        self.title = ''       # directory name of the album being processed
        self.count = 0        # running number of images handled so far
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        # Kept from the original flow: this performs the first page fetch.
        self.create_directry()

    @staticmethod
    def _safe_dirname(title):
        """
        Turn a page-supplied title into a single safe directory name.

        Titles come from remote HTML, so path separators are replaced and
        names that would resolve to the current or parent directory are
        rejected.

        :param title: raw title text (may be None or empty)
        :return: sanitised directory name, ``'untitled'`` as a fallback
        """
        cleaned = (title or '').strip()
        for sep in ('/', '\\'):
            cleaned = cleaned.replace(sep, '_')
        if cleaned in ('', '.', '..'):
            return 'untitled'
        return cleaned

    def get_html(self, url):
        """
        Download ``url`` and store its parsed tree in ``self.html``.

        :param url: address of the listing page to fetch
        """
        response = requests.get(url=url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        self.html = etree.HTML(response.text)

    def create_directry(self):
        """Fetch the start page; no directory is created at this point."""
        self.get_html(self.url)

    def parse_html(self):
        """
        Visit every album linked from the current listing page.

        Each anchor supplies both the detail-page link and the album title,
        so a title can never be matched with another album's link. The
        album directory is created before its images are downloaded.
        """
        anchors = self.html.xpath('//ul/li/div[@class="il_img"]/a')
        for anchor in anchors:
            href = anchor.get('href')
            if not href:
                continue
            self.title = self._safe_dirname(anchor.get('title'))
            os.makedirs(self.title, exist_ok=True)
            self.url_get = urljoin(self.BASE_URL, href)
            self.html_content()

    def html_content(self, path=None):
        """
        Download every image found on the album page ``self.url_get``.

        Files are written to ``<self.title>/<count>.jpg`` where ``count`` is
        the spider-wide running image number.

        :param path: unused; accepted only so existing callers keep working
        """
        response = requests.get(url=self.url_get, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        # Parsed into a local so the listing page in self.html stays intact.
        detail = etree.HTML(response.text)
        pictures = detail.xpath('//ul/li/div[@class="il_img"]//img/@src')

        for pic in pictures:
            self.count += 1
            print('正在下载第%s张图片，请稍后。。。。' % self.count)
            target = os.path.join(self.title, '%s.jpg' % self.count)
            # urljoin leaves absolute URLs untouched and resolves
            # scheme-relative or path-relative ones against the album page.
            pic_url = urljoin(self.url_get, pic)
            try:
                request.urlretrieve(pic_url, target)
            except OSError as e:
                # One broken image should not abort the whole crawl.
                print('下载失败', pic_url, e)

    def index_html(self):
        """
        Advance ``self.url`` to the next listing page, if there is one.

        :return: True when a next-page link was found and ``self.url`` was
            updated, False when the current page has no such link
        """
        page = '//div[@class="pagelist"]/a[@class="page-next"]/@href'
        page_next = self.html.xpath(page)
        if not page_next:
            return False
        # The last match wins when several links are present.
        self.url = urljoin(self.BASE_URL, page_next[-1])
        print(self.url)
        return True

    def run(self):
        """Crawl up to MAX_PAGES listing pages, stopping at the last page."""
        for _ in range(self.MAX_PAGES):
            print('-' * 12)
            self.get_html(url=self.url)
            # The next-page link is read before parse_html processes the
            # albums of the page that was just fetched.
            has_next = self.index_html()
            self.parse_html()
            if not has_next:
                break

if __name__ == '__main__':
    # Constructing the spider already fetches the start page; run() crawls.
    ivsky = IvskySpider()
    ivsky.run()
这是爬取天堂网图片的代码：使用requests、os等包获取网页中的图片信息，然后把图片下载到本地。
在这里插入图片描述