Python crawler: scraping icons

Scrape the icons listed on pages like http://sc.adminbuy.cn/icon/list_1_2.html.
import requests
from lxml import etree
import urllib.parse
import urllib.request
import os
import time
class pa(object):
    def __init__(self):
        self.url = "http://sc.adminbuy.cn"
        self.header= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
        self.item = []
    def run_url(self):
        # Build the URL for each of the first 20 listing pages and fetch it.
        for i in range(1, 21):
            url = "http://sc.adminbuy.cn/icon/list_1_{}.html".format(i)
            book = requests.get(url, headers=self.header)
            book.encoding = 'utf-8'
            self.load(book.text)
            time.sleep(10)  # be polite: pause between listing pages
    def load(self, book):
        # Parse one listing page and extract each icon's image URL and name.
        tree = etree.HTML(book)
        film_list = tree.xpath('.//div[@class="content"]/ul/li')
        for list_link in film_list:
            # Use the <img src> (the image itself) rather than the <a href>,
            # which points at the icon's HTML detail page and is not a PNG.
            # urljoin handles both relative and absolute src values; note the
            # listing image may be a preview rather than the full-size icon.
            link_href = urllib.parse.urljoin(self.url, list_link.xpath('.//a/img/@src')[0])
            name = list_link.xpath('.//a/img/@alt')[0] + ".png"
            self.download_img(link_href, name)
            time.sleep(3)  # pause between image downloads
    def download_img(self, img_url, name):
        # Given an image's URL, download it with urllib and save it under `path`.
        request = urllib.request.Request(url=img_url, headers=self.header)
        path = r"D:\project\iconbom\Icon"
        response = urllib.request.urlopen(request)
        imgpath = os.path.join(path, name)
        print(imgpath)
        os.makedirs(path, exist_ok=True)
        with open(imgpath, "wb") as fp:
            fp.write(response.read())
if __name__ == '__main__':
    spider = pa()
    spider.run_url()
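
The script mixes two HTTP stacks: requests for the listing pages and urllib for the images. A minimal sketch of the same download step using requests alone is below; the function name, the timeout, and the status check are my additions for illustration, not part of the original script.

import os
import requests

# Sketch: the download step with requests only (illustrative replacement
# for download_img above; path and headers mirror the original script).
def download_img_requests(img_url, name, path=r"D:\project\iconbom\Icon", headers=None):
    os.makedirs(path, exist_ok=True)
    resp = requests.get(img_url, headers=headers, timeout=30)
    resp.raise_for_status()  # raise on 404/503 instead of saving an error page as .png
    with open(os.path.join(path, name), "wb") as fp:
        fp.write(resp.content)

The practical gains are a single HTTP dependency and an explicit timeout: without one, a urllib.request.urlopen call can block indefinitely on a stalled connection.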