# Scrape the icons listed on http://sc.adminbuy.cn/icon/list_1_*.html
import requests
from lxml import etree
import urllib.parse
import urllib.request
import os
import time


class pa(object):
    """Spider that walks the icon listing pages of sc.adminbuy.cn and
    downloads each entry into a fixed local folder."""

    def __init__(self):
        # Site root, used to resolve the relative hrefs found on list pages.
        self.url = "http://sc.adminbuy.cn"
        # Browser-like User-Agent so the server does not reject the requests.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
        self.item = []

    def run_url(self):
        """Fetch listing pages list_1_1.html .. list_1_19.html and hand each
        page's HTML to load()."""
        for i in range(1, 20):  # pages 1..19 (upper bound of range is exclusive)
            url = "http://sc.adminbuy.cn/icon/list_1_{}.html".format(i)
            # Send the same UA header used for the downloads; add a timeout
            # so a stalled server cannot hang the crawl forever.
            resp = requests.get(url, headers=self.header, timeout=30)
            resp.encoding = "utf-8"
            self.load(resp.text)
            time.sleep(10)  # be polite: throttle between listing pages

    def load(self, book):
        """Parse one listing page and download every entry found in it.

        book -- the page HTML as a str.
        """
        tree = etree.HTML(book)
        film_list = tree.xpath('.//div[@class="content"]/ul/li')
        for list_link in film_list:
            # NOTE(review): this follows the <a href> (the detail-page URL),
            # not the <img src>, yet saves the result as ".png" — the saved
            # file may actually be HTML. Confirm against the site, or switch
            # to './/a/img/@src' (the author's commented-out alternative).
            link_href = self.url + list_link.xpath('.//a/@href')[0]
            name = list_link.xpath('.//a/img/@alt')[0] + ".png"
            self.download_img(link_href, name)
            time.sleep(3)  # throttle between individual downloads

    def download_img(self, *args):
        """Download args[0] (a URL) and save it as args[1] (a file name)
        inside the fixed target directory."""
        path = r"D:\project\iconbom\Icon"
        # Create the target directory (including missing parents) up front,
        # before touching the network; os.mkdir would fail on missing parents
        # and the original checked only after opening the connection.
        os.makedirs(path, exist_ok=True)
        imgpath = os.path.join(path, args[1])
        print(imgpath)
        request = urllib.request.Request(url=args[0], headers=self.header)
        # Context managers guarantee both the HTTP response and the output
        # file are closed even on error (the original leaked the response).
        with urllib.request.urlopen(request) as response, open(imgpath, "wb") as fp:
            fp.write(response.read())


if __name__ == '__main__':
    spider = pa()
    spider.run_url()
python 爬虫采集 图标
最新推荐文章于 2024-08-05 11:10:25 发布