# -*- coding:utf-8 -*-
"""Crawl the ivsky.com nature-scenery gallery and save its images to disk.

Workflow:
  1. Download the page source.
  2. Parse the page source with XPath (the hard part).
  3. Store the images under ``images/<category title>/<album title>/``.
"""
# Folder / path handling
import os

# Fetches page source / downloads pages, images, video, audio ...
import requests
# Parses the data we need out of the downloaded HTML
from lxml import etree

BASE_URL = "http://www.ivsky.com"
START_URL = "http://www.ivsky.com/tupian/ziranfengguang/"
# Seconds to wait for the server; without it requests.get() can block forever.
REQUEST_TIMEOUT = 10


def _absolute_url(link):
    """Return *link* as an absolute URL, prefixing the site root if needed."""
    if link.startswith("http"):
        return link
    if link.startswith("//"):
        # Protocol-relative link: only the scheme is missing, not the host.
        return "http:" + link
    return BASE_URL + link


def _fetch(url):
    """Download *url* and return the raw body, or None when the request fails.

    A non-200 answer is treated as a failure so that an error page is never
    parsed as a gallery page or written to disk as an image.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("request failed: %s (%s)" % (url, err))
        return None
    if response.status_code != 200:
        return None
    # Use .content (bytes) rather than .text: the two have different types.
    return response.content


def _load_page(url):
    """Download *url* and parse it into a root node; None when unavailable."""
    content = _fetch(url)
    if not content:
        return None
    return etree.HTML(content)


def _first(node, expression):
    """Return the first XPath match on *node*, or None when nothing matches.

    An XPath query always returns a list, so indexing it blindly raises
    IndexError on elements that lack the requested text or attribute.
    """
    found = node.xpath(expression)
    return found[0] if found else None


def _safe_name(text):
    """Replace path separators so *text* stays a single path component."""
    return text.replace("/", "_").replace("\\", "_")


def _download_album(album_url, folder):
    """Save every image of the album at *album_url* into *folder*.

    Pages are visited as ``<album>/index_<n>.html`` starting from n=2 until a
    page cannot be loaded or contains no images.
    """
    # Strip the trailing slash so page URLs do not contain "//index_N.html".
    page_base = album_url.rstrip("/")
    page_url = album_url
    page = 1
    while True:
        page_root = _load_page(page_url)
        if page_root is None:
            break
        # div/a/img
        img_list = page_root.xpath("//div[@class='il_img']/a/img")
        if not img_list:
            break
        for idx, img in enumerate(img_list):
            src = _first(img, "@src")
            if not src:
                continue
            data = _fetch(_absolute_url(src))
            if data is None:
                continue
            # Name the file after the alt text plus page/index so names are
            # unique (the alternative would be src.split("/")[-1]).
            alt = _first(img, "@alt") or ""
            name = _safe_name(alt + str(page) + "-" + str(idx) + ".jpg")
            # The with-block closes the file even when the write fails.
            with open(os.path.join(folder, name), "wb") as image_file:
                image_file.write(data)
        page += 1
        page_url = page_base + "/index_%s.html" % page
        print(page_url)


def main():
    """Walk every category and album of the gallery and download the images."""
    # Parse the page source into a root node.
    root = _load_page(START_URL)
    if root is None:
        print("could not load %s" % START_URL)
        return
    # Locate the category links with XPath: ul > li > a
    a_list = root.xpath("//ul[@class='tpmenu']/li/a")
    # Slice the list to skip the first entry ("all categories").
    for a in a_list[1:]:
        # text() selects the text between the tags; @href selects an attribute.
        big_title = _first(a, "text()")
        big_url = _first(a, "@href")
        if not big_title or not big_url:
            continue
        big_root = _load_page(_absolute_url(big_url))
        if big_root is None:
            continue
        for big_a in big_root.xpath("//div[@class='sline']/div/a"):
            small_title = _first(big_a, "text()")
            small_url = _first(big_a, "@href")
            if not small_title or not small_url:
                continue
            path = os.path.join(
                "images", _safe_name(big_title), _safe_name(small_title)
            )
            # Only create the folder when it is missing, so an existing
            # folder does not make the creation fail.
            if not os.path.isdir(path):
                os.makedirs(path)
            _download_album(_absolute_url(small_url), path)


if __name__ == "__main__":
    main()
基于 Python 爬取风景图片网的图片
最新推荐文章于 2024-07-04 23:15:41 发布