写这篇文章只是为了记录一下自己的小小成就。代码还有很多不足,毕竟只是业余时间写着玩的;如果你喜欢的话,请给博主点个赞再走吧。
效果图:
代码截图:
代码
import requests
from lxml import etree
import re,os
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The response encoding is replaced by ``apparent_encoding`` (the encoding
    guessed from the body) before decoding, so pages whose HTTP headers do
    not declare a charset still decode correctly.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the literal string "产生异常" when the
        request fails (connection error, 30-second timeout, invalid URL, or
        a 4xx/5xx status). Callers must check for that sentinel.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into HTTPError
    except requests.RequestException:
        # Only request failures are reported through the sentinel; anything
        # else (Ctrl-C, programming errors) is allowed to propagate.
        return "产生异常"
    r.encoding = r.apparent_encoding
    return r.text
def getXpath(html):
    """Extract the thumbnail, caption and link of every entry on a listing page.

    The entries are the ``<li><a>`` items inside ``<ul class="d1 ico3">``.

    Args:
        html: HTML source of the listing page.

    Returns:
        An iterator of ``(src, alt, href)`` tuples, or the literal string
        "处理出现错误!" when the document cannot be parsed or queried.
        Callers must check for that sentinel before iterating.
    """
    anchor = '//ul[@class="d1 ico3"]/li/a'
    try:
        tree = etree.HTML(html)
        src = tree.xpath(anchor + '/img/@src')
        alt = tree.xpath(anchor + '/img/@alt')
        href = tree.xpath(anchor + '/@href')
    except Exception:
        # Covers parser errors and a ``None`` tree from an empty document,
        # without swallowing KeyboardInterrupt/SystemExit.
        return "处理出现错误!"
    # NOTE(review): the three lists are paired purely by position, so an
    # anchor lacking an <img> (or an attribute) would shift later rows.
    return zip(src, alt, href)
BASE_URL = "https://www.tupianzj.com"
LIST_URL = "https://www.tupianzj.com/meinv/mm/pxmnt/"
SAVE_DIR = "D:\\test\\"
# Sentinel string that getHTMLText returns instead of page text on failure.
FETCH_FAILED = "产生异常"


def safe_filename(title):
    """Return *title* with characters that Windows forbids in file names replaced by "_"."""
    return re.sub(r'[\\/:*?"<>|]', "_", title)


def page_url(gallery_url, page):
    """Return the address of page *page* of a gallery.

    Page 1 is the gallery URL itself; later pages insert ``_<page>`` before
    the ``.html`` suffix (``12345.html`` -> ``12345_2.html``).
    """
    if page <= 1:
        return gallery_url
    stem = re.sub(r"\.html?$", "", gallery_url)
    return "{}_{}.html".format(stem, page)


def page_count(texts):
    """Return the first integer found in the pagination link texts, or 0.

    *texts* is the ``.text`` of each pagination anchor and may contain
    ``None`` for anchors without text.
    """
    for text in texts:
        found = re.search(r"\d+", text or "")
        if found:
            return int(found.group())
    return 0


def fetch_tree(url):
    """Fetch *url* and parse it with lxml; return None if either step fails."""
    html = getHTMLText(url)
    if not html or html == FETCH_FAILED:
        return None
    try:
        return etree.HTML(html)
    except (ValueError, etree.LxmlError):
        return None


def save_image(image_url, path):
    """Download *image_url* into *path*; return True on success.

    The file is only created after the download succeeded, so a failed
    request never leaves an empty file behind.
    """
    try:
        resp = requests.get(image_url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        return False
    with open(path, "wb") as f:
        f.write(resp.content)
    return True


def main(save_dir=SAVE_DIR):
    """Download the big picture of every page of every gallery on the listing.

    Files are written to *save_dir* (created if missing) and named
    ``<gallery caption><page number>.jpg``.
    """
    os.makedirs(save_dir, exist_ok=True)

    galleries = getXpath(getHTMLText(LIST_URL))
    if isinstance(galleries, str):
        # getXpath signals failure by returning a message instead of rows.
        print(galleries)
        return

    for _thumb, caption, href in galleries:
        gallery_url = BASE_URL + href
        first_page = fetch_tree(gallery_url)
        if first_page is None:
            continue

        anchors = first_page.xpath('//div[@class="pages"]//li/a')
        # Presumably the first numbered pagination link holds the total page
        # count; verify against the site's markup.
        total = page_count([a.text for a in anchors])

        # Pages run from 1 to total inclusive; a gallery without pagination
        # still has its first page.
        for page in range(1, max(total, 1) + 1):
            if page == 1:
                tree = first_page  # already downloaded above
            else:
                tree = fetch_tree(page_url(gallery_url, page))
            if tree is None:
                continue
            sources = tree.xpath('//img[@id="bigpicimg"]/@src')
            if not sources:
                continue
            name = safe_filename(caption + str(page)) + ".jpg"
            save_image(sources[0], os.path.join(save_dir, name))


if __name__ == "__main__":
    main()