# 爬取美女高清图片
import os
import re
import time
import urllib
import urllib.request

import requests
from lxml import etree
# https://www.tupianzj.com/meinv/xinggan/list_176_2.html
# https://www.tupianzj.com/meinv/xinggan/list_176_3.html
# https://www.tupianzj.com/meinv/xinggan/list_176_4.html
# Browser-like headers passed to the requests.get calls in this script.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.90 Safari/537.36"
    ),
}
# url0 = "https://www.tupianzj.com/meinv/xinggan/list_176_1.html"
# data0 = requests.get(url0, headers=header)
# data0.encoding = 'utf-8'
# html0 = etree.HTML(data0.text)
# page0 = html0.xpath("//div[@class='pages']/ul/li/span[@class='pageinfo']/text()")
#
# print(page0)
# --- Crawl settings -------------------------------------------------------
BASE_URL = "https://www.tupianzj.com"
# Listing pages look like .../list_176_1.html, .../list_176_2.html, ...
LIST_URL_TEMPLATE = BASE_URL + "/meinv/xinggan/list_176_{}.html"
FIRST_LIST_PAGE = 1
LAST_LIST_PAGE = 331
SAVE_DIR = "E:/spider/picture/"
# Seconds to wait for the server; without a timeout a stalled connection
# would block the crawl forever.
REQUEST_TIMEOUT = 10


def fetch_html(url):
    """Download *url* and return the response parsed by ``etree.HTML``.

    The response is decoded as UTF-8 and requested with the module-level
    ``header`` dict.
    """
    response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def parse_page_count(pager_texts):
    """Return the page count found in a gallery's pager text.

    Args:
        pager_texts: list of strings produced by the pager XPath query; the
            first run of digits in the first string is taken as the count.

    Returns:
        The count as an int, or 1 when the list is empty or holds no digits,
        so that at least the gallery's first page is still processed.
    """
    if not pager_texts:
        return 1
    digits = re.search(r"\d+", pager_texts[0])
    return int(digits.group()) if digits else 1


def build_page_url(gallery_url, page_number):
    """Return the URL of page *page_number* (1-based) of a gallery.

    Page 1 is ``<stem>.html``; later pages are ``<stem>_<n>.html``, e.g.
    ``.../226412.html``, ``.../226412_2.html``, ``.../226412_3.html``.
    """
    # Strip only the final extension, so the number of dots earlier in the
    # URL does not matter.
    stem = gallery_url.rsplit('.', 1)[0]
    if page_number == 1:
        return stem + '.html'
    return f"{stem}_{page_number}.html"


def image_path(list_page, gallery_index, page_number):
    """Return the file path for one downloaded image.

    Files are named ``<list page>.<gallery number>.<page number>.jpg`` where
    the gallery number is 1-based (*gallery_index* is 0-based).
    """
    return SAVE_DIR + f"{list_page}.{gallery_index + 1}.{page_number}.jpg"


def download_gallery(list_page, gallery_index, gallery_href):
    """Download the image from every page of a single gallery.

    Args:
        list_page: number of the listing page the gallery was found on.
        gallery_index: 0-based position of the gallery on that listing page.
        gallery_href: site-relative link to the gallery's first page.
    """
    gallery_url = BASE_URL + str(gallery_href)
    first_tree = fetch_html(gallery_url)
    page_count = parse_page_count(
        first_tree.xpath("//div[@class='pages']/ul/li[1]/a/text()")
    )

    for page_number in range(1, page_count + 1):
        page_url = build_page_url(gallery_url, page_number)
        # The first page was already fetched to read the pager; reuse it
        # instead of requesting the same URL a second time.
        if page_url == gallery_url:
            tree = first_tree
        else:
            tree = fetch_html(page_url)
        image_srcs = tree.xpath("//div[@id='bigpic']/a[2]/img/@src")
        time.sleep(1)  # throttle requests to the site
        if not image_srcs:
            # Skip pages without the expected image element rather than
            # aborting the whole crawl with an IndexError.
            print("no image found on " + page_url)
            continue
        urllib.request.urlretrieve(
            image_srcs[0],
            image_path(list_page, gallery_index, page_number),
        )


def main():
    """Walk every listing page and download all galleries linked from it."""
    # urlretrieve does not create missing directories.
    os.makedirs(SAVE_DIR, exist_ok=True)

    for list_page in range(FIRST_LIST_PAGE, LAST_LIST_PAGE + 1):
        url = LIST_URL_TEMPLATE.format(list_page)
        tree = fetch_html(url)
        print(url)
        gallery_hrefs = tree.xpath("//ul[@class='list_con_box_ul']/li/a/@href")
        print(gallery_hrefs)
        print(f"第{list_page}页的图片网址已提取完毕")
        time.sleep(3)

        for gallery_index, gallery_href in enumerate(gallery_hrefs):
            download_gallery(list_page, gallery_index, gallery_href)


if __name__ == "__main__":
    main()
# python -- scraping images
# (blog page footer) Latest recommended article published 2022-09-06 19:00:51