import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Request headers sent with page fetches; the user-agent string identifies
# the client as a desktop Edge/Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50'
    ),
}

# Listing page that the crawl starts from.
domain = 'https://www.umei.cc/meinvtupian/siwameinv/'
# Fetch the `domain` page and extract the URL of every gallery listed on it
def get_html(url, timeout=10):
    """Download *url* and return its body as text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8 regardless of the charset the server declares.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    res = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an HTTP error page to the HTML parser.
    res.raise_for_status()
    res.encoding = 'utf-8'
    return res.text
def get_imgs_urls(html):
    """Extract the absolute URL of every gallery linked from a listing page.

    Each gallery is located through its title block
    (``div.item.masonry_brick > div.item_b.clearfix > div.title``) and the
    first ``span/a`` link inside it.

    Args:
        html: Markup of the listing page as text.

    Returns:
        A list of gallery URLs in document order. Title blocks that contain
        no link are skipped; empty input yields an empty list.
    """
    if not html:
        return []
    et = etree.HTML(html)
    infos = et.xpath(
        '//div[@class="item masonry_brick"]'
        '/div[@class="item_b clearfix"]'
        '/div[@class="title"]'
    )
    imgs_urls = []
    for info in infos:
        hrefs = info.xpath('./span/a/@href')
        if not hrefs:
            # A title block without an anchor has nothing to follow.
            continue
        # urljoin gives the same result as plain concatenation for
        # root-relative hrefs, and also copes with absolute or slash-less ones.
        imgs_urls.append(urljoin('https://www.umei.cc', hrefs[0]))
    return imgs_urls
# Captures the numeric page index from a link ending in ``_<n>.htm``.
_PAGE_INDEX = re.compile(r'_(?P<max>\d+)\.htm')


def _last_page_number(pager_hrefs):
    """Return the page index found in the last pager link, or 1 if there is none."""
    if not pager_hrefs:
        return 1
    found = _PAGE_INDEX.search(pager_hrefs[-1])
    return int(found.group('max')) if found else 1


def get_img_url(imgs_url):
    """Collect the image sources from every page of one gallery.

    The number of pages is read from the last link of the gallery's pager
    (``div.pages``); that link is assumed to point at the final page --
    TODO confirm against the site markup. A gallery without a pager is
    treated as a single page.

    Args:
        imgs_url: URL of the gallery's first page, expected to end in
            ``.htm``. Further pages are addressed as ``<stem>_<n>.htm``.

    Returns:
        A list of non-empty lists: one entry per page on which an image
        was found, holding the ``src`` values of ``div.big-pic/a/img``.
        Pages are visited from the last one down to the second, followed
        by the first page.

    Raises:
        Whatever ``get_html`` raises when a page cannot be fetched.
    """
    img_xpath = '//div[@class="big-pic"]/a/img/@src'

    first_page = etree.HTML(get_html(imgs_url))
    last_page = _last_page_number(
        first_page.xpath('//div[@class="pages"]/ul/li/a/@href')
    )

    if imgs_url.endswith('.htm'):
        stem = imgs_url[:-len('.htm')]
    else:
        stem = imgs_url

    img_urls = []
    for page_no in range(last_page, 1, -1):
        page = etree.HTML(get_html(f'{stem}_{page_no}.htm'))
        srcs = page.xpath(img_xpath)
        if srcs:
            # Pages without an image are skipped so callers can index [0].
            img_urls.append(srcs)

    # The first page was already downloaded above; reuse it, not re-fetch.
    first_srcs = first_page.xpath(img_xpath)
    if first_srcs:
        img_urls.append(first_srcs)
    return img_urls
def download(img_url):
    """Download one image and save it under the local ``imgs/`` directory.

    Plain-HTTP and protocol-relative (``//host/...``) URLs are upgraded to
    HTTPS before the request. The file is named after the last ``/``-separated
    segment of the URL, and ``imgs/`` is created if it does not exist yet.

    Args:
        img_url: Address of the image.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status (so an error page is never saved as an image).
    """
    if img_url.startswith('http://'):
        img_url = 'https://' + img_url[len('http://'):]
    elif img_url.startswith('//'):
        img_url = 'https:' + img_url

    # Send the same browser-like headers as the page requests.
    resp = requests.get(url=img_url, headers=headers, timeout=10)
    try:
        resp.raise_for_status()
        name = img_url.split('/')[-1]
        os.makedirs('imgs', exist_ok=True)
        with open(f'imgs/{name}', 'wb') as f:
            f.write(resp.content)
    finally:
        # Return the connection to the pool even when the status check or
        # the file write fails.
        resp.close()
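# 1. Collect the URL of every gallery on the listing page
# 2. Visit each gallery
# 3. Collect every image link in the current gallery
# 4. Download each image (all files are saved into the single imgs/ directory)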
def run():
    """Crawl the listing page at ``domain`` and download every gallery image.

    The gallery URLs are printed once they have been collected. A gallery or
    image that fails with a network/HTTP error is reported and skipped so a
    single bad request does not abort the whole crawl.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched.
    """
    html = get_html(domain)
    imgs_urls = get_imgs_urls(html)
    print(imgs_urls)
    for imgs_url in imgs_urls:
        try:
            img_urls = get_img_url(imgs_url)
        except requests.RequestException as exc:
            print(f'skipping gallery {imgs_url}: {exc}')
            continue
        for img_url in img_urls:
            if not img_url:
                # A page without a matching <img> yields an empty list.
                continue
            try:
                download(img_url[0])
            except requests.RequestException as exc:
                print(f'skipping image {img_url[0]}: {exc}')
# Start the crawl only when executed as a script, so that importing this
# module does not trigger network requests and file writes.
if __name__ == '__main__':
    run()
# [Python study notes] Scraping pictures of pretty girls with regex and XPath
# First published 2023-02-24 00:28:20