【python学习笔记】使用正则、xpath爬取好看的妹子

import os
import re

import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50'
}

domain = 'https://www.umei.cc/meinvtupian/siwameinv/'

# Fetch a page and return its body as UTF-8 text.
def get_html(url):
    """Download *url* with the shared request headers and return the decoded body.

    The site serves UTF-8; the encoding is forced so lxml/regex see clean text.
    """
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    return resp.text


def get_imgs_urls(html):
    """Parse the listing page HTML and return absolute URLs of every photo album.

    Each album link is relative (e.g. ``/meinvtupian/...``), so the site root
    is prepended before returning.
    """
    tree = etree.HTML(html)
    titles = tree.xpath('//div[@class="item masonry_brick"]/div[@class="item_b clearfix"]/div[@class="title"]')
    return ['https://www.umei.cc' + title.xpath('./span/a/@href')[0] for title in titles]


def get_img_url(imgs_url):
    """Return one list of image ``src`` attributes per page of an album.

    imgs_url: URL of the album's first page (``.../NNNN.htm``). The remaining
    pages follow the pattern ``.../NNNN_K.htm`` for K = 2..last_page, which is
    read off the album's pagination bar.

    Raises IndexError if the page has no pagination links (layout change).
    """
    html = get_html(imgs_url)
    tree = etree.HTML(html)
    page_links = tree.xpath('//div[@class="pages"]/ul/li/a/@href')
    # The last pagination link names the highest page, e.g. "1234_7.htm".
    # \d+ and an escaped dot replace the original fragile ".*?).htm" pattern;
    # "last_page" avoids shadowing the builtin max().
    match = re.search(r'_(?P<max>\d+)\.htm', page_links[-1])
    last_page = int(match.group('max'))
    # Page URLs from last_page down to 2, then the first page itself — same
    # order the original countdown loop produced.
    urls = [imgs_url.replace('.htm', f'_{n}.htm') for n in range(last_page, 1, -1)]
    urls.append(imgs_url)
    img_urls = []
    for url in urls:
        res = requests.get(url=url, headers=headers)
        page_tree = etree.HTML(res.text)
        img_urls.append(page_tree.xpath('//div[@class="big-pic"]/a/img/@src'))
    return img_urls


def download(img_url):
    """Download one image into the local ``imgs/`` directory.

    img_url: direct image URL. The file name is the URL's last path component.
    ``http://`` links are upgraded to ``https://`` before fetching.
    """
    # Index 4 is 's' only for an https URL; rewrite the scheme otherwise.
    if img_url[4] != 's':
        img_url = 'https' + img_url[4:]
    # Original crashed with FileNotFoundError when imgs/ did not exist.
    os.makedirs('imgs', exist_ok=True)
    # Send the shared headers like every other request here, and bound the
    # wait so one dead image URL cannot hang the whole crawl.
    resp = requests.get(url=img_url, headers=headers, timeout=30)
    name = img_url.split('/')[-1]
    with open(f'imgs/{name}', 'wb') as f:
        f.write(resp.content)


# 1.获取页面里面所有图集的地址
# 2.进入每个图集
# 3.获取当前图集所有的图片链接
# 4.将图片下载到对应的图集中

def run():
    """Crawl the listing page and download the first image of every album page."""
    listing_html = get_html(domain)
    album_urls = get_imgs_urls(listing_html)
    print(album_urls)
    for album_url in album_urls:
        for page_imgs in get_img_url(album_url):
            download(page_imgs[0])


# Guard the entry point so importing this module does not trigger a crawl.
if __name__ == '__main__':
    run()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值