# python -- 爬取图片 (image crawler; blog-post title kept as a comment)

# 爬取美女高清图片
import os
import re
import time
import urllib
import urllib.request

import requests
from lxml import etree
# https://www.tupianzj.com/meinv/xinggan/list_176_2.html
# https://www.tupianzj.com/meinv/xinggan/list_176_3.html
# https://www.tupianzj.com/meinv/xinggan/list_176_4.html
# Browser-like User-Agent so the site serves normal pages instead of
# rejecting the scraper's requests.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    ),
}
# url0 = "https://www.tupianzj.com/meinv/xinggan/list_176_1.html"
# data0 = requests.get(url0, headers=header)
# data0.encoding = 'utf-8'
# html0 = etree.HTML(data0.text)
# page0 = html0.xpath("//div[@class='pages']/ul/li/span[@class='pageinfo']/text()")
#
# print(page0)
# Crawl listing pages 1..331, extract each gallery's link, walk every page of
# each gallery, and save the big image of each page to SAVE_DIR.
# Fixes over the original:
#   - create the output directory up front (urlretrieve used to crash if it
#     didn't exist);
#   - download images with requests + the same `header` (urllib.request
#     .urlretrieve sent no User-Agent, which this site can reject with 403);
#   - guard every xpath/regex result before indexing [0] (empty results used
#     to raise IndexError and kill the whole crawl);
#   - add timeouts and per-gallery error handling so one bad page does not
#     abort the remaining pages.
SAVE_DIR = "E:/spider/picture"
os.makedirs(SAVE_DIR, exist_ok=True)

for p in range(1, 332):
    url = "https://www.tupianzj.com/meinv/xinggan/list_176_" + str(p) + ".html"
    data = requests.get(url, headers=header, timeout=10)
    data.encoding = 'utf-8'
    html = etree.HTML(data.text)
    print(url)
    # Gallery links on the listing page, e.g. /meinv/20210319/226412.html
    license1 = html.xpath("//ul[@class='list_con_box_ul']/li/a/@href")
    print(license1)
    print("第"+str(p)+"页的图片网址已提取完毕")
    time.sleep(3)  # be polite to the server between listing pages

    for i, href in enumerate(license1, start=1):
        url_pic = "https://www.tupianzj.com" + str(href)
        try:
            data = requests.get(url_pic, headers=header, timeout=10)
            data.encoding = 'utf-8'
            html = etree.HTML(data.text)

            # First pager entry reads like "共5页" — pull the page count out.
            page = html.xpath("//div[@class='pages']/ul/li[1]/a/text()")
            if not page:
                continue  # pager missing: skip this gallery
            digits = re.findall(r"\d+", page[0])
            if not digits:
                continue  # pager text has no number: skip
            page_count = int(digits[0])

            # ".../226412.html" -> ".../226412"; page j>1 is ".../226412_j.html"
            url_base = url_pic.rsplit('.', 1)[0]

            for j in range(1, page_count + 1):
                if j == 1:
                    url_page = url_base + '.html'
                else:
                    url_page = url_base + '_' + str(j) + '.html'
                data = requests.get(url_page, headers=header, timeout=10)
                data.encoding = 'utf-8'
                html = etree.HTML(data.text)
                pages = html.xpath("//div[@id='bigpic']/a[2]/img/@src")
                time.sleep(1)  # throttle image-page fetches
                if not pages:
                    continue  # no big image on this page
                # NOTE(review): src is used as-is, like the original did —
                # presumably absolute; confirm it never comes back relative.
                img = requests.get(pages[0], headers=header, timeout=10)
                out_path = os.path.join(
                    SAVE_DIR, str(p) + '.' + str(i) + '.' + str(j) + '.jpg')
                with open(out_path, 'wb') as f:
                    f.write(img.content)
        except requests.RequestException as exc:
            # One failed gallery should not abort the remaining 331 pages.
            print("skip", url_pic, exc)
            continue

# --- CSDN page footer / payment-widget text below: paste artifact from the
# --- blog page this script was copied from; commented out so the file parses.
# 评论
# 添加红包
#
# 请填写红包祝福语或标题
#
# 红包个数最小为10个
#
# 红包金额最低5元
#
# 当前余额3.43前往充值 >
# 需支付:10.00
# 成就一亿技术人!
# 领取后你会自动成为博主和红包主的粉丝 规则
# hope_wisdom
# 发出的红包
# 实付
# 使用余额支付
# 点击重新获取
# 扫码支付
# 钱包余额 0
#
# 抵扣说明:
#
# 1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
# 2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。
#
# 余额充值