Scraping wallpaper images from netbian (the /meinv/ category). Modules: requests, lxml.etree (XPath), os, ThreadPoolExecutor

Approach:
1. Walk through every list page at http://www.netbian.com/meinv/index_{}.htm (the URL rule is sketched right after this list)
2. Work out the list-page structure and grab each image's detail-page href and title
3. Download the images concurrently with a thread pool
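
Before the full script, a short sketch of the two URL rules these steps rely on; the example href "/desk/23158.htm" is hypothetical and only illustrates the list-page markup pattern:

# Rule for step 1: page 1 has no index suffix, later list pages do
def page_url(i):
    if i == 1:
        return "http://www.netbian.com/meinv/"
    return "http://www.netbian.com/meinv/index_{}.htm".format(i)

# Rule for step 2: each <li><a> on a list page carries a detail-page href
# (hypothetical example below) plus a title used later as the file name
href = "/desk/23158.htm"
detail_page = "http://www.netbian.com/" + href

print(page_url(2))  # -> http://www.netbian.com/meinv/index_2.htm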

import requests
from lxml import etree
from random import choice
from time import sleep
import re
import os
from concurrent.futures import ThreadPoolExecutor


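# A pool of desktop browser User-Agent strings; one is chosen at random per request
# so the traffic looks less like a single automated client.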
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
# Shared result lists, filled by the worker threads
# (list.append is atomic under CPython's GIL, so no explicit lock is used)
href_list = []
title_list = []
file_name_downloaded_list = []
file_name_existed_list = []

# Create the download directory if it does not exist yet
os.makedirs('E:/1/netbian/', exist_ok=True)
def url_get(i):
    """Fetch list page i and collect detail-page hrefs and titles into the shared lists."""
    headers = {'User-Agent': choice(user_agents)}
    # The first list page has no index suffix; later pages use index_{i}.htm
    if i == 1:
        url = "http://www.netbian.com/meinv/"
    else:
        url = "http://www.netbian.com/meinv/index_{}.htm".format(i)
    print(url)
    response = requests.get(url, headers=headers)
    # The site is GBK-encoded, so decode explicitly before parsing
    html = etree.HTML(response.content.decode("gbk"))
    hrefs_raw = html.xpath('//*[@id="main"]/div[3]/ul/li/a/@href')
    titles = html.xpath('//*[@id="main"]/div[3]/ul/li/a/@title')

    for href in hrefs_raw:
        href_list.append(href)
    for title in titles:
        # Normalize the title so it is safe to use as a file name
        title_final = re.sub(r"\s.*:", "_", title)
        title_list.append(title_final)
    return title_list, href_list

def photo_download(item):
    """Download one wallpaper from a (title, href) pair, skipping files that already exist."""
    sleep(0.5)  # brief pause so the thread pool does not hammer the server
    headers = {'User-Agent': choice(user_agents)}
    title, href = item
    file_name = title + ".jpg"
    if not os.path.exists(r'E:/1/netbian/{}'.format(file_name)):
        # The ".thm" pattern does not occur in the hrefs (they end in ".htm"), so this
        # request effectively fetches the standard detail page; its <img> src is downloaded.
        response_2 = requests.get("http://www.netbian.com/" + href.replace(".thm", "-1920x1080.htm"),
                                  headers=headers)
        data_raw = etree.HTML(response_2.content.decode("gbk"))
        src = data_raw.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src')[0]
        print(src)
        img = requests.get(src, headers=headers).content
        with open(r'E:/1/netbian/{}'.format(file_name), 'wb') as f:
            f.write(img)
            print(f'Downloaded: {file_name}')
            file_name_downloaded_list.append(file_name)
    else:
        print("Already exists: " + file_name)
        file_name_existed_list.append(file_name)

def main():
    # Stage 1: crawl list pages 1-119 and collect (title, href) pairs
    with ThreadPoolExecutor(max_workers=5) as executor:
        for i in range(1, 120):
            executor.submit(url_get, i)
    total_list = zip(title_list, href_list)
    # Stage 2: download the images with a larger thread pool
    with ThreadPoolExecutor(max_workers=36) as executor_photo:
        executor_photo.map(photo_download, total_list)
    print('=================== All images downloaded! =====================')
    print("Downloaded:", len(file_name_downloaded_list))
    print(file_name_downloaded_list)
    print("Already existed, skipped:", len(file_name_existed_list))


if __name__ == '__main__':
    main()
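
One caveat: photo_download has no error handling, so a timeout or an XPath miss raises inside a worker thread and that image is silently skipped (executor.map only re-raises exceptions when its results are consumed, which this script never does). A minimal sketch of a wrapper that could be mapped instead; safe_download is a hypothetical name, not part of the original script:

def safe_download(item):
    # Hypothetical wrapper: catch per-image failures so a bad page or a
    # network error is reported instead of vanishing silently.
    try:
        photo_download(item)
    except Exception as exc:
        print("Failed:", item[0], exc)

In main(), map safe_download instead of photo_download: executor_photo.map(safe_download, total_list).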