"""爬取图片,网站 netbian。模块:requests、etree(xpath)、os、ThreadPoolExecutor

思路:
1、遍历 http://www.netbian.com/meinv/index_{}.htm 的所有页面
2、寻找页面规律,抓取单个图片的 href
3、多线程抓取图片下载
"""
import requests
from lxml import etree
from random import choice
from time import sleep
import re
import os
from concurrent.futures import ThreadPoolExecutor
# Pool of desktop-browser User-Agent strings; each request picks one at
# random (see url_get / photo_download) to look less like a single bot.
user_agents = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Bookkeeping lists shared across the worker threads.
href_list = []                    # relative detail-page links from listing pages
title_list = []                   # sanitized titles (parallel to href_list)
file_name_downloaded_list = []    # files written during this run
file_name_eixted_list = []        # files skipped because they already existed

# Ensure the download directory exists. os.makedirs creates intermediate
# directories too — plain os.mkdir raised FileNotFoundError when 'E:/1'
# itself was missing — and exist_ok removes the need for an exists() check.
os.makedirs('E:/1/netbian/', exist_ok=True)
def url_get(i):
    """Fetch listing page *i* and collect wallpaper titles and detail hrefs.

    Appends results to the module-level ``title_list`` / ``href_list``
    (shared with the downloader threads) and returns both lists.

    :param i: 1-based listing-page number; page 1 has no ``index_`` suffix.
    """
    headers = {'User-Agent': choice(user_agents)}
    # Page 1 lives at the bare category URL; pages 2+ use index_{i}.htm.
    if i == 1:
        url = "http://www.netbian.com/meinv/"
    else:
        url = "http://www.netbian.com/meinv/index_{}.htm".format(i)
    print(url)
    # timeout keeps a stalled request from hanging its worker thread forever.
    response = requests.get(url, headers=headers, timeout=10)
    # The site serves GBK, not UTF-8.
    html = etree.HTML(response.content.decode("gbk"))
    href_list.extend(html.xpath('//*[@id="main"]/div[3]/ul/li/a/@href'))
    for title in html.xpath('//*[@id="main"]/div[3]/ul/li/a/@title'):
        # Collapse "<whitespace>...<colon>" runs in the title to a single
        # underscore so it is usable as a file name.
        title_list.append(re.sub(r"\s.*:", "_", title))
    return title_list, href_list
def photo_download(pair):
    """Download one wallpaper given ``pair = (title, href)``.

    Skips files already present under E:/1/netbian/ and records the outcome
    in the module-level downloaded/existed lists.
    (Parameter renamed from ``list`` — it shadowed the builtin; the only
    caller passes it positionally via ``executor.map``.)
    """
    sleep(0.5)  # throttle so 36 workers do not hammer the site at once
    headers = {'User-Agent': choice(user_agents)}
    taotu_name = pair[0]
    file_name = taotu_name + ".jpg"
    path = r'E:/1/netbian/{}'.format(file_name)
    if os.path.exists(path):
        print("已存在" + file_name)
        file_name_eixted_list.append(file_name)
        return
    # NOTE(review): ".thm" looks like a typo for ".htm" — as written the
    # replace never matches, so the plain detail page is fetched (it still
    # carries the image tag). Left unchanged to preserve behavior; confirm
    # whether the -1920x1080 variant was intended.
    detail_url = "http://www.netbian.com/" + pair[1].replace(".thm", "-1920x1080.htm")
    response_2 = requests.get(detail_url, headers=headers, timeout=10)
    data_raw = etree.HTML(response_2.content.decode("gbk"))
    srcs = data_raw.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src')
    if not srcs:
        # Guard against layout changes instead of crashing with IndexError.
        print("未找到图片地址:" + file_name)
        return
    src = srcs[0]
    print(src)
    img = requests.get(src, headers=headers, timeout=10).content
    with open(path, 'wb') as f:
        f.write(img)
    print(f'成功下载图片:{file_name}')
    file_name_downloaded_list.append(file_name)
# url_get()
# taotu_url(urls)
def main():
    """Crawl every listing page, then download all collected wallpapers."""
    # Phase 1: gather (title, href) pairs from the ~119 listing pages.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(url_get, i) for i in range(1, 120)]
        for future in futures:
            try:
                # .result() re-raises worker exceptions; the original
                # discarded the futures, silently swallowing every failure.
                future.result()
            except Exception as exc:
                print("页面抓取失败:", exc)
    # Phase 2: download each pair concurrently; the with-block waits for
    # the pool (and therefore the lazy map) to finish.
    total_list = zip(title_list, href_list)
    with ThreadPoolExecutor(max_workers=36) as exector_photo:
        exector_photo.map(photo_download, total_list)
    print('=================== 图片全部下载成功啦! =====================')
    print("下载了:", len(file_name_downloaded_list))
    print(file_name_downloaded_list)
    print("原先有重复:", len(file_name_eixted_list))


if __name__ == '__main__':
    main()