# 效果 (demo / result screenshot in the original blog post)
# 代码, 码云地址 (code; Gitee repository link)
import datetime
import os
import random
from lxml import etree
# Base URL of one target gallery site (NOTE(review): never read again in the
# visible code; the crawl below actually targets src_url / pic.netbian.com).
url_src='https://www.nvshens.org'
# Pool of desktop-browser User-Agent strings; one is picked at random for the
# session headers below to reduce the chance of being blocked.
user_agent = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Timestamp taken at import time (presumably meant for timing the whole run;
# never read again in the visible code — TODO confirm).
start = datetime.datetime.now()
# Shared request headers: random UA plus a Referer matching the crawled site.
headers={
"User-Agent": random.choice(user_agent)
, "Referer": "http://pic.netbian.com/4kdongman/"
}
# !/usr/bin/env python
# encoding=utf-8
# NOTE(review): the imports below sit mid-file and partly duplicate the ones
# at the top; urllib and StringIO are never used in the visible code.
import urllib
# NOTE(review): there is no stdlib module named `image`; unless a local
# image.py exists next to this script, this import raises ModuleNotFoundError
# — verify (PIL's Image is imported separately below and is what the code uses).
import image
from io import StringIO
# encoding=utf-8
import requests
from io import BytesIO
from PIL import Image
# Entry URL for the crawl (4K anime wallpaper category of pic.netbian.com).
url = 'http://pic.netbian.com/4kdongman/'
# Commented-out prototype: fetch one page and read the image dimensions.
# response = requests.get(url)
# tmpIm = BytesIO(response.content)
# im = Image.open(tmpIm)
#
# # width and height are stored in im's `size` tuple
# w = im.size[0]
# h = im.size[1]
def get_html(url):
    """Fetch *url* and return its parsed lxml HTML tree, or None on failure.

    Uses the module-level ``headers`` (random User-Agent + Referer) and
    forces GBK decoding to match the target site's page encoding.

    Returns:
        lxml.etree._Element on HTTP 200, otherwise None (non-200 status or
        a network error).
    """
    print("正在请求的url:"+url)
    try:
        # timeout keeps the crawl from hanging forever on a dead connection
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return None
    response.encoding = 'gbk'  # match the page's declared encoding
    if response.status_code == 200:
        return etree.HTML(response.text)
    return None  # explicit: callers test the result against None
# Recognized image file extensions (NOTE: unused by the visible code).
suffix = ['jpg', 'png', 'gif']  # fixed typo: 'git' -> 'gif'
def get_page_name(html):
    """Return the page's <title> text, or None if it cannot be determined.

    Args:
        html: parsed lxml tree as returned by get_html(), or None.

    Returns:
        The first <title> text node, or None when *html* is None or the
        page has no <title> (the original indexed [0] unconditionally and
        raised IndexError on title-less pages).
    """
    if html is None:
        return None
    titles = html.xpath('//title/text()')
    return titles[0] if titles else None
# Root of the site actually crawled (pic.netbian.com — not url_src above);
# prepended to the relative hrefs/srcs found in pages.
src_url='http://pic.netbian.com'
# Global set of every in-site page link discovered so far (frontier + dedup).
globals_href=set()
def get_this_page_all_href(html):
    """Recursively harvest in-site category links from *html* into globals_href.

    Keeps only site-relative hrefs whose path mentions one of the crawled
    categories (youxi/dongman/tupian), caps the total collection at
    globals_max_href, then recurses into each newly found page.

    Args:
        html: parsed lxml tree, or None (the original crashed with
            AttributeError when get_html() returned None for a failed page).

    Side effects: mutates the module-level globals_href set and prints
    progress; issues HTTP requests via get_html() for the recursion.
    """
    if html is None:
        return
    need_href = set()
    for href in html.xpath('//child::*/@href'):
        # site-relative link belonging to one of the crawled categories
        if len(href) > 3 and href[0] == '/' and ('youxi' in href or 'dongman' in href or 'tupian' in href):
            full_url = src_url + href
            # cap the crawl at globals_max_href links; skip duplicates
            if len(globals_href) <= globals_max_href and full_url not in globals_href:
                need_href.add(full_url)
                globals_href.add(full_url)
    print(f'目前集群有{len(globals_href)}个有效超链接')
    print(f'本页面有{len(need_href)}个有效超链接')
    print(need_href)
    # depth-first recursion into every page discovered on this one
    for link in need_href:
        get_this_page_all_href(get_html(link))
# Global set of every image URL downloaded so far (dedup across pages).
globals_img_urls=set()
def find_this_page_imgs_and_save_img(url):
    """Crawl one page: harvest its links and download every new .jpg on it.

    Args:
        url: absolute URL of the page to process.

    Side effects: mutates globals_href (via get_this_page_all_href) and
    globals_img_urls, writes image files via save_img(), prints progress,
    and issues HTTP requests.
    """
    html = get_html(url)
    if html is None:
        return
    page_name = get_page_name(html)  # page <title>; used as the save folder
    get_this_page_all_href(html)     # feed the outer crawl loop
    # Pair src/alt of elements carrying both attributes (i.e. <img> tags).
    src_list = html.xpath('//child::*[@src and @alt]/@src')
    alt_list = html.xpath('//child::*[@src and @alt]/@alt')
    img_info = dict(zip(src_list, alt_list))
    print('-'*30)
    print(img_info)
    print('页面标题:'+page_name)
    print('-' * 30)
    valid_img_urls = set()
    for src in img_info:
        # BUG FIX: the original tested membership against globals_href (the
        # set of page links), so previously downloaded images were never
        # filtered out; deduplicate against globals_img_urls instead.
        if len(src) > 3 and src.endswith('.jpg') and src_url + src not in globals_img_urls:
            img_url = src_url + src
            valid_img_urls.add(img_url)
            globals_img_urls.add(img_url)
            save_img(img_url, page_name, img_info[src])
    print(f'本页有{len(valid_img_urls)}个有效图片')
    print(f'目前集群有{len(globals_img_urls)}个有效图片')
# Base directory for saved images. NOTE(review): the name shadows the builtin
# dir(); kept as-is because save_img() below reads this global by this name.
dir=r'X:\Users\SwordArtOnline\Desktop\爬虫\dongman\\'
def save_img(img_url, page_name, img_name):
    """Download *img_url* and save it under dir/page_name/.

    The file name is the image's alt text plus its pixel dimensions
    (read via PIL before writing).

    Args:
        img_url: absolute URL of the image.
        page_name: page title, used as the sub-directory name.
        img_name: the image's alt text, used as the base file name.

    Side effects: creates the target directory if missing, writes the image
    file, prints progress.
    """
    response = requests.get(img_url, headers=headers)
    if response.status_code != 200:
        # robustness: the original wrote whatever came back (error pages
        # included) and crashed PIL on non-image bytes
        return
    img_content = response.content
    # Parse the bytes with PIL only to read the pixel dimensions for the name.
    img = Image.open(BytesIO(img_content))
    w, h = img.size
    # `dir` is the module-level base path (shadows the builtin of that name).
    save_path = dir + page_name + '\\'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print('成功创建目录:'+save_path)
    # NOTE(review): page_name/img_name may contain characters invalid in
    # Windows file names — TODO sanitize before joining.
    file_path = save_path + img_name + f'_本图片尺寸_{w}x{h}.jpg'
    with open(file_path, 'wb') as f:
        f.write(img_content)
    print(f'成功保存第{len(globals_img_urls)}图片:'+file_path)
# Crawl cap: stop collecting new links once globals_href exceeds this size.
globals_max_href = 100

if __name__ == '__main__':
    # Seed the crawl with the entry page; this populates globals_href.
    find_this_page_imgs_and_save_img(url)
    # BUG FIX: iterate over a snapshot — find_this_page_imgs_and_save_img()
    # adds to globals_href while running (via get_this_page_all_href), and
    # mutating a set during iteration raises
    # "RuntimeError: Set changed size during iteration".
    for link in list(globals_href):
        find_this_page_imgs_and_save_img(link)
    print(f'集群总共{len(globals_href)}有效超链接')