#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
pip install beautifulsoup4
pip install lxml
pip install requests
pip install threadpool
"""
import requests
from urllib import request
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
from Download import Download
from contextlib import suppress
import os
import time
import sys
import threadpool
from concurrent.futures import ThreadPoolExecutor
import ssl
from unittest.mock import Mock
from requests.models import Response
# Disable SSL certificate verification globally for urllib-based fetches.
# NOTE(review): this turns off certificate checks process-wide — tolerable
# for a personal scraper, unsafe in any other context.
ssl._create_default_https_context = ssl._create_unverified_context
from requests.adapters import HTTPAdapter
# Shared session with up to 3 transport-level retries per mounted scheme.
# NOTE(review): downloadContent() below calls requests.get directly, so this
# session (and its retry policy) is currently unused — confirm intent.
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))
# Crawl targets
domain = 'https://www.baidu.com/'
main_url = domain+'thread.php?fid-24.html'
gif_url = domain+'thread.php?fid-29-page-1.html'
cur_path = os.getcwd() + '/images'
gif_path = os.getcwd() + '/gifs'
# Default HTTP request headers sent with every fetch (see downloadContent).
# NOTE(review): "Content-Type: charset=utf-8" is malformed (no media type)
# and Content-Type is unusual on a GET — confirm the server requires it.
header = {
    "Accept":"image/webp,image/apng,image/*,*/*;q=0.8",
    "Content-Type":"charset=utf-8",
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
    "Connection":"keep-alive",
}
def update_header(referer):
    """Set the Referer header used by subsequent requests to *referer*."""
    header['Referer'] = f'{referer}'
def downloadContent(url):
    """GET *url* with the module-level headers, retrying up to 3 times.

    Returns the live ``requests.Response`` on success.  If every attempt
    raises, returns a stub 400 Response so callers never receive None.
    """
    # Fallback stub handed back after persistent failure.
    fallback = Response()
    fallback.code = "expired"          # NOTE(review): not real Response attrs;
    fallback.error_type = "expired"    # kept in case a caller inspects them.
    fallback.status_code = 400
    fallback.encoding = 'UTF-8'
    fallback._content = b'{ "key" : "a" }'
    for attempt in range(3):
        try:
            # stream=True defers body download until .content/.text is read.
            return requests.get(url, headers=header, timeout=5, stream=True)
        except requests.exceptions.RequestException:
            # SSLError, ReadTimeout and ConnectionError are all subclasses of
            # RequestException, so one handler replaces the original tuple.
            time.sleep(2)
    print(url, ' times:', 3)
    return fallback
def replace(tag, content):
    """Swap *tag* out of its soup tree for *content* (thin bs4 wrapper)."""
    tag.replace_with(content)
def mkdir(dir):
    """Create *dir* and any missing parents; no-op if it already exists.

    The original exists()-then-makedirs pair was a TOCTOU race and is
    redundant: ``exist_ok=True`` already makes creation idempotent.
    """
    os.makedirs(dir, exist_ok=True)
def getDirname(pathN):
    """Return the directory portion of *pathN* (everything before the last '/')."""
    parent = os.path.dirname(pathN)
    return parent
def getName(pathN):
    """Return the final component (file name) of *pathN*.

    Uses ``os.path.basename``: the original slice
    ``pathN[len(getDirname(pathN))+1:]`` dropped the first character
    whenever the path contained no separator (dirname == '').
    """
    return os.path.basename(pathN)
def saveContent(content, to_file):
    """Write *content* bytes to *to_file*, creating parent directories first."""
    parent = getDirname(to_file)
    mkdir(parent)
    with open(to_file, 'wb') as out:
        out.write(content)
def download(url: str, save_file: str):
    """Fetch *url* and persist the response body to *save_file*."""
    body = downloadContent(url).content
    saveContent(body, save_file)
def downloadInThreadPool(tdpool, url, save_file):
    """Queue one download(url, save_file) task on the threadpool *tdpool*."""
    task_args = [([url, save_file], None)]
    # makeRequests builds WorkRequest objects; each must be submitted.
    for work_request in threadpool.makeRequests(download, task_args):
        tdpool.putRequest(work_request)
# Fetch a page and parse it into a BeautifulSoup tree.
def getUrlAsSoup(url, aHeader):
    """Download *url* and return it parsed with the lxml parser.

    *aHeader* is accepted for interface compatibility, but the request is
    issued by downloadContent(), which uses the module-level ``header``.
    """
    resp = downloadContent(url)
    # requests already decodes .text using the detected charset.  The
    # original .encode(page_charset).decode('utf-8') round-trip raised
    # UnicodeDecodeError on any non-UTF-8 page (e.g. GBK), so use .text.
    return BeautifulSoup(resp.text, 'lxml')
def getPage(sp):
    """Print the href of every pagination link inside <div class="pages">."""
    anchors = sp.find('div', class_='pages').find_all('a')
    for anchor in anchors:
        print(anchor.attrs['href'])
def getSubPage(sp):
    """Collect {'name', 'url'} dicts for every <a class="subject"> link."""
    return [
        {"name": anchor.string, "url": anchor.attrs['href']}
        for anchor in sp.find_all('a', class_='subject')
    ]
def getImages(sp):
    """Return the src of each attachment image (<ignore_js_op class="att_img">)."""
    return [
        wrapper.find('img').attrs['src']
        for wrapper in sp.find_all('ignore_js_op', class_='att_img')
    ]
# Entry point: walk listing pages 13..1615 of board fid-29, collect the
# image URLs from every sub-thread, and download them via a worker pool.
if __name__ == "__main__" :
    pool = threadpool.ThreadPool(16)  # create the worker thread pool
    # thread_pool_size = 32 # thread-pool size
    # executor = ThreadPoolExecutor(max_workers=thread_pool_size)
    # futures = []
    all_images=[]
    # NOTE(review): page range 13..1615 is hard-coded for the target board.
    for i in range(13,1616):
        page_url=domain+'thread.php?fid-29-page-'+str(i)+'.html'
        print(page_url)
        page = getUrlAsSoup(page_url, header)
        getPage(page)  # debug output: prints the pagination links
        sub_pages=getSubPage(page)
        for sub_url in sub_pages:
            img_url_sp=getUrlAsSoup(domain+sub_url['url'], header)
            tmp={"name":sub_url['name'],
                "image_urls":getImages(img_url_sp)}
            all_images.append(tmp)
            # print(tmp)
            if tmp["image_urls"]!=[]:
                print(i,sub_url['name'])
                for image_url in tmp["image_urls"]:
                    # save as gifs/<page>_<thread title>/<image file name>
                    save_name=gif_path+'/'+str(i)+'_'+ tmp["name"]+'/'+getName(image_url)
                    # print(image_url)
                    # print(save_name)
                    downloadInThreadPool(pool,image_url,save_name)
        # block until this listing page's queued downloads finish
        pool.wait()
        print('finish..',i)
    print('下载完成')