Reference: "Python3网络爬虫快速入门实战解析", Jack-Cui's CSDN blog (a Python crawler tutorial with worked examples)
# -*- coding:UTF-8 -*-
# Download images from the web to local disk
import os
import requests
import json
import re
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
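
# Overall flow: fetch each chapter page of the comic, parse the real image URL
# out of the lazy-loaded <img data-original="..."> attribute, then download the
# image bytes from the image host with a second request.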
if __name__ == '__main__':
    params = {}
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Cookie": "__guid=45238818.1145742547842433700.1540392751906.5432; UM_distinctid=166a69146f6ed-0ae6a89a710775-3c604504-100200-166a69146fa61; CNZZDATA1262204355=1316053633-1540392329-%7C1540649295; Hm_lvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540392831,1540401440,1540653238; Hm_lpvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540653755; monitor_count=15",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Referer": "http://www.kanmanhua.me/manhua-67231/277554_123.html",
        "Host": "www.kanmanhua.me"
    }
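    # Note: the Cookie and Referer above were copied from a live browser session
    # on www.kanmanhua.me; they are session-specific and will need refreshing.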
    headers2 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Cookie": "__guid=45238818.1145742547842433700.1540392751906.5432; UM_distinctid=166a69146f6ed-0ae6a89a710775-3c604504-100200-166a69146fa61; CNZZDATA1262204355=1316053633-1540392329-%7C1540649295; Hm_lvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540392831,1540401440,1540653238; Hm_lpvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540653755; monitor_count=15",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Host": "manhua-me.oss-cn-hongkong.aliyuncs.com"
    }
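    # headers2 is for the image host itself (an Aliyun OSS bucket), so it sends
    # that Host value and no Referer.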
    data = {}
    cookies = {}
    proxies = {}  # unused placeholders kept from experimentation
    # Read JSON data
    # result3 = json.loads(r.text)
    # result4 = json.dumps(result3, indent=4, ensure_ascii=False)
    # Read a local file
    # with open('E:/program/Python/learn/carton/sample.html', 'r', encoding='utf-8') as f:
    #     content = f.read()
    # bf = BeautifulSoup(content, 'html.parser')
    # texts = bf.find_all('img', class_='img-responsive')
    # print(texts[1].get('data-original'))
    # Regex extraction
    # for item in result3['data']:
    #     if len(item):
    #         print(item['thumbURL'])
    #         fileName = re.search(r'[0-9]*[^,]*jpg', item['thumbURL']).group()
    #         imgr = requests.get(item['thumbURL'], headers=imgheaders)
    #         print(imgr.status_code)
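    # Main download loop starts here.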
    s = requests.Session()
    # Retry transient connection failures up to 3 times on both schemes
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    count = 268
    url = "http://www.kanmanhua.me/manhua-67231/277554_"
    catalog = "ZERO"
    os.makedirs("F:/img2/" + catalog, exist_ok=True)  # make sure the output directory exists
    for i in range(100, count):
        print(i)
        response = s.get(url + str(i + 1) + '.html', params=params, headers=headers, timeout=3)
        # Name the parser explicitly to avoid the "no parser specified" warning
        bf = BeautifulSoup(response.text, 'html.parser')
        imgEles = bf.find_all('img', class_='img-responsive')
        # The real image URL lives in the lazy-load data-original attribute of the second match
        needHref = imgEles[1].get('data-original')
        response2 = s.get(needHref, params=params, headers=headers2, timeout=3, stream=True)
with open("F:/img2/"+catalog+"/img"+str(i+1)+".jpg", 'ab+') as f:
for chunk in response2.iter_content(chunk_size = 1024):
if chunk:
f.write(chunk)
f.flush()
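
# A minimal refactor sketch (not from the original post): the per-image download
# wrapped in a reusable helper. The name download_image, the raise_for_status()
# check, and the 4096-byte chunk size are this sketch's own assumptions.
def download_image(session, img_url, dest_path, img_headers):
    """Stream one image to disk; raises on HTTP errors."""
    resp = session.get(img_url, headers=img_headers, timeout=3, stream=True)
    resp.raise_for_status()  # abort on 4xx/5xx instead of saving an error page
    with open(dest_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=4096):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)

# Hypothetical usage inside the loop above:
# download_image(s, needHref, "F:/img2/" + catalog + "/img" + str(i + 1) + ".jpg", headers2)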