Reference: "Python3网络爬虫快速入门实战解析", Jack-Cui's CSDN blog (a Python crawler tutorial with worked examples)
# -*- coding:UTF-8 -*-
# Download images from the web to local disk
import os
import requests
import json
import re
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
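
# Overall flow: fetch each chapter page of the comic, parse the real image URL
# out of the lazy-loaded <img data-original="..."> attribute, then download the
# image bytes from the image host with a second request.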
if __name__ == '__main__':
    params = {}
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Cookie": "__guid=45238818.1145742547842433700.1540392751906.5432; UM_distinctid=166a69146f6ed-0ae6a89a710775-3c604504-100200-166a69146fa61; CNZZDATA1262204355=1316053633-1540392329-%7C1540649295; Hm_lvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540392831,1540401440,1540653238; Hm_lpvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540653755; monitor_count=15",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Referer": "http://www.kanmanhua.me/manhua-67231/277554_123.html",
        "Host": "www.kanmanhua.me"
    }
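    # Note: the Cookie and Referer above were copied from a live browser session
    # on www.kanmanhua.me; they are session-specific and will need refreshing.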
    headers2 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Cookie": "__guid=45238818.1145742547842433700.1540392751906.5432; UM_distinctid=166a69146f6ed-0ae6a89a710775-3c604504-100200-166a69146fa61; CNZZDATA1262204355=1316053633-1540392329-%7C1540649295; Hm_lvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540392831,1540401440,1540653238; Hm_lpvt_a476a5ad5ad147e6b24a01c84b92ebc5=1540653755; monitor_count=15",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Host": "manhua-me.oss-cn-hongkong.aliyuncs.com"
    }
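    # headers2 is for the image host itself (an Aliyun OSS bucket), so it sends
    # that Host value and no Referer.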
    data = {}
    cookies = {}
    proxies = {}  # unused placeholders kept from experimentation
    # Read JSON data
    # result3 = json.loads(r.text)
    # result4 = json.dumps(result3, indent=4, ensure_ascii=False)
    # Read a local file
    # with open('E:/program/Python/learn/carton/sample.html', 'r', encoding='utf-8') as f:
    #     content = f.read()
    # bf = BeautifulSoup(content, 'html.parser')
    # texts = bf.find_all('img', class_='img-responsive')
    # print(texts[1].get('data-original'))
    # Regex extraction
    # for item in result3['data']:
    #     if len(item):
    #         print(item['thumbURL'])
    #         fileName = re.search(r'[0-9]*[^,]*jpg', item['thumbURL']).group()
    #         imgr = requests.get(item['thumbURL'], headers=imgheaders)
    #         print(imgr.status_code)
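    # Main download loop starts here.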
    s = requests.Session()
    # Retry transient connection failures up to 3 times on both schemes
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    count = 268
    url = "http://www.kanmanhua.me/manhua-67231/277554_"
    catalog = "ZERO"
    os.makedirs("F:/img2/" + catalog, exist_ok=True)  # make sure the output directory exists
    for i in range(100, count):
        print(i)
        response = s.get(url + str(i + 1) + '.html', params=params, headers=headers, timeout=3)
        # Name the parser explicitly to avoid the "no parser specified" warning
        bf = BeautifulSoup(response.text, 'html.parser')
        imgEles = bf.find_all('img', class_='img-responsive')
        # The real image URL lives in the lazy-load data-original attribute of the second match
        needHref = imgEles[1].get('data-original')
        response2 = s.get(needHref, params=params, headers=headers2, timeout=3, stream=True)
with open("F:/img2/"+catalog+"/img"+str(i+1)+".jpg", 'ab+') as f:
for chunk in response2.iter_content(chunk_size = 1024):
if chunk:
f.write(chunk)
f.flush()
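
# A minimal refactor sketch (not from the original post): the per-image download
# wrapped in a reusable helper. The name download_image, the raise_for_status()
# check, and the 4096-byte chunk size are this sketch's own assumptions.
def download_image(session, img_url, dest_path, img_headers):
    """Stream one image to disk; raises on HTTP errors."""
    resp = session.get(img_url, headers=img_headers, timeout=3, stream=True)
    resp.raise_for_status()  # abort on 4xx/5xx instead of saving an error page
    with open(dest_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=4096):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)

# Hypothetical usage inside the loop above:
# download_image(s, needHref, "F:/img2/" + catalog + "/img" + str(i + 1) + ".jpg", headers2)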