# Python crawler: a multi-threaded, multi-proxy scraping tool.
# Scrapes a free-proxy listing, then fetches pages through each proxy.
import requests
import re
import random
import threading
class Spider:
    """Fetch web pages, optionally through HTTP proxies.

    The default URL points at a free-proxy listing page so that
    ``load_page()`` called with no arguments retrieves proxy candidates
    for ``getProxiesIP()``.
    """

    def __init__(self):
        # Default target: a free-proxy listing page.
        self.url = "https://www.kuaidaili.com/free/inha/1"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
            "Content-Type": "text/html",
        }
        self.response = ""

    def load_page(self, *urls):
        """Fetch each URL in *urls* (or ``self.url`` when none are given).

        Returns a list of ``requests.Response`` objects.
        Raises ValueError as soon as any response is not HTTP 200.

        The original implementation duplicated the fetch/check logic in
        two branches; both paths are unified here.
        """
        targets = urls if urls else (self.url,)
        responses = []
        for target in targets:
            print(target + "\n")
            # timeout keeps a dead host from hanging the caller forever
            r = requests.get(target, headers=self.headers, timeout=10)
            if r.status_code != 200:
                raise ValueError("status_code is:", r.status_code)
            print("OK")
            responses.append(r)
        return responses

    def load_page_byProxies(self, Proxies, urls):
        """Fetch each URL in *urls* through the given proxies mapping.

        Proxies: dict suitable for ``requests.get(proxies=...)``,
                 e.g. ``{"http": "http://1.2.3.4:8080"}``.
        Raises ValueError when either argument is empty/falsy.
        Prints each URL and its response status code (no return value).
        """
        if not Proxies:
            raise ValueError("Proxies is Not")
        if not urls:
            raise ValueError("Urls is Not")
        for url in urls:
            # timeout matters here: free proxies are frequently dead
            res = requests.get(url, proxies=Proxies, timeout=10)
            print(url)
            print(res.status_code)
def reText(strs, text):
    """Return every non-overlapping match of pattern *strs* in *text*.

    *text* is coerced with ``str()`` first, so ``bytes`` input is matched
    against its repr form (e.g. ``b'...'`` with literal ``\\n`` sequences) —
    callers' patterns rely on that.
    """
    matcher = re.compile(strs)
    return matcher.findall(str(text))
def getProxiesIP():
    """Scrape the free-proxy listing page and return proxy URLs.

    Returns a list like ``["http://1.2.3.4:8080", ...]``.
    Performs network I/O via ``Spider.load_page()``.

    Note: the page bytes are stringified (repr form), so HTML row
    separators appear as a literal backslash-n in the text being matched —
    hence the ``\\n`` in the pattern.
    """
    spider = Spider()
    responses = spider.load_page()
    # ip -> port; dict keeps the last port seen for a duplicated IP,
    # matching the original behavior.
    address = {}
    # Capture groups pull IP and port out in one pass. The original code
    # re-scanned each row with a second regex and only accepted ports of
    # exactly 4 digits, silently dropping 2-, 3-, and 5-digit ports.
    row_pattern = (
        r"<td [- a-zA-Z=\"]*>([0-9]{1,3}(?:\.[0-9]{1,3}){3})</td>"
        r"\\n\s*"
        r"<td [- a-zA-Z=\"]*>([0-9]{1,5})</td>"
    )
    for response in responses:
        for ip, port in re.findall(row_pattern, str(response.content)):
            address[ip] = port
    print(address)
    return ["http://" + ip + ":" + port for ip, port in address.items()]
def main():
    """Scrape free proxies, then fetch a test page through each one
    in its own thread, waiting for all of them to finish."""
    spider = Spider()
    proxy_urls = getProxiesIP()
    threads = []
    for proxy_url in proxy_urls:
        # Route both http and https traffic through the proxy. The original
        # mapped only "http", so the https:// test URL below bypassed the
        # proxy entirely and every request went out directly.
        proxies = {"http": proxy_url, "https": proxy_url}
        t = threading.Thread(
            target=spider.load_page_byProxies,
            args=(proxies, ["https://hao.360.com/?h_lnk"]),
        )
        t.start()
        threads.append(t)
    # Join so the interpreter doesn't exit while workers are mid-fetch.
    for t in threads:
        t.join()


if __name__ == "__main__":
    main()