# Python crawler: a multi-threaded, multi-proxy scraping tool.
# Scrapes a free-proxy listing, then fetches pages through each proxy.
import requests
import re
import random
import threading
class Spider:
    """Fetch web pages, optionally through HTTP proxies.

    The default URL points at a free-proxy listing page so that
    ``load_page()`` called with no arguments retrieves proxy candidates
    for ``getProxiesIP()``.
    """

    def __init__(self):
        # Default target: a free-proxy listing page.
        self.url = "https://www.kuaidaili.com/free/inha/1"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
            "Content-Type": "text/html",
        }
        self.response = ""

    def load_page(self, *urls):
        """Fetch each URL in *urls* (or ``self.url`` when none are given).

        Returns a list of ``requests.Response`` objects.
        Raises ValueError as soon as any response is not HTTP 200.

        The original implementation duplicated the fetch/check logic in
        two branches; both paths are unified here.
        """
        targets = urls if urls else (self.url,)
        responses = []
        for target in targets:
            print(target + "\n")
            # timeout keeps a dead host from hanging the caller forever
            r = requests.get(target, headers=self.headers, timeout=10)
            if r.status_code != 200:
                raise ValueError("status_code is:", r.status_code)
            print("OK")
            responses.append(r)
        return responses

    def load_page_byProxies(self, Proxies, urls):
        """Fetch each URL in *urls* through the given proxies mapping.

        Proxies: dict suitable for ``requests.get(proxies=...)``,
                 e.g. ``{"http": "http://1.2.3.4:8080"}``.
        Raises ValueError when either argument is empty/falsy.
        Prints each URL and its response status code (no return value).
        """
        if not Proxies:
            raise ValueError("Proxies is Not")
        if not urls:
            raise ValueError("Urls is Not")
        for url in urls:
            # timeout matters here: free proxies are frequently dead
            res = requests.get(url, proxies=Proxies, timeout=10)
            print(url)
            print(res.status_code)
def reText(strs, text):
    """Return every non-overlapping match of pattern *strs* in *text*.

    *text* is coerced with ``str()`` first, so ``bytes`` input is matched
    against its repr form (e.g. ``b'...'`` with literal ``\\n`` sequences) —
    callers' patterns rely on that.
    """
    matcher = re.compile(strs)
    return matcher.findall(str(text))
def getProxiesIP():
    """Scrape the free-proxy listing page and return proxy URLs.

    Returns a list like ``["http://1.2.3.4:8080", ...]``.
    Performs network I/O via ``Spider.load_page()``.

    Note: the page bytes are stringified (repr form), so HTML row
    separators appear as a literal backslash-n in the text being matched —
    hence the ``\\n`` in the pattern.
    """
    spider = Spider()
    responses = spider.load_page()
    # ip -> port; dict keeps the last port seen for a duplicated IP,
    # matching the original behavior.
    address = {}
    # Capture groups pull IP and port out in one pass. The original code
    # re-scanned each row with a second regex and only accepted ports of
    # exactly 4 digits, silently dropping 2-, 3-, and 5-digit ports.
    row_pattern = (
        r"<td [- a-zA-Z=\"]*>([0-9]{1,3}(?:\.[0-9]{1,3}){3})</td>"
        r"\\n\s*"
        r"<td [- a-zA-Z=\"]*>([0-9]{1,5})</td>"
    )
    for response in responses:
        for ip, port in re.findall(row_pattern, str(response.content)):
            address[ip] = port
    print(address)
    return ["http://" + ip + ":" + port for ip, port in address.items()]
def main():
    """Scrape free proxies, then fetch a test page through each one
    in its own thread, waiting for all of them to finish."""
    spider = Spider()
    proxy_urls = getProxiesIP()
    threads = []
    for proxy_url in proxy_urls:
        # Route both http and https traffic through the proxy. The original
        # mapped only "http", so the https:// test URL below bypassed the
        # proxy entirely and every request went out directly.
        proxies = {"http": proxy_url, "https": proxy_url}
        t = threading.Thread(
            target=spider.load_page_byProxies,
            args=(proxies, ["https://hao.360.com/?h_lnk"]),
        )
        t.start()
        threads.append(t)
    # Join so the interpreter doesn't exit while workers are mid-fetch.
    for t in threads:
        t.join()


if __name__ == "__main__":
    main()