支持抓取,验证,使用http,https代理,可添加自己的稳定代理。哪位好心人能给我点好的代理?Thanks♪(・ω・)ノ,因为我抓的源都不怎么靠谱
╮(╯▽╰)╭
https://gitee.com/zouzheng/personal_gadget
# coding=gbk
import ast
import Queue
import re
import sys
import threading

import requests
# Caveat with the requests ``proxies`` argument: the scheme of the requested
# URL and the scheme key of the proxy dict should match (both http or both https).
# Browser-like User-Agent header passed as ``headers=`` to the requests.get calls below.
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0; SE 2.X MetaSr 1.0) like Gecko"}
class proxy(object):
    """Scrape, validate and persist HTTP/HTTPS proxies.

    A proxy is a single-entry dict in the shape accepted by the ``proxies``
    argument of ``requests``, e.g. ``{"http": "1.2.3.4:80"}``.  Validated
    proxies are stored one per line (as ``str(dict)``) in
    ``proxy_ips_http.txt`` or ``proxy_ips_https.txt``, opened relative to the
    current working directory.

    updata() -- scrape fresh candidates, validate them, store the live ones.
    check()  -- re-validate the stored proxies and rewrite the file.
    get()    -- return the stored proxies without validating them (they may
                well be dead by now).
    """

    # "ip:port" pairs as they appear in the 66ip plain-text API response.
    _IP_PORT_RE = re.compile(r"[0-9]+?\.[0-9]+?\.[0-9]+?\.[0-9]+?:[0-9]+")

    def __init__(self, protocol):
        """Create a proxy pool for *protocol* (``"http"``; anything else is
        treated as https by the methods of this class)."""
        self.protocol = protocol
        self.updata_ips = []
        self.check_ips = []
        self.get_ips = []
        self.proxys_temp = []
        # updata_run() is invoked concurrently by the worker threads and does
        # a read-then-append on a shared file, so it has to be serialized.
        self._lock = threading.Lock()

    # Every from_* source returns a list of proxy dicts: [{}, {}]

    def from_myself(self):
        """Return the user's own, stable proxies (empty by default)."""
        ips = []
        return ips

    # 0x00 this source may stop working; keep it up to date
    def from_66ip(self):
        """Fetch candidate proxies from the 66ip.cn API.

        ``proxytype=0`` is requested for http and ``proxytype=1`` otherwise.
        Raises whatever ``requests.get`` raises on network failure.
        """
        proxytype = 0 if self.protocol == "http" else 1
        url = ("http://www.66ip.cn/nmtq.php?getnum=288&isp=0&anonymoustype=0"
               "&area=1&proxytype=%d&api=66ip" % proxytype)
        body = requests.get(url, headers=head).content
        # Build the dicts directly instead of eval()-ing scraped text.
        return [{self.protocol: addr} for addr in self._IP_PORT_RE.findall(body)]

    # 0x01 mostly dead proxies, not very useful, disabled by default
    # (pass count > 1 to enable)
    def from_xicidaili(self, count=1):
        """Fetch candidate proxies from xicidaili.com.

        Pages ``1 .. count - 1`` are downloaded, so the default ``count=1``
        downloads nothing and returns an empty list.
        """
        section = "wt" if self.protocol == "http" else "wn"
        ips = []
        for page in range(1, count):
            html = requests.get("http://www.xicidaili.com/" + section + "/" + str(page),
                                headers=head).content
            hosts = re.findall(r"<td>([0-9]+?\.[0-9]+?\.[0-9]+?\.[0-9]+?)</td>", html)
            ports = re.findall(r"<td>([0-9]+?)</td>", html)
            # zip() stops at the shorter list, so a page with unequal numbers
            # of host and port cells cannot raise IndexError.
            for host, port in zip(hosts, ports):
                ips.append({self.protocol: "%s:%s" % (host, port)})
        return ips

    def updata_run(self, ip):
        """Worker callback: persist a proxy that passed validation.

        The proxy is appended to the storage file unless an identical line is
        already present, and recorded in ``self.updata_ips``.
        """
        entry = str(ip)
        with self._lock:
            with self.file() as handle:
                # "a+" may start positioned at the end of the file; rewind
                # before reading the existing entries.
                handle.seek(0)
                known = set(line.strip() for line in handle.readlines())
                if entry not in known:
                    print(ip)
                    handle.seek(0, 2)  # reposition before switching to writing
                    handle.write(entry + "\r\n")
            if ip not in self.updata_ips:
                self.updata_ips.append(ip)

    def updata(self):
        """Scrape all sources, validate the candidates and return the live ones.

        Candidates already present in ``self.updata_ips`` are skipped, and
        each distinct candidate is validated only once per call.
        """
        thread_count = 10
        queue = Queue.Queue()
        # Candidate proxies from all sources:
        self.proxys_temp += self.from_myself()
        self.proxys_temp += self.from_66ip()
        self.proxys_temp += self.from_xicidaili()
        # Collapse duplicates so repeated calls do not grow the list forever
        # and no proxy is validated twice in the same run.
        self.proxys_temp[:] = self._dedupe(self.proxys_temp)
        for ip in self.proxys_temp:
            if ip not in self.updata_ips:
                queue.put(ip)
        self._run_workers(queue, self.updata_run, thread_count)
        return self.updata_ips

    def check_run(self, ip):
        """Worker callback: record a stored proxy that is still alive."""
        print(ip)
        self.check_ips.append(ip)

    def check(self):
        """Re-validate the stored proxies and rewrite the file with survivors.

        Returns the list of proxies that are still alive.
        """
        thread_count = 10
        queue = Queue.Queue()
        for ip in self.get():
            queue.put(ip)
        del self.get_ips[:]
        # Start from a fresh list so results of an earlier check() are not
        # written to the file a second time.
        self.check_ips = []
        self._run_workers(queue, self.check_run, thread_count)
        with self.file() as handle:
            handle.truncate(0)
            for ip in self.check_ips:
                handle.write(str(ip) + "\r\n")
        return self.check_ips

    def get(self):
        """Return the proxies stored in the file, without validating them.

        Blank lines are ignored.  ``ast.literal_eval`` raises ``ValueError``
        or ``SyntaxError`` for a line that is not a Python literal.
        """
        with self.file() as handle:
            handle.seek(0)  # "a+" may start positioned at the end of the file
            lines = [line.strip() for line in handle.readlines()]
        # literal_eval only accepts literals, unlike eval() which would run
        # arbitrary code found in the file.
        stored = [ast.literal_eval(line) for line in lines if line]
        # Replace rather than extend: repeated calls must not pile up copies.
        del self.get_ips[:]
        self.get_ips.extend(stored)
        return self.get_ips

    def file(self):
        """Open the storage file for this protocol in ``"a+"`` mode.

        The caller is responsible for closing the returned file object.
        """
        if self.protocol == "http":
            return open("proxy_ips_http.txt", "a+")
        return open("proxy_ips_https.txt", "a+")

    @staticmethod
    def _dedupe(ips):
        """Return *ips* without duplicate dicts, keeping first-seen order."""
        seen = set()
        unique = []
        for ip in ips:
            key = tuple(sorted(ip.items()))
            if key not in seen:
                seen.add(key)
                unique.append(ip)
        return unique

    @staticmethod
    def _run_workers(queue, callback, thread_count):
        """Drain *queue* with *thread_count* moreThead workers and wait for them."""
        threads = [moreThead(queue, callback) for _ in range(thread_count)]
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()
class moreThead(threading.Thread):
    """Worker thread that validates proxies taken from a shared queue.

    Each queued item is a proxy dict such as ``{"http": "1.2.3.4:80"}``.
    The proxy is used for a GET request to Baidu, and when the response
    status is 200 the proxy is passed to the ``_run`` callback.
    """

    def __init__(self, queue, _run):
        """Store the shared work *queue* and the success callback *_run*."""
        threading.Thread.__init__(self)
        self.queue = queue
        self._run = _run

    def run(self):
        """Validate queued proxies until the queue is empty."""
        while True:
            # get_nowait() instead of "while not empty(): get()": with several
            # workers another thread can take the last item between the two
            # calls, leaving get() blocked forever.
            try:
                ip = self.queue.get_nowait()
            except Queue.Empty:
                return
            if self._probe(ip):
                self._run(ip)

    def _probe(self, ip):
        """Return True when a request through proxy *ip* answers with 200."""
        # A dict with an "http" key is tested over http, anything else over https.
        if "http" in ip:
            url = "http://www.baidu.com"
        else:
            url = "https://www.baidu.com"
        try:
            respon = requests.get(url, headers=head, proxies=ip, timeout=3)
        except Exception:
            # Best effort on purpose: any failure just means the proxy is
            # unusable, and must not kill the worker.
            return False
        return respon.status_code == 200