Python crawler in practice (1): fetching proxy IPs in real time

# -*- coding: utf-8 -*-

import threading
import time

import requests
from bs4 import BeautifulSoup as BS

rawProxyList = []      # proxies as scraped, before validation
checkedProxyList = []  # proxies that answered the test request
targets = []           # listing pages to scrape

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Connection': 'keep-alive',
}

# The first three pages of xicidaili's high-anonymity proxy listing
for i in range(1, 4):
    target = "http://www.xicidaili.com/nn/%d" % i
    targets.append(target)
# print(targets)

# Thread that scrapes one listing page for proxies
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print("Target site: " + self.target)
        r = requests.get(self.target, headers=headers)
        page = r.text
        soup = BS(page, "lxml")
        # class_ is BeautifulSoup's "searching by CSS class"; see the BS docs for details
        tr_list = soup.find_all("tr", class_="odd")
        for i in range(len(tr_list)):
            row = []
            # .stripped_strings yields the row's strings with surrounding whitespace removed
            for text in tr_list[i].stripped_strings:
                row.append(text)
            # row = ['58.208.16.141', '808', '江苏苏州', '高匿', 'HTTP', ...]
            ip = row[0]
            port = row[1]
            agent = row[4].lower()
            addr = agent + "://" + ip + ":" + port
            proxy = [ip, port, agent, addr]
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()

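As a sanity check on the row-parsing logic above, here is a minimal, self-contained sketch (the HTML is hypothetical, mimicking the structure of xicidaili's table) showing how .stripped_strings flattens a row into the list that getProxy indexes:

from bs4 import BeautifulSoup as BS

# Hypothetical table row mimicking the markup the scraper expects.
html = '''
<table>
  <tr class="odd">
    <td>58.208.16.141</td><td>808</td><td>江苏苏州</td><td>高匿</td><td>HTTP</td>
  </tr>
</table>
'''

row = list(BS(html, "lxml").find("tr", class_="odd").stripped_strings)
print(row)                                             # ['58.208.16.141', '808', '江苏苏州', '高匿', 'HTTP']
print(row[4].lower() + "://" + row[0] + ":" + row[1])  # http://58.208.16.141:808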
# Thread that validates a slice of the scraped proxies
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 2
        self.testUrl = "https://www.baidu.com/"

    def checkProxy(self):
        for proxy in self.proxyList:
            # requests selects the proxy by the scheme of the requested URL,
            # so map both schemes to the candidate; an http-only mapping would
            # let the https test request bypass the proxy entirely.
            proxies = {'http': proxy[3], 'https': proxy[3]}
            t1 = time.time()
            try:
                r = requests.get(self.testUrl, headers=headers, proxies=proxies, timeout=self.timeout)
                time_used = time.time() - t1
                if r:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], proxy[3], time_used))
                else:
                    continue
            except Exception:
                continue

    def run(self):
        self.checkProxy()

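For reference, requests picks a proxy from the proxies dict by the scheme of the URL being requested, which is why checkProxy maps both 'http' and 'https' to the same candidate. A minimal sketch (the proxy address is a placeholder for illustration, so the call only succeeds if a live proxy sits there):

import requests

proxy_addr = "http://58.208.16.141:808"  # placeholder address, not a live proxy
proxies = {"http": proxy_addr, "https": proxy_addr}

# requests uses proxies["https"] here because the target URL is https;
# with only an "http" key the request would go out directly, untested.
r = requests.get("https://www.baidu.com/", proxies=proxies, timeout=2)
print(r.status_code)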
if __name__ == "__main__":
    getThreads = []
    checkedThreads = []

    # One scraper thread per target page
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)
    for i in range(len(getThreads)):
        getThreads[i].start()
    for i in range(len(getThreads)):
        getThreads[i].join()

    print('.' * 10 + "Scraped %s proxies in total" % len(rawProxyList) + '.' * 10)

    # Start 10 checker threads: split the scraped proxies into 10 slices, one per thread
    for i in range(10):
        n = len(rawProxyList) / 10
        # print(str(int(n * i)) + ":" + str(int(n * (i + 1))))
        t = ProxyCheck(rawProxyList[int(n * i):int(n * (i + 1))])
        checkedThreads.append(t)
    for i in range(len(checkedThreads)):
        checkedThreads[i].start()
    for i in range(len(checkedThreads)):
        checkedThreads[i].join()

    print('.' * 10 + "%s proxies passed validation in total" % len(checkedProxyList) + '.' * 10)

    # Persist the validated proxies
    f = open("proxy_list.txt", 'w+')
    for checked_proxy in sorted(checkedProxyList):
        print("checked proxy is: %s\t%s" % (checked_proxy[3], checked_proxy[4]))
        f.write("%s:%s\t%s\t%s\t%s\n" % (checked_proxy[0], checked_proxy[1],
                                         checked_proxy[2], checked_proxy[3], checked_proxy[4]))
    f.close()

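Once the script has run, proxy_list.txt can be consumed by other crawlers. A minimal sketch (assuming the tab-separated format written above) that picks the proxy with the lowest measured response time:

import requests

# Each line: "ip:port<TAB>agent<TAB>addr<TAB>time_used", as written by the script above.
with open("proxy_list.txt") as f:
    records = [line.rstrip("\n").split("\t") for line in f if line.strip()]

# Sort by measured response time (the last field) and take the fastest entry.
records.sort(key=lambda rec: float(rec[-1]))
fastest_addr = records[0][2]  # the agent://ip:port address

r = requests.get("http://www.baidu.com/",
                 proxies={"http": fastest_addr, "https": fastest_addr},
                 timeout=5)
print(r.status_code)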