To keep the crawler from being banned, I want to build my own proxy pool. Since I don't have one yet, I can only crawl single-threaded, pausing with time.sleep(). Once the crawl is done, the collected IPs will be the raw material for the proxy pool later.
1.1 Crawler code for Xici proxies
import time
import csv
from lxml import etree
from urllib import request

Headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Referer': 'https://www.xicidaili.com/nn/',
}
BASE_URL = 'https://www.xicidaili.com/nn/%d'

# filepath: output file; datas format: [[1, 2, 3], [3, 4, 5], [5, 4, 6]]
def write_csv(filepath, datas):
    try:
        with open(filepath, 'a', encoding='utf-8', newline='') as wf:
            writer = csv.writer(wf)
            for data in datas:
                writer.writerow(data)
        return True
    except Exception as ex:
        return False

# Fetch one listing page and return its HTML
def crawl(url):
    req = request.Request(url=url, headers=Headers)
    respon = request.urlopen(req)
    html = respon.read().decode('utf-8')
    return html

# Skip the header row, then pull the IP and port out of every table row
def parsel(html):
    elem_obj = etree.HTML(html)
    elems = elem_obj.xpath('//table//tr[position()>1]')  # [position() <= 3]
    result = []
    for elem in elems:
        ip = elem.xpath('./td[2]/text()')[0]
        port = elem.xpath('./td[3]/text()')[0]
        result.append([ip, port])
    return result

if __name__ == '__main__':
    for i in range(1, 1001):
        html = crawl(BASE_URL % i)
        datas = parsel(html)
        flag = write_csv('ip_port.csv', datas)
        if flag:
            print('Page {} crawled successfully'.format(i))
        else:
            print('Page {} failed'.format(i))
        # Rest every 10 pages to lower the risk of getting banned
        if i % 10 == 0:
            time.sleep(10)
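One caveat: urlopen raises on any network hiccup or ban, which kills the whole loop. A minimal retry wrapper, purely a sketch (the crawl_with_retry name and the retries/delay parameters are my own, not part of the original code; it reuses the Headers dict defined above):

import time
from urllib import request, error

# Hypothetical helper: retry a failed page fetch a few times before
# giving up, sleeping between attempts. Reuses Headers from above.
def crawl_with_retry(url, retries=3, delay=5):
    for attempt in range(retries):
        try:
            req = request.Request(url=url, headers=Headers)
            return request.urlopen(req).read().decode('utf-8')
        except error.URLError as ex:
            print('Attempt {} failed: {}'.format(attempt + 1, ex))
            time.sleep(delay)
    return None  # caller should skip this page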
1.2 Results
About 3,600 IPs were crawled, which should be enough. Next we will use multiple threads to verify which of them actually work.
2.1 Code for validating the crawled IPs
import threading
import csv
import requests
from queue import Queue, Empty

# One lock shared by all workers so concurrent CSV appends don't interleave
write_lock = threading.Lock()

# data = [ip, port]
def check_proxies(q):
    # Use an HTTPS test URL: with a {"https": ...} mapping, requests only
    # routes HTTPS requests through the proxy, so an HTTP URL would never
    # actually exercise it.
    url = 'https://httpbin.org/get'
    # Keep pulling proxies until the queue is drained, instead of
    # checking a single one and exiting
    while True:
        try:
            data = q.get_nowait()
        except Empty:
            break
        proxies = {"https": "{}:{}".format(data[0], data[1])}
        try:
            requests.get(url, proxies=proxies, timeout=10)
        except Exception as ex:
            print(ex)
        else:
            flag = write_csv('./success_ip_port.csv', data)
            if flag:
                print(data, 'written successfully')
            else:
                print(data, 'write failed')

# data = [ip, port]
def write_csv(filepath, data):
    try:
        with write_lock:
            with open(filepath, 'a', encoding='utf-8', newline='') as wf:
                writer = csv.writer(wf)
                writer.writerow(data)
        return True
    except Exception as ex:
        return False

def read_csv(filepath, q):
    with open(filepath, 'r', encoding='utf-8', newline='') as rf:
        reader = csv.reader(rf)
        for r in reader:
            try:
                q.put(r)
            except Exception as ex:
                print(ex)

if __name__ == '__main__':
    q = Queue(20000)
    read_csv('./ip_port1.csv', q)
    for i in range(100):
        t = threading.Thread(target=check_proxies, args=(q,))
        t.start()
2.2 Validation results
Summary: the code has one flaw: I forgot to store each proxy's protocol when crawling, so for now every proxy has to be treated as HTTPS.
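A sketch of how the next iteration could fix this, assuming Xici's "type" column (HTTP/HTTPS) sits in the sixth table cell; that index is my guess from the page layout, so verify it against the live page, and parsel_with_protocol and build_proxies are hypothetical names:

from lxml import etree

# Sketch: also capture the protocol, storing [ip, port, protocol]
def parsel_with_protocol(html):
    elem_obj = etree.HTML(html)
    elems = elem_obj.xpath('//table//tr[position()>1]')
    result = []
    for elem in elems:
        ip = elem.xpath('./td[2]/text()')[0]
        port = elem.xpath('./td[3]/text()')[0]
        protocol = elem.xpath('./td[6]/text()')[0].lower()  # assumed "type" column
        result.append([ip, port, protocol])
    return result

# During validation, key the proxies mapping off the stored protocol so
# HTTP proxies are tested over HTTP and HTTPS proxies over HTTPS
def build_proxies(ip, port, protocol):
    return {protocol: '{}:{}'.format(ip, port)}

With the protocol stored, check_proxies could then pick a matching test URL ('http://httpbin.org/get' or 'https://httpbin.org/get') instead of assuming HTTPS for everything.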