In the previous post we collected free proxies. If you implemented it yourself, you probably noticed that besides the 3-5 second cooldown on every loop, validating each proxy IP also takes time. With tens of thousands of IPs, at an average of 1 second per validation, that is over 10,000 seconds!
Far too slow!
So how do we fix this?
Think of a restaurant kitchen: it is usually run by two people, one washing and chopping the vegetables, the other doing the cooking. That division of labour greatly improves throughput.
This is exactly the idea behind multithreading!
Baidu Baike defines multithreading as follows:
Multithreading is a technique, implemented in software or hardware, for running multiple threads concurrently. A computer with multithreading support can execute more than one thread at the same time, improving overall processing performance.
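Before getting to the crawler, here is a minimal sketch of Python's standard threading module; the function names prep and cook are made up for this illustration only. It shows two threads running concurrently, just like the two cooks in the kitchen:

import threading
import time

def prep():
    # the "vegetable washer": stands in for downloading pages
    for i in range(3):
        print("downloading page", i)
        time.sleep(1)

def cook():
    # the "cook": stands in for validating proxies at the same time
    for i in range(3):
        print("validating batch", i)
        time.sleep(1)

t1 = threading.Thread(target=prep)
t2 = threading.Thread(target=cook)
t1.start()
t2.start()
t1.join()
t2.join()
# the two loops overlap, so the whole run takes about 3 seconds instead of 6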
Why use multithreading here?
The earlier program has a bottleneck: every validation can wait up to 3 seconds for a timeout, and that waiting time could just as well be spent downloading the next page of proxies.
If downloading all pages takes D seconds and validating all IPs takes V seconds, running them one after the other costs D + V, while overlapping them costs roughly max(D, V). So with two threads, one dedicated to downloading proxy IPs and one dedicated to validating them, we can save up to about half the time.
Combined with the database, the modified code is shown below. It imports DBHelper, the small database helper module built earlier in this series.
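If you no longer have DBHelper at hand, the following is only a minimal pymysql-based stand-in written to match the two calls used below, DBHelper.select and DBHelper.exec. The connection parameters and the iptbl table (columns ipurl and state) are assumptions inferred from the queries, not necessarily the original setup:

# DBHelper.py -- minimal stand-in; adjust host/user/password/db to your environment
# assumed schema: create table iptbl (ipurl varchar(64) primary key, state int default 0)
import pymysql

def _conn():
    # open a fresh connection per call so the two threads never share one connection
    return pymysql.connect(host="localhost", user="root",
                           password="123456", db="ipdb", charset="utf8mb4")

def select(sql):
    # run a SELECT and return all rows as a list of tuples
    con = _conn()
    try:
        with con.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        con.close()

def exec(sql):
    # run an INSERT / UPDATE / DELETE and commit it
    con = _conn()
    try:
        with con.cursor() as cur:
            cur.execute(sql)
        con.commit()
    finally:
        con.close()

With a DBHelper like that in place, the full two-thread version, one thread downloading and one validating, looks like this: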
import requests
from bs4 import BeautifulSoup
import time
import random
import DBHelper
import threading

heads = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
def downIP(startPage, endPage):
    for i in range(startPage, endPage):
        ################################################################################
        # Scrape free proxy IPs from 66ip.cn
        url = "http://www.66ip.cn/" + str(i) + ".html"
        response = requests.get(url, headers=heads)
        soup = BeautifulSoup(response.content.decode("gbk"), "lxml")
        # Find all tr elements of the table whose "bordercolor" attribute is "#6699ff"
        trs = soup.find("table", attrs={"bordercolor": "#6699ff"}).find_all("tr")
        for tr in trs[1:]:
            address = tr.find_all("td")[2].get_text().strip()
            if "市" in address:
                ip = tr.find_all("td")[0].get_text().strip()
                port = tr.find_all("td")[1].get_text().strip()
                proxy = "http://" + ip + ":" + port
                # print(proxy + " " + address)
                if len(DBHelper.select("select * from iptbl where ipurl='" + proxy + "'")) == 0:
                    DBHelper.exec("insert into iptbl(ipurl) values('" + proxy + "')")
        ################################################################################
        # Scrape free proxy IPs from kuaidaili.com
        url = "https://www.kuaidaili.com/free/inha/" + str(i) + "/"
        response = requests.get(url, headers=heads)
        soup = BeautifulSoup(response.content.decode("utf-8"), "lxml")
        # Find all tr elements of the table whose "class" is "table table-bordered table-striped"
        trs = soup.find("table", attrs={"class": "table table-bordered table-striped"}).find_all("tr")
        for tr in trs[1:]:
            address = tr.find_all("td")[4].get_text().strip()
            if "市" in address:
                ip = tr.find_all("td")[0].get_text().strip()
                port = tr.find_all("td")[1].get_text().strip()
                proxy = "http://" + ip + ":" + port
                # print("kuaidaili: " + proxy + " " + address)
                if len(DBHelper.select("select * from iptbl where ipurl='" + proxy + "'")) == 0:
                    DBHelper.exec("insert into iptbl(ipurl) values('" + proxy + "')")
        ################################################################################
        # Scrape free proxy IPs from 89ip.cn
        url = "http://www.89ip.cn/index_" + str(i) + ".html"
        response = requests.get(url, headers=heads)
        soup = BeautifulSoup(response.content.decode("utf-8"), "lxml")
        # Find all tr elements of the table whose "class" is "layui-table"
        trs = soup.find("table", attrs={"class": "layui-table"}).find_all("tr")
        for tr in trs[1:]:
            address = tr.find_all("td")[2].get_text().strip()
            if "市" in address:
                ip = tr.find_all("td")[0].get_text().strip()
                port = tr.find_all("td")[1].get_text().strip()
                proxy = "http://" + ip + ":" + port
                # print("89ip: " + proxy + " " + address)
                if len(DBHelper.select("select * from iptbl where ipurl='" + proxy + "'")) == 0:
                    DBHelper.exec("insert into iptbl(ipurl) values('" + proxy + "')")
        ################################################################################
        # Scrape free proxy IPs from xicidaili.com
        url = "https://www.xicidaili.com/nn/" + str(i)
        response = requests.get(url, headers=heads)
        soup = BeautifulSoup(response.content.decode("utf-8"), "lxml")
        # Find all tr elements of the table whose "id" is "ip_list"
        trs = soup.find("table", attrs={"id": "ip_list"}).find_all("tr")
        for tr in trs[1:]:
            address = tr.find_all("td")[3].get_text().strip()
            # if "市" in address:   # xicidaili entries are not filtered by city here
            ip = tr.find_all("td")[1].get_text().strip()
            port = tr.find_all("td")[2].get_text().strip()
            proxy = "http://" + ip + ":" + port
            # print("xicidaili: " + proxy + " " + address)
            if len(DBHelper.select("select * from iptbl where ipurl='" + proxy + "'")) == 0:
                DBHelper.exec("insert into iptbl(ipurl) values('" + proxy + "')")
        ################################################################################
        # Pause 3~5 seconds between pages
        time.sleep(random.randint(3, 5))
def valiIP(x):
    # Fetch one unvalidated record (state=0) at offset x
    result = DBHelper.select("select ipurl from iptbl where state=0 limit " + str(x) + ",1")
    for row in result:
        proxy = row[0]
        # Check whether the proxy works; an unusable proxy raises an exception
        try:
            proxies = {"http": proxy}
            urlList = ["http://www.qq.com", "http://www.jd.com", "http://www.baidu.com",
                       "http://www.csdn.net", "http://www.qidian.com", "http://www.51cto.com"]
            tmpUrl = random.choice(urlList)
            rsp = requests.get(tmpUrl, headers=heads, proxies=proxies, timeout=3)
            # Delete the proxy if the response is not 200
            if rsp.status_code != 200:
                sql = "delete from iptbl where ipurl='" + proxy + "'"
                DBHelper.exec(sql)
                print("deleted " + proxy)
            else:
                # Otherwise mark it as validated
                sql = "update iptbl set state=1 where ipurl='" + proxy + "'"
                DBHelper.exec(sql)
                print(proxy + " is usable")
        except Exception:
            sql = "delete from iptbl where ipurl='" + proxy + "'"
            DBHelper.exec(sql)
            print("deleted " + proxy)
    # End of validation
if __name__ == "__main__":
    # Thread 1: download proxy IPs
    t1 = threading.Thread(target=downIP, args=[1, 100])
    t1.start()
    x = 0
    while True:
        # Thread 2 (spawned repeatedly): validate proxy IPs
        t2 = threading.Thread(target=valiIP, args=[x])
        t2.start()
        x += 1
        if x > 1000:
            x = 0
        time.sleep(0.2)
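The main loop above launches a fresh validation thread every 0.2 seconds, which is simple but puts no cap on how many validators run at once. If you want to bound the number of concurrent validations, Python's standard concurrent.futures.ThreadPoolExecutor is one alternative. The sketch below is only an illustration of that idea, reusing the same downIP and valiIP functions in place of the original __main__ block; it is not part of the original article's code:

from concurrent.futures import ThreadPoolExecutor

if __name__ == "__main__":
    # Thread 1: download proxy IPs, same as before
    t1 = threading.Thread(target=downIP, args=[1, 100])
    t1.start()

    # At most 10 validation tasks run at the same time
    with ThreadPoolExecutor(max_workers=10) as pool:
        x = 0
        while True:
            pool.submit(valiIP, x)
            x += 1
            if x > 1000:
                x = 0
            time.sleep(0.2)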
Scraping several proxy-IP sites in one pass yields more usable IPs, and since every site follows the same pattern, adding another one is straightforward, as the sketch below shows.
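Each per-site block in downIP does the same thing: fetch a page, locate the table, pull IP, port and address out of fixed columns, and insert the proxy if it is new. Factoring that into a helper makes a new site a one-line addition. This is only a suggested refactor, not the article's code, and the helper name and parameters are made up for illustration:

def scrapeSite(url, encoding, table_attrs, ip_col, port_col, addr_col, require_city=True):
    # Generic version of the per-site blocks in downIP:
    # download one listing page and insert any new proxies into iptbl
    response = requests.get(url, headers=heads)
    soup = BeautifulSoup(response.content.decode(encoding), "lxml")
    trs = soup.find("table", attrs=table_attrs).find_all("tr")
    for tr in trs[1:]:
        tds = tr.find_all("td")
        address = tds[addr_col].get_text().strip()
        if require_city and "市" not in address:
            continue
        proxy = "http://" + tds[ip_col].get_text().strip() + ":" + tds[port_col].get_text().strip()
        if len(DBHelper.select("select * from iptbl where ipurl='" + proxy + "'")) == 0:
            DBHelper.exec("insert into iptbl(ipurl) values('" + proxy + "')")

# e.g. the 66ip block becomes a single call per page:
# scrapeSite("http://www.66ip.cn/" + str(i) + ".html", "gbk",
#            {"bordercolor": "#6699ff"}, ip_col=0, port_col=1, addr_col=2)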
The result is shown in the figure below: