在 GitHub 上搜索了一些爬虫的源代码,发现它们基本上都实现了多线程。于是我仿照它们的代码,把之前写过的域名筛选程序改成了多线程版本,发现产生结果的速度确实比单线程快了很多:
#__author__ = 'hefangteng'
# coding:utf-8
import collections
import itertools
import re
import threading

import MySQLdb
import requests
# Endpoint that every query in this script is POSTed to (passed to Do_screen below).
url = 'http://www.zgsj.com/domain_reg/domaintrans.asp'
class Do_screen(threading.Thread):
    """Multi-threaded brute-force checker for available ``.com`` domains.

    Candidate names are queued in ``self.a`` (see ``Get_all_str``) and drained
    by several worker threads (see ``run``). Each worker POSTs one candidate
    at a time to ``self.url`` and records the names whose response value is
    ``'no'``, which this script treats as "domain is available".

    The code is written to run unchanged on Python 2 and Python 3.
    """

    # Alphabet the candidate names are built from.
    _LETTERS = 'abcdefghijklmnopqrstuvwxyz'

    # Captures the ``value`` attribute of the result row; compiled once
    # instead of on every response.
    _VALUE_RE = re.compile(r"<tr height=.*?value='(.*?)' />")

    def __init__(self, url):
        """Create a checker that posts its queries to ``url``."""
        # Without the base initialiser the object is not a usable Thread
        # (start(), is_alive(), name, ... all fail).
        threading.Thread.__init__(self)
        # Pending candidates. A deque gives O(1) removal from the front;
        # it still supports append(), len() and indexing like the old list.
        self.a = collections.deque()
        self.url = url
        # Serialises access to ``self.a`` between the worker threads.
        self._queue_lock = threading.Lock()

    def GetHtml(self, data, timeout=30):
        """POST the form fields ``data`` to ``self.url``.

        ``timeout`` (seconds) keeps a stalled connection from blocking a
        worker forever. Returns the ``requests`` response object; network
        failures propagate as ``requests.RequestException``.
        """
        return requests.post(self.url, data=data, timeout=timeout)

    # Extract the value returned for the submitted form
    def Get_Value(self, html):
        """Return the first captured ``value`` in ``html``, or None.

        ``html`` may be text or raw bytes (e.g. ``response.content``).
        """
        if isinstance(html, bytes) and not isinstance(html, str):
            # Python 3 only: a text pattern cannot search bytes. The markup
            # being matched is ASCII, so undecodable bytes are just replaced.
            html = html.decode('utf-8', 'replace')
        found = self._VALUE_RE.search(html)
        if found is None:
            return None
        return found.group(1)

    # Generate every lower-case string of the requested length
    def Get_all_str(self, length=5):
        """Append every ``length``-letter lower-case string to ``self.a``.

        Names are produced in lexicographic order ('aaaaa', 'aaaab', ...).
        The default of 5 yields 26**5 (about 11.9 million) candidates.

        Raises:
            ValueError: if ``length`` is smaller than 1.
        """
        if length < 1:
            raise ValueError('length must be at least 1')
        for letters in itertools.product(self._LETTERS, repeat=length):
            self.a.append(''.join(letters))

    def fun(self, i):
        """Worker loop: check queued names until the queue is empty.

        ``i`` is the worker's label as a string; available domains are
        written one per line to ``result<i>.txt``.
        """
        filename = 'result' + i + '.txt'
        suffix = '.com'
        # ``with`` guarantees the file is closed even if the loop raises.
        with open(filename, 'w') as result_file:
            print('爬虫' + i + '号加载!')
            while True:
                x = self.get_one()
                if x == 1:  # sentinel returned by get_one(): nothing left
                    break
                data = {'d_name': x + suffix, 'dtype': 'common'}
                try:
                    html = self.GetHtml(data)
                except requests.RequestException as err:
                    # One failed request must not kill the whole worker.
                    print('request for ' + x + suffix + ' failed: ' + str(err))
                    continue
                value = self.Get_Value(html.content)
                if value != 'no':  # 'no' marks an available domain
                    continue
                try:
                    # One domain per line; without the newline all results
                    # ran together in the output file.
                    result_file.write(x + suffix + '\n')
                except (IOError, OSError):
                    print('write to file fail!')
                print('可用域名:' + x + '.com')

    # Start the worker threads (10 by default)
    def run(self, thread_count=10):
        """Run ``thread_count`` workers and block until all have finished."""
        workers = [
            threading.Thread(target=self.fun, args=(str(n + 1),))
            for n in range(thread_count)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    # Hand out the next candidate name
    def get_one(self):
        """Remove and return the next queued name, or ``1`` when none is left.

        The emptiness check and the removal happen under one lock so that
        two workers can never receive the same name or race on the last one.
        """
        with self._queue_lock:
            if not self.a:
                print('list empty')
                return 1
            pup = self.a[0]
            # ``del`` works for both deque and list, in case a caller
            # replaced ``self.a`` with a plain list.
            del self.a[0]
            return pup
if __name__ == '__main__':
    # Script entry point: queue every candidate name, then start crawling.
    screener = Do_screen(url)
    screener.Get_all_str()
    screener.run()