import threading,time,random,htmllib,urllib,formatter,string,re
def getPageNum(data):
    """Extract the total page count from a listing page's HTML.

    Looks for the pager marker 'pageNum">共<digits>' emitted by the
    site's pagination widget.

    Args:
        data: raw HTML of a listing page.

    Returns:
        The page count as an int, or 0 when no pager marker is present.
    """
    # The original used findall + bare except around result[0]; a single
    # re.search with an explicit None check is narrower and always yields
    # a consistent int (the old code returned str on success, int 0 on miss).
    match = re.search(r'pageNum">共(\d+)', data)
    return int(match.group(1)) if match else 0
def geturls(cururl):
    """Fetch `cururl`, harvest supplier links, and record the results.

    Side effects:
      * appends "<cururl>@@@@<pagecount>" to datadown/listUrl.txt
      * appends each newly discovered URL to datadown/historyUrl.txt
      * extends the shared global worklist `l` with URLs not already in it

    Args:
        cururl: listing-page URL to crawl.
    """
    global l
    data = urllib.urlopen(cururl).read()

    parser = GetLinks()
    parser.feed(data)
    parser.close()

    urls = []
    for item in parser.links:
        # Keep links containing '/suppliers/' strictly past position 0
        # (preserving the original index()-truthiness behaviour, which
        # skipped links that *start* with '/suppliers/') and that do not
        # point back at the page being crawled.
        pos = item.find('/suppliers/')
        if pos > 0 and not item.startswith(cururl):
            urls.append(item)

    # Page count per listing is capped at 100.
    cursize = getPageNum(data)
    if int(cursize) > 100:
        cursize = 100

    # 'with' guarantees the handle is closed even if write() raises.
    with open('datadown/listUrl.txt', 'a+') as output:
        output.write(cururl + "@@@@" + str(cursize) + "\n")

    for ni in urls:
        # NOTE(review): `l` is shared across threads, and geturls() is also
        # called outside mylock in myThread.run(); this mutation is then
        # unprotected -- confirm whether that race is acceptable.
        if ni not in l:
            l.append(ni)
            with open('datadown/historyUrl.txt', 'a+') as output:
                output.write(ni + "\n")
class GetLinks(htmllib.HTMLParser):
    """HTML parser that collects the href of every anchor with link text.

    After feed()/close(), `self.links` holds the hrefs of all anchors
    whose (stripped) inner text was non-empty.
    """

    def __init__(self):
        self.links = []
        # NullFormatter discards rendered output; only parse events matter.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        # Begin buffering the anchor's text and remember its target.
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        # Record the target only when the anchor had both a non-empty
        # href and some visible (stripped) text.
        text = string.strip(self.save_end())
        if self.link and text:
            self.links.append(self.link)
# Shared crawler state, accessed by every worker thread.
mylock = threading.RLock()  # guards `num` and the scheduling section in run()
num = 0  # count of URLs claimed by worker threads so far
l = []  # global worklist of discovered listing URLs
class myThread(threading.Thread):
    """Worker thread: repeatedly claims the next URL from the shared
    worklist `l` and crawls it via geturls()."""

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name  # label used only in progress printouts

    def run(self):
        global num, l
        while True:
            # -- synchronized section begin --
            mylock.acquire()
            num += 1
            if len(l) == 0:
                # Worklist empty: seed it by crawling the start page
                # (URL-encoded GB2312 for the city name) while still
                # holding the lock.
                curl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
                print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num)
                geturls(curl)
            # NOTE(review): if seeding found no links, or num has outrun
            # len(l), this index raises IndexError -- confirm intended.
            curl = l[num - 1]
            print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num + 1)
            mylock.release()
            # -- synchronized section end --
            # Crawl outside the lock so other threads can claim URLs;
            # NOTE(review): geturls() mutates `l` here without the lock.
            geturls(curl)
            # Stop once every known URL has been claimed. NOTE(review):
            # `num` and `len(l)` are read here without holding mylock.
            if num >= len(l):
                #mylock.release()
                #print 'len(l):'+ str(len(self.web.l))
                print "\n\nl:\n"
                #for i in l:
                #print i
                break
def test():
    """Spawn and start five worker threads named A1 through A5."""
    for idx in range(1, 6):
        worker = myThread('A' + str(idx))
        worker.start()
# Script entry point: kick off the crawl with five worker threads.
if __name__ == '__main__':
    test()
def getPageNum(data):
    """Extract the total page count from a listing page's HTML.

    Looks for the pager marker 'pageNum">共<digits>' emitted by the
    site's pagination widget.

    Args:
        data: raw HTML of a listing page.

    Returns:
        The page count as an int, or 0 when no pager marker is present.
    """
    # The original used findall + bare except around result[0]; a single
    # re.search with an explicit None check is narrower and always yields
    # a consistent int (the old code returned str on success, int 0 on miss).
    match = re.search(r'pageNum">共(\d+)', data)
    return int(match.group(1)) if match else 0
def geturls(cururl):
    """Fetch `cururl`, harvest supplier links, and record the results.

    Side effects:
      * appends "<cururl>@@@@<pagecount>" to datadown/listUrl.txt
      * appends each newly discovered URL to datadown/historyUrl.txt
      * extends the shared global worklist `l` with URLs not already in it

    Args:
        cururl: listing-page URL to crawl.
    """
    global l
    data = urllib.urlopen(cururl).read()

    parser = GetLinks()
    parser.feed(data)
    parser.close()

    urls = []
    for item in parser.links:
        # Keep links containing '/suppliers/' strictly past position 0
        # (preserving the original index()-truthiness behaviour, which
        # skipped links that *start* with '/suppliers/') and that do not
        # point back at the page being crawled.
        pos = item.find('/suppliers/')
        if pos > 0 and not item.startswith(cururl):
            urls.append(item)

    # Page count per listing is capped at 100.
    cursize = getPageNum(data)
    if int(cursize) > 100:
        cursize = 100

    # 'with' guarantees the handle is closed even if write() raises.
    with open('datadown/listUrl.txt', 'a+') as output:
        output.write(cururl + "@@@@" + str(cursize) + "\n")

    for ni in urls:
        # NOTE(review): `l` is shared across threads, and geturls() is also
        # called outside mylock in myThread.run(); this mutation is then
        # unprotected -- confirm whether that race is acceptable.
        if ni not in l:
            l.append(ni)
            with open('datadown/historyUrl.txt', 'a+') as output:
                output.write(ni + "\n")
class GetLinks(htmllib.HTMLParser):
    """HTML parser that collects the href of every anchor with link text.

    After feed()/close(), `self.links` holds the hrefs of all anchors
    whose (stripped) inner text was non-empty.
    """

    def __init__(self):
        self.links = []
        # NullFormatter discards rendered output; only parse events matter.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        # Begin buffering the anchor's text and remember its target.
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        # Record the target only when the anchor had both a non-empty
        # href and some visible (stripped) text.
        text = string.strip(self.save_end())
        if self.link and text:
            self.links.append(self.link)
# Shared crawler state, accessed by every worker thread.
mylock = threading.RLock()  # guards `num` and the scheduling section in run()
num = 0  # count of URLs claimed by worker threads so far
l = []  # global worklist of discovered listing URLs
class myThread(threading.Thread):
    """Worker thread: repeatedly claims the next URL from the shared
    worklist `l` and crawls it via geturls()."""

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name  # label used only in progress printouts

    def run(self):
        global num, l
        while True:
            # -- synchronized section begin --
            mylock.acquire()
            num += 1
            if len(l) == 0:
                # Worklist empty: seed it by crawling the start page
                # (URL-encoded GB2312 for the city name) while still
                # holding the lock.
                curl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
                print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num)
                geturls(curl)
            # NOTE(review): if seeding found no links, or num has outrun
            # len(l), this index raises IndexError -- confirm intended.
            curl = l[num - 1]
            print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num + 1)
            mylock.release()
            # -- synchronized section end --
            # Crawl outside the lock so other threads can claim URLs;
            # NOTE(review): geturls() mutates `l` here without the lock.
            geturls(curl)
            # Stop once every known URL has been claimed. NOTE(review):
            # `num` and `len(l)` are read here without holding mylock.
            if num >= len(l):
                #mylock.release()
                #print 'len(l):'+ str(len(self.web.l))
                print "\n\nl:\n"
                #for i in l:
                #print i
                break
def test():
    """Spawn and start five worker threads named A1 through A5."""
    for idx in range(1, 6):
        worker = myThread('A' + str(idx))
        worker.start()
# Script entry point: kick off the crawl with five worker threads.
if __name__ == '__main__':
    test()