import threading,time,random,htmllib,urllib,formatter,string,re
def getPageNum(data):
    """Return the page total that follows the ``pageNum">`` marker in *data*.

    :param data: page markup to search.
    :returns: the digit run of the first match, as a string (no int conversion).
    :raises IndexError: if the marker followed by digits is not present.
    """
    page_counts = re.compile(r'pageNum">共(\d+)').findall(data)
    # Only the first occurrence matters; any later matches are ignored.
    return page_counts[0]
def geturls(data,cururl):
    """Collect the supplier links found in an HTML page.

    *data* is fed through ``GetLinks``; from the hrefs it gathers, this keeps
    (in the parser's order, duplicates included) every one that contains
    ``/suppliers/`` at an offset greater than zero and does not start with
    *cururl*.

    :param data: HTML text of the page.
    :param cururl: URL of the page being processed; hrefs starting with it
        are dropped.
    :returns: list of href strings that pass the filter.
    """
    parser = GetLinks()
    parser.feed(data)
    parser.close()
    # find() > 0 rather than >= 0: an href that *begins* with '/suppliers/'
    # is left out, exactly like an href that does not contain it at all.
    return [
        href for href in parser.links
        if href.find('/suppliers/') > 0 and not href.startswith(cururl)
    ]
class GetLinks(htmllib.HTMLParser):
    """HTML parser that records the href of every anchor with non-blank text.

    After a document has been fed, ``links`` holds the collected href values
    in the order ``anchor_end`` recorded them.
    """

    def __init__(self):
        self.links = []
        # A NullFormatter discards all rendering output; only the anchor
        # callbacks below are of interest.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        """Start capturing the anchor's text and remember its *href*."""
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        """Record the remembered href when both it and the anchor text are non-empty."""
        label = self.save_end().strip()
        if not (self.link and label):
            return
        self.links.append(self.link)
class weburl():
    """Crawl state: the list of discovered URLs plus a cursor into it.

    Attributes (all initialised in ``__init__``):
      l       -- discovered URLs in discovery order; ``getUniqueUrl`` never
                 appends a URL that is already present
      count   -- number of ``getUrl`` calls made so far; while ``l`` is
                 non-empty, ``l[count - 1]`` is what the next call fetches
      cururl  -- URL fetched by the most recent ``getUrl`` call
      data    -- body read from ``cururl``

    NOTE(review): nothing in this class is synchronised, and the visible code
    shares one instance between several threads without taking any lock --
    confirm that this is intended.
    """

    def __init__(self):
        self.l=[]
        self.count=0
        self.cururl=""
        self.data=""

    def getUrl(self):
        """Fetch the next page, log its page count and return its supplier links.

        The seed URL is used while ``l`` is empty, otherwise the entry at
        ``count - 1``; ``count`` is incremented on every call.  One line of
        the form ``<url>@@@@<page count>`` is appended to
        ``datadown/listUrl.txt``, with the page count capped at 100.

        :returns: the list produced by ``geturls`` for the fetched page.
        :raises IndexError: if ``count - 1`` is past the end of ``l``, or if
            ``getPageNum`` finds no page count in the page.
        :raises IOError: if the page cannot be fetched or the log file cannot
            be opened (``open`` does not create the ``datadown`` directory).
        """
        if not self.l:
            self.cururl="http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
        else:
            self.cururl=self.l[self.count-1]
        self.count +=1

        # Close the response explicitly instead of relying on garbage
        # collection, also when read() fails part-way through.
        response = urllib.urlopen(self.cururl)
        try:
            self.data = response.read()
        finally:
            response.close()

        urls=geturls(self.data,self.cururl)
        cursize=getPageNum(self.data)
        if int(cursize)>100:
            cursize=100
        print("cururl:" + str(self.cururl) + "大小" + str(cursize))
        # 'with' guarantees the handle is released even if write() raises.
        with open('datadown/listUrl.txt', 'a+') as output:
            output.write(self.cururl + "@@@@" + str(cursize) + "\n")
        return urls

    def getUniqueUrl(self):
        """Run ``getUrl`` and merge the links it returns into ``l``.

        Links already present in ``l`` are skipped.  Each newly added link is
        also appended, one per line, to ``datadown/historyUrl.txt``; the file
        is not opened at all when the call yields nothing new.
        """
        fresh = []
        for candidate in self.getUrl():
            if candidate not in self.l:
                self.l.append(candidate)
                fresh.append(candidate)
        if fresh:
            # One open per batch instead of one per URL.
            with open('datadown/historyUrl.txt', 'a+') as history:
                history.writelines(url + "\n" for url in fresh)
# Module-level re-entrant lock. Presumably meant to guard the shared crawler
# state, but nothing in the visible code acquires it -- TODO confirm.
mylock = threading.RLock()
# The single crawler instance that test() hands to every worker thread.
web=weburl()
class myThread(threading.Thread):
    """Worker thread that keeps advancing a crawler object until it is exhausted.

    :param name: label stored as ``t_name`` and used in progress output.
    :param web: crawler object; must provide ``getUniqueUrl()``, ``count`` and ``l``.
    """

    def __init__(self, name, web):
        threading.Thread.__init__(self)
        self.t_name = name
        self.web=web

    def run(self):
        """Call ``web.getUniqueUrl()`` repeatedly until ``count`` reaches ``len(l)``.

        Progress is printed after every step.  No lock is taken around the
        crawler, so its state is read and modified unsynchronised.
        """
        crawler = self.web
        while True:
            crawler.getUniqueUrl()
            print('\nThread(%s), Number: %d' % (self.t_name, crawler.count))
            if crawler.count < len(crawler.l):
                # More URLs are pending: give other threads a chance to run,
                # then take the next step.
                time.sleep(0)
                continue
            print('len(l):' + str(len(crawler.l)))
            break
def test():
    """Start seven worker threads, named A1 through A7, on the shared ``web`` crawler.

    Each thread is started as soon as it is created; the threads are not joined.
    """
    for offset in range(7):
        myThread('A%d' % (offset + 1), web).start()
# Start the crawl only when this file is executed as a script, not on import.
if __name__== '__main__':
    test()