#!/usr/bin/env python3
"""Created on 2013-9-4  @author: sky

Simple breadth-first web crawler: downloads pages starting from a seed
URL, saves each raw page to SAVE_DIR/<n> (numbered in download order),
and follows every absolute http:// link found in <a href="..."> tags.
"""
from html.parser import HTMLParser
from urllib.request import urlopen
import re

# Directory the downloaded pages are written into, one file per page.
SAVE_DIR = '/var/crawler/xinanews/'

# Only absolute http:// links are followed (same filter as the original
# `re.compile('http://(.*)')`).
HTTP_PAT = re.compile(r'http://(.*)')


class Scraper(HTMLParser):
    """HTML parser that collects the href of every closed <a> tag.

    Collected hrefs end up in ``self.links`` in document order, replacing
    the original's global ``urladdress`` dict keyed by a global counter.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []       # hrefs committed when the </a> tag is seen
        self.in_link = False  # True while inside an <a href=...> element
        self.chunks = []      # text fragments of the current link
        self.url = None       # href of the <a> currently open

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a' and 'href' in attrs:
            self.in_link = True
            self.chunks = []
            self.url = attrs['href']

    def handle_data(self, data):
        if self.in_link:
            self.chunks.append(data)

    def handle_endtag(self, tag):
        # Commit the link only on the matching </a>, mirroring the original,
        # which recorded the URL in handle_endtag.
        if tag == 'a' and self.in_link:
            self.links.append(self.url)
            self.in_link = False


def crawl(url, filename):
    """Download *url*, save the raw page text to *filename*, and return
    the list of hrefs found on the page.

    Raises OSError/URLError on network or filesystem failure.
    """
    print('%s:::::::%s' % (filename, url))
    text = urlopen(url).read().decode('utf-8', errors='replace')
    # `with` fixes the file-descriptor leak in the original (open without close).
    with open(filename, 'w') as f:
        f.write(text)
    parser = Scraper()
    parser.feed(text)
    parser.close()
    return parser.links


def main():
    """Breadth-first crawl from the seed URL.

    A `seen` set prevents re-crawling: the original's `while 1` loop
    re-fetched every known URL forever, and its `range(1, i)` skipped the
    last collected link. The original's thread-per-page with an immediate
    join() was fully serial anyway, so the threads are dropped.
    """
    seed = 'http://www.sina.com.cn'
    count = 0           # next file number under SAVE_DIR
    pending = [seed]    # FIFO queue of URLs still to fetch
    seen = {seed}       # URLs already queued or fetched
    while pending:
        url = pending.pop(0)
        filename = SAVE_DIR + str(count)
        count += 1
        try:
            links = crawl(url, filename)
        except OSError as exc:
            # Best effort: log the failure and keep crawling the rest.
            print('failed %s: %s' % (url, exc))
            continue
        for link in links:
            # Follow only absolute http:// links, each at most once.
            if HTTP_PAT.match(link) and link not in seen:
                seen.add(link)
                pending.append(link)


if __name__ == '__main__':
    # Guard keeps import side-effect free; the original started crawling
    # (network + /var writes) at import time.
    main()