"""Scrape a newhua.com category listing.

Fetches the listing page at ``url``, collects the pagination links found in
its ``<div class="page">`` block, then parses every page and prints a running
count of the entries for which a known platform name was recognised.
"""

__author__ = "pikyshen"
__date__ = "$2010-10-25$"

import re
import urllib
import string

from HTMLParser2 import *

url = "http://www.newhua.com/sort/151_1.htm"

# Tag / attribute names the parsers look for.
TAG_TITLE_DIV = 'div'
TAG_TITLE_A = 'a'
TAG_TITLE_CLASS = 'class'
TAG_TITLE_PAGE = 'page'
TAG_OPTION = 'option'
TAG_VALUE = 'value'
TAG_H4 = 'h4'

# Each platform is a list of spellings; the first entry is the name reported
# by get_plantform().
symbian = ['Symbian', 'symbian']
android = ['Android', 'android']
winmobile = ['WinMobile', 'winmobile']
windows = ['Win9x', 'WinXP']
PLANTFROM = [symbian, android, winmobile, windows]


def get_plantform(str):
    """Return the platforms mentioned in the text *str*.

    The result holds the first spelling of every entry of ``PLANTFROM`` that
    has at least one spelling occurring in *str*, in ``PLANTFROM`` order.
    Each platform is reported at most once, even when several of its
    spellings match (e.g. "Win9x/WinXP").

    The parameter keeps its historical name ``str`` so keyword callers keep
    working; the builtin is not needed inside this function.
    """
    plantform = []
    for plantform_item in PLANTFROM:
        for name in plantform_item:
            if name in str:
                plantform.append(plantform_item[0])
                # One hit is enough; further spellings would only add
                # duplicates of the same platform.
                break
    return plantform


def get_abspath(url, page):
    """Resolve the link *page* against the absolute URL *url*.

    A *page* starting with '/' is appended to the scheme-and-host part of
    *url*; any other *page* is appended to the directory part of *url*
    (everything before its last '/').  A *url* without a path component is
    treated as the site root.  No '..' normalisation is performed.
    """
    # Skip past "scheme://" so its slashes are never mistaken for path
    # separators; this also makes the function independent of the scheme
    # length (http vs https).
    scheme_end = url.find('://')
    if scheme_end == -1:
        host_start = 0
    else:
        host_start = scheme_end + 3

    if page[0:1] == '/':
        path_start = url.find('/', host_start)
        if path_start == -1:
            # No path at all: the whole url is the root.
            return url + page
        return url[0:path_start] + page

    last_slash = url.rfind('/')
    if last_slash < host_start:
        # The only slashes belong to the scheme (or there are none).
        return url + '/' + page
    return url[0:last_slash] + '/' + page


def GetAbsPath(url, page):
    """Alias of get_abspath(), kept for callers using the old name."""
    return get_abspath(url, page)


def get_path_from_url(url):
    """Return *url* up to and including its last '/', and print it."""
    i = url.rfind('/')
    path = url[0:i + 1]
    # Kept on purpose: this line is part of the script's console output.
    print(path)
    return path


class HtmlParserPageMain(HTMLParser):
    """Collect the entries of one listing page.

    After feed(), ``urls`` holds one node per entry whose ``<h4>`` text named
    a known platform.  A node is a four-item list: index 0 is the href of
    the link found inside ``<div class="title">``, index 1 the text of that
    link, index 2 the unused placeholder 'size', and index 3 the list
    returned by get_plantform().
    """

    def __init__(self):
        self.urls = []          # recorded nodes
        self.tagtitle = False   # inside <div class="title">
        self.taghref = False    # inside an <a href=...> of a title div
        self.node = []          # node currently being filled
        self.url_path = ''
        self.start = False      # a <div class="con705 class-sub"> was seen
        self.start1 = False     # a title link was seen, its <h4> is pending
        self.h4 = False         # inside the <h4> belonging to the node
        self.plantform = []     # platforms found in the last <h4> text chunk
        HTMLParser.__init__(self)

    def set_url(self, url):
        """Remember *url* in ``url_path``."""
        self.url_path = url

    def handle_starttag(self, tag, attrs):
        """Track title divs, title links and the <h4> that follows a link."""
        if tag == TAG_TITLE_DIV:
            for name, value in attrs:
                if name == 'class' and value == 'title':
                    self.tagtitle = True
                    break
                if name == 'class' and value == 'con705 class-sub':
                    self.start = True
                    break

        if self.tagtitle and tag == TAG_TITLE_A:
            for name, value in attrs:
                if name == 'href':
                    # Start a fresh node; the strings are placeholders that
                    # get overwritten as the data arrives.
                    self.node = ['name', 'url', 'size', 'platform']
                    self.node[0] = value
                    self.taghref = True
                    self.start1 = True

        if self.start and self.start1 and tag == TAG_H4:
            self.h4 = True
            # Only the first <h4> after a title link belongs to that link.
            self.start1 = False

    def handle_data(self, data):
        """Store link text, and record the node once a platform is found."""
        if self.taghref:
            self.node[1] = data

        if self.h4:
            self.plantform = get_plantform(data)
            if self.plantform:
                # The text of one <h4> can arrive in several chunks.  Record
                # the node only once and merge the platforms of later chunks
                # instead of appending the same node again.
                if self.urls and self.urls[-1] is self.node:
                    for name in self.plantform:
                        if name not in self.node[3]:
                            self.node[3].append(name)
                else:
                    self.node[3] = list(self.plantform)
                    self.urls.append(self.node)

    def handle_endtag(self, tag):
        """Leave the title div / link / h4 state when its tag closes."""
        if tag == TAG_TITLE_DIV:
            self.tagtitle = False
        if tag == TAG_TITLE_A:
            self.taghref = False
        if tag == TAG_H4:
            self.h4 = False


class HtmlParserGetPages(HTMLParser):
    """Collect pagination URLs from a listing page.

    Every ``value`` attribute of an ``<option>`` seen while inside
    ``<div class="page">`` is appended to ``urls``, prefixed with
    ``url_path``.
    """

    def __init__(self):
        self.urls = []
        self.tagdiv = False     # inside <div class="page">
        self.url_path = ''
        HTMLParser.__init__(self)

    def set_url(self, url):
        """Set the prefix put in front of every collected option value."""
        self.url_path = url

    def handle_starttag(self, tag, attrs):
        """Enter the pagination div and harvest its <option> values."""
        if tag == TAG_TITLE_DIV:
            for name, value in attrs:
                if name == TAG_TITLE_CLASS and value == TAG_TITLE_PAGE:
                    self.tagdiv = True

        if self.tagdiv and tag == TAG_OPTION:
            for name, value in attrs:
                if name == TAG_VALUE:
                    self.urls.append(self.url_path + value)

    def handle_data(self, data):
        """Text content is irrelevant for pagination."""
        return

    def handle_endtag(self, tag):
        """Any closing </div> ends the pagination block."""
        if tag == TAG_TITLE_DIV:
            self.tagdiv = False


def _fetch_page(page_url):
    """Download *page_url* and return its body decoded as UTF-8."""
    response = urllib.urlopen(page_url)
    try:
        return response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()


# Collect the URLs of all result pages; the start page itself is added last.
page = _fetch_page(url)
pages_parser = HtmlParserGetPages()
pages_parser.set_url(get_path_from_url(url))
pages_parser.feed(page)
pages_parser.urls.append(url)
print(pages_parser.urls)

# Parse every result page and print the running total of recorded entries.
count = 0
for page_url in pages_parser.urls:
    page = _fetch_page(page_url)
    parser1 = HtmlParserPageMain()
    parser1.feed(page)
    count = count + len(parser1.urls)
    print(count)
    print(page_url)
    # Debug aid left disabled by the original author:
    #for node in parser1.urls:
    #    urltmp = GetAbsPath(page_url, node[0])
    #    print node[1]
    #    print node[0]
    #    print node[3]
    #    print
    #break