离线解析百度百科中的“百度百科”,提取各级标题:
#encoding:UTF-8
#_Author_:Ibsen
import urllib2
from sgmllib import SGMLParser
class ListName(SGMLParser):
    """Collect heading text from an HTML page.

    Every chunk of character data found inside an ``<h1>`` element or
    inside a ``<span class="title-text">`` element is appended to
    ``self.name`` in document order.  The file header says this is used
    on a saved Baidu Baike page.

    ``SGMLParser`` dispatches ``<tag>`` / ``</tag>`` to the ``start_tag``
    / ``end_tag`` methods defined below and passes text to
    ``handle_data``.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.is_h1 = False     # True while inside an <h1> element
        self.flag = False      # True while inside a title-text <span>
        self.getdata = False   # never read by this class; kept for callers
        self.name = []         # collected text chunks
        # Number of <span> elements currently open, counted from the
        # title-text span itself, so a nested </span> does not end capture.
        self._span_depth = 0

    def start_h1(self, attrs):
        """Enter an ``<h1>``: start collecting its text."""
        self.is_h1 = True

    def end_h1(self):
        """Leave an ``<h1>``: stop collecting its text."""
        self.is_h1 = False

    def start_span(self, attrs):
        """Start collecting when a ``<span class="title-text">`` opens.

        ``attrs`` is the list of ``(name, value)`` pairs for the tag.
        """
        if self.flag:
            # Already inside a title span: only track the nesting level.
            self._span_depth += 1
            return
        for key, value in attrs:
            if key == 'class' and value == 'title-text':
                self.flag = True
                self._span_depth = 1
                return

    def end_span(self):
        """Stop collecting once the title-text span itself is closed."""
        if not self.flag:
            return
        self._span_depth -= 1
        if self._span_depth <= 0:
            # Also covers ``flag`` having been set from outside the class
            # without a matching depth.
            self._span_depth = 0
            self.flag = False

    def handle_data(self, text):
        """Record ``text`` when it lies inside an h1 or a title span.

        A single test avoids appending the same chunk twice when a title
        span is nested inside an ``<h1>``.
        """
        if self.is_h1 or self.flag:
            self.name.append(text)
# Read the locally saved HTML page into memory.  The handle returned by
# urlopen is closed explicitly instead of being left to garbage collection.
response = urllib2.urlopen('file:///C:/Users/John/Desktop/1.html')
try:
    content = response.read()
finally:
    response.close()
# Alternative that reads the same file without urllib2:
#content=file('C:/Users/John/Desktop/1.html').read()
listname = ListNam