原本打算练习一下正则表达式，结果看的教程所爬的网站有点老，于是自己写了一个。
# coding=utf-8
import requests
import re
from HTMLParser import HTMLParser
'''创建解析类'''
class PoemParser(HTMLParser):
    """Extract poem records from markup shaped like
    ``<span><a href=URL>TITLE</a>(AUTHOR)</span>``.

    Every completed record is a dict with the keys ``'url'``, ``'title'``
    and ``'author'``; records are appended to ``tangshi_list`` in the order
    they are encountered while feeding.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.tangshi_list = []  # all completed records
        self.in_span = False  # True between <span> and </span>
        self.in_a = False  # True between an <a> opened inside a span and </a>
        self.r = re.compile(r'\((.+)\)')  # captures the text between "(" and ")"
        self.one = {}  # record currently being assembled

    def handle_starttag(self, tag, attrs):
        """Track span/anchor nesting and capture the anchor's ``href``."""
        if tag == 'span':
            self.in_span = True
            # Start from a clean record so a title left over from an earlier
            # span can never be paired with an author found in this one.
            self.one = {}
        if tag == 'a' and self.in_span:
            self.in_a = True
            # First matching attribute wins; None when there is no href.
            href = next((value for name, value in attrs if name == 'href'), None)
            # Single-argument print(...) behaves the same as the Python 2
            # print statement this file otherwise uses.
            print(href)
            self.one['url'] = href

    def handle_endtag(self, tag):
        """Leave the span/anchor state when the matching end tag is seen."""
        if tag == 'span':
            self.in_span = False
        if tag == 'a':
            self.in_a = False

    def handle_data(self, data):
        """Record the title (text inside the link) or the author (text after it).

        Text outside a span is ignored.
        """
        if not self.in_span:
            return
        print(data)
        if self.in_a:
            # <span><a>TITLE</a>...</span>: text inside the link is the title.
            self.one['title'] = data
            return
        # <span><a>...</a>(AUTHOR)</span>: text after the link is the author.
        # Strip first: the pattern is anchored at the start by match(), so
        # leading whitespace/newlines would otherwise hide the parenthesis.
        found = self.r.match(data.strip())
        # Only commit records that already have a title (and therefore a
        # url); a bare "(...)" in some unrelated span must not produce a
        # record with missing keys.
        if found and 'title' in self.one:
            self.one['author'] = found.group(1)
            self.tangshi_list.append(self.one)
            # Fresh dict for the next record; reusing the same object would
            # make every list entry alias the last record.
            self.one = {}
def get_url(url='http://www.gushiwen.org/gushi/tangshi.aspx', timeout=10):
    """Download the poem index page and return the parsed records.

    Args:
        url: Page to fetch; defaults to the Tang-poem index used originally.
        timeout: Seconds passed to ``requests.get``; without a timeout the
            request may block indefinitely.

    Returns:
        The ``tangshi_list`` built by ``PoemParser`` from the response body.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout
            or a non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here instead of parsing an error page as if it were the poem list.
    response.raise_for_status()
    # Build a parser instance and feed it the raw response body.
    parser = PoemParser()
    parser.feed(response.content)
    return parser.tangshi_list
if __name__ == '__main__':
    # Script entry point: fetch and parse the page, print how many records
    # were found, then show the first ten.
    dataList = get_url()
    print(len(dataList))
    # Slice instead of indexing 0..9 so fewer than ten records does not
    # raise IndexError.
    for poem in dataList[:10]:
        print('题目:%(title)s 作者:%(author)s URL:%(url)s' % poem)