# Uses the urllib library and re regular expressions to crawl a novel.
from urllib import request
import re
from urllib.parse import urljoin

# Index page of the novel; every chapter href found on it is resolved
# against this URL.
first_url = "http://www.freexs.org/novel/0/896/"


def _fetch_page(url, encoding='gbk'):
    """Download *url* and return its body decoded with *encoding*.

    The HTTP response is closed as soon as the body has been read, so
    the script does not accumulate one open connection per chapter.
    """
    with request.urlopen(url) as response:
        return response.read().decode(encoding)


html = _fetch_page(first_url)

# re.findall() returns a list; the captured text is everything between
# ``content=`` and the closing ``>`` of the keywords meta tag, so any
# surrounding quotes are part of the captured value.
novel_info = {}
novel_info['title'] = re.findall(r'<meta name="keywords" content=(.*?)>', html)

# The chapter list is the markup captured between the table's opening
# <dl> and the next </div>.
div_info = re.findall(r'<table width="100%"><tr><td><dl>(.*?)</div>', html)
if not div_info:
    # Fail with an explicit message instead of a bare IndexError when
    # the page layout does not match the pattern above.
    raise SystemExit("chapter list not found on %s" % first_url)

tag_a = re.findall(r'<dd><a(.*?)</a></dd>', div_info[0])
for anchor in tag_a:
    hrefs = re.findall(r'href="(.*?)">', anchor)
    if not hrefs:
        # Anchor without a recognisable href: nothing to download.
        continue
    second_url = hrefs[0]
    # urljoin() gives the same result as plain concatenation for
    # relative hrefs and also copes with absolute or root-relative ones.
    url = urljoin(first_url, second_url)
    html2 = _fetch_page(url)
    print(html2)