爬取小说

使用 urllib 库和 re 正则表达式实现小说爬取功能。

import re
from urllib import request
from urllib.parse import urljoin

# Index page of the novel; chapter links found on it are resolved against it.
first_url = "http://www.freexs.org/novel/0/896/"

# Use the response as a context manager so the connection is released as soon
# as the body has been read. The body is decoded as GBK.
with request.urlopen(first_url) as response:
    html = response.read().decode('gbk')

novel_info = {}
# re.findall returns a list of all matches, so 'title' holds a list of strings
# (the raw value of the "keywords" meta tag), not a single string.
novel_info['title'] = re.findall(r'<meta name="keywords" content=(.*?)>', html)

# Capture the markup between the chapter table's opening tags and the next
# </div>; the chapter anchors are extracted from the first such match.
div_info = re.findall(r'<table width="100%"><tr><td><dl>(.*?)</div>', html)
if not div_info:
    # Fail with an explanatory message instead of a bare IndexError when the
    # page layout does not match the pattern above.
    raise SystemExit("chapter table not found at %s" % first_url)

tag_a = re.findall(r'<dd><a(.*?)</a></dd>', div_info[0])
for anchor in tag_a:
    hrefs = re.findall(r'href="(.*?)">', anchor)
    if not hrefs:
        # Anchor without a matching href attribute: nothing to download.
        continue
    second_url = hrefs[0]
    # urljoin gives the same result as plain concatenation for relative
    # hrefs, and additionally handles absolute or root-relative links.
    url = urljoin(first_url, second_url)
    with request.urlopen(url) as response:
        html2 = response.read().decode('gbk')
    print(html2)

阅读更多
换一批

没有更多推荐了,返回首页