我以前喜欢看小说,玄幻,武侠,修真是我的最爱,刚学python不久,出于我对小说的热爱,我写了个脚本用来下载我经常看的笔趣阁网站首页所有的小说
首先得到网站首页的html,分析首页中有多少小说,循环得到小说的html,提取出小说名字,创建小说名字为名的文本,将每一章节的章节名和内容提取出来写入到文本中,循环直到最后一个章节,然后开始下一本小说
我这里是先把html下载到本地G:\url\中,然后读取的,其实直接打开url也可以。之前运行时有时候会卡在某个地方,得不到某个网页,我以为是缓存的问题,其实不是。解决方法是我设置了五秒钟的超时和异常处理:如果五秒钟内得不到这一章节的页面,那么就跳过它,继续下载下一章。
# -*- coding: utf-8 -*-
# -------------------------------------------
# 下载http://www.biquge.la笔趣阁首页上显示的所有小说
# 下载的小说存放在G:\txt文件夹下
# -------------------------------------------
# 2014/8/23
# wyp
# -------------------------------------------
import re
import urllib
import os
import socket
def getHtml(url):
    """Download *url* into G:\\url\\<flattened-name> and return that local path.

    The local file name is the url with every '/' turned into '.' and the
    leading 'http:..' stripped.  Download failures (including the 5-second
    timeout) are deliberately swallowed: the caller opens the returned path
    and skips the chapter when the file is missing.
    """
    # Flatten the url into a single file-name component: '/' -> '.'
    dotted = url.replace('/', '.')
    print(dotted)
    # Keep everything after the 'http:..' prefix as the file name.
    name = re.findall(r'http:\.\.(.*)', dotted)
    urlpathname = r'G:\url' + '\\' + name[0]
    print('urlpathname = ' + urlpathname)
    try:
        # Give up after five seconds so one dead page cannot hang the crawl.
        socket.setdefaulttimeout(5.0)
        urllib.urlretrieve(url, urlpathname)
    except Exception:
        # Best effort by design: a failed download just leaves no local file.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        pass
    print('getHtml ---------------over')
    return urlpathname
def getBook(html):
    """Return every book id appearing in links of the form /book/<id>/."""
    pattern = re.compile(r'<a href="/book/(.*?)/')
    return pattern.findall(html)
def getName(html):
    """Return the text of every <h1> element (book title or chapter title)."""
    heading = re.compile(r'<h1>(.*?)</h1>')
    return heading.findall(html)
def getZhangJie(html):
    """Return (relative-url, title) pairs for every chapter link in a <dd>."""
    chapter_link = re.compile(r'<dd><a href="(.*?)">(.*?)</a>')
    return chapter_link.findall(html)
def getContent(html):
    """Return the chapter body found inside <div id="content">...</div>.

    Compiled with re.S so '.' also matches newlines: a real chapter body
    spans many lines, and without the flag findall returned [] for every
    chapter, which made the caller skip all of them.
    """
    body = re.compile(r'<div id="content">(.*?)</div>', re.S)
    return body.findall(html)
if __name__ == "__main__":
url = raw_input("please input url: ")
urlpathname = getHtml(url)
print urlpathname
f1 = open(urlpathname, 'rb+')
html = f1.read()
print html
Book = getBook(html)
#去重保持元素顺序
book = list(set(Book))
book.sort(key=Book.index)
for b in book:
realurl = url + '/book/' + b + '/'
print realurl
realurlname = getHtml(realurl)
print realurlname
f2 = open(realurlname, 'rb+')
realhtml = f2.read()
BookName = getName(realhtml)
filepath = os.path.join(r"G:\txt", BookName[0])
filename = filepath + '.txt'
print filename
if os.path.exists(filename):
continue
fd = open(filename, 'w+')
zhangjie = getZhangJie(realhtml)
for zj in zhangjie:
sonurl = realurl + zj[0]
print "url = %s" % sonurl
try:
sonurlname = getHtml(sonurl)
print '-----'+sonurlname
except:
continue
try:
f3 = open(sonurlname, 'rb+')
except IOError:
continue
sonhtml = f3.read()
zhangjieming = getName(sonhtml)
if len(zhangjieming) == 0:
continue
fd.write('\t\t\t\t\t' + zhangjieming[0] + '\r\n')
print "downding " + zhangjieming[0]
fd.write('\r\n')
fd.flush()
try:
content = getContent(sonhtml)
except:
pass
if len(content) == 0:
continue
c1 = content[0].replace('<br />', '')
c2 = c1.replace(' ', ' ')
fd.write(c2)
fd.write('\r\n\r\n\r\n\r\n')
fd.flush()
fd.close()
f2.close()
f3.close()