#get jingdong.com 获取京东电子书
#coding=utf8
import re,urllib
#------------- page fetching and parsing
# Compiled once at import time. re.DOTALL lets '.' match newlines too, so an
# element that spans several lines is still captured as one match.
_LI_RE = re.compile(r'<li(.+?)</li>', re.DOTALL)
_P_RE = re.compile(r'<p>(.+?)</p>', re.DOTALL)


def _fetch_page(address):
    """Download ``address`` and return the response body as a native ``str``.

    On Python 2 the raw byte string returned by ``read()`` is handed back
    unchanged.  On Python 3 the bytes are decoded with the charset announced
    in the response headers; when none is announced (or Python does not know
    it) UTF-8 is used and undecodable bytes are replaced.
    NOTE(review): the UTF-8 fallback is an assumption - confirm the encoding
    the target site really serves.
    """
    # Function-scope import so the helper works on both interpreter lines
    # without touching the file-level ``import urllib``.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(address)
    try:
        body = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()

    if isinstance(body, str):
        # Python 2: bytes and str are the same type, keep the raw bytes.
        return body

    charset = response.headers.get_content_charset() or 'utf-8'
    try:
        return body.decode(charset, 'replace')
    except LookupError:
        # The server announced a charset name Python does not recognise.
        return body.decode('utf-8', 'replace')


def getlist(url):
    """Return the catalogue entries found on the page at ``url``.

    Each entry is the text captured between ``<li`` and the next ``</li>``,
    so it still starts with the tag's attributes and closing ``>``.
    An empty list is returned when the page has no ``<li>`` elements.
    """
    return _LI_RE.findall(_fetch_page(url))


#--------------------- fetch the content of a single article
def getOne(address):
    """Return the text of every ``<p>...</p>`` element on the page at ``address``.

    The result is a list with one string per paragraph, in page order;
    it is empty when the page contains no paragraphs.
    """
    return _P_RE.findall(_fetch_page(address))
#--------- download ebook
def downloadbook(content, bookname='book', base_url=None):
    """Fetch every chapter referenced in ``content`` and save it to ``<bookname>.txt``.

    Args:
        content: iterable of catalogue snippets, as returned by ``getlist``.
        bookname: output path without the ``.txt`` suffix.
        base_url: only snippets containing this string are treated as
            chapter links.  When omitted, the module-level ``url`` is used,
            which is what this function always relied on.

    For each matching snippet the chapter title is appended to the book,
    followed by the paragraphs returned by ``getOne`` and a separator line.
    Snippets that contain ``base_url`` but no link in the expected layout
    are skipped.  The output file is created even when nothing was fetched.
    """
    if base_url is None:
        base_url = url  # catalogue address defined at module level

    # Expected layout: href and title attributes separated by CRLF + spaces.
    link_re = re.compile(r'<a href="(.+?)"\r\n +?title="(.+?)">\r\n')

    book = []
    for entry in content:
        if base_url not in entry:
            continue
        links = link_re.findall(entry)
        if not links:
            # Previously this case crashed with IndexError on ncc[0].
            continue
        href, title = links[0]
        # Single formatted argument prints the same text on Python 2 and 3.
        print("%s %s" % (href, title))
        book.append(title)
        book += getOne(href)
        book.append('\n')

    # ``with`` guarantees the file is closed even if a write fails.
    with open(bookname + ".txt", 'w+') as f:
        for line in book:
            f.write(line + '\n')
#----------- script entry
# Address of the book's table-of-contents page
url = 'http://read.jd.com/4281/'
# The u prefix makes the Chinese book title a unicode literal so that the
# output file name does not come out garbled.
downloadbook(content=getlist(url), bookname=u'日月')
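# NOTE(review): the two lines below appear to be leftover web-page text from
# where this script was copied; kept as comments so the file still parses.
# "Python: fetching txt"
# "Latest recommended article published 2023-09-15 22:30:00"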