import requests
import os
from pyquery import PyQuery as pq
import re
from multiprocessing import Pool
import sys
# Landing page of the novel site; the chapter list is scraped from it.
url = "http://wanmeishijiexiaoshuo.org/"

# HTTP headers sent with every request made by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.99 Safari/537.36"
    ),
    "X-Pingback": "http://wanmeishijiexiaoshuo.org/xmlrpc.php",
}

# `index` is the running progress counter; `numbers` is the chapter total
# used as the denominator when progress is printed.
index, numbers = 1, 2061
def get_html(timeout=30):
    """Yield one ``(title, texturl, numbers)`` tuple per chapter link.

    Downloads the index page at the module-level ``url`` and walks every
    ``.panel ul li`` element on it.

    Args:
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error instead of blocking forever.

    Yields:
        tuple: the list item's text, the ``href`` of its ``<a>`` element,
        and the module-level ``numbers`` total.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode as UTF-8 explicitly, exactly as get_neirong() does for chapter
    # pages; otherwise requests picks the charset from the HTTP headers and
    # the chapter titles (later used as file names) may come out garbled.
    response.encoding = 'utf-8'
    doc = pq(response.text)
    for entry in doc('.panel ul li').items():
        texturl = entry('a').attr('href')
        if not texturl:
            # A list item without a link has nothing to download; passing
            # None on would only make the later requests.get() call fail.
            continue
        yield entry.text(), texturl, numbers
def get_neirong(texturl, timeout=30):
    """Download one chapter page and return its text.

    Args:
        texturl: URL of the chapter page.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error; without it a stalled connection would block
            the whole download run indefinitely.

    Returns:
        The text of the ``.content p`` elements, as produced by PyQuery's
        ``text()``.
    """
    response = requests.get(texturl, headers=headers, timeout=timeout)
    # Force UTF-8 rather than relying on the charset requests infers.
    response.encoding = 'utf-8'
    paragraphs = pq(response.text)('.content p')
    return paragraphs.text()
def save_txt():
    """Save every chapter returned by get_html() as ``完美世界/<title>.txt``.

    For each chapter the text is fetched with get_neirong(), written to its
    own file, and a one-line progress percentage is printed to stdout.
    The module-level ``index`` counter is incremented once per chapter.
    """
    global index
    folder = '完美世界'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)
    # Characters that are not allowed in Windows file names. The raw string
    # with an escaped backslash makes the class really match "\" as well;
    # the previous non-raw '\/' only matched "/".
    unsafe_chars = re.compile(r'[\\/:*?"<>|]')
    for item in get_html():
        title, texturl = item[0], item[1]
        if not texturl:
            # No link to fetch for this entry.
            continue
        fileName = unsafe_chars.sub('-', title)
        filepath = '{0}/{1}.{2}'.format(folder, fileName, 'txt')
        # Replace the no-break space and U+25D1 with plain spaces, as before.
        content = str(get_neirong(texturl)).replace('\xa0', ' ').replace('\u25d1', ' ')
        # Write as UTF-8 explicitly: the locale default encoding (e.g. a
        # legacy code page on Windows) cannot represent every character
        # and is what made writes fail in the first place.
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        # Report progress only after the file is on disk, and scale the
        # ratio by 100 so the number printed before "%" is a percentage.
        sys.stdout.write("已下载:%0.3f%%" % (100.0 * index / numbers) + '\r')
        sys.stdout.flush()
        index += 1
if __name__ == '__main__':
    p = Pool(10)
    # Pass the function object itself. Writing save_txt() here would run
    # the whole download in this process and submit its return value
    # (None) to the pool as the task.
    result = p.apply_async(save_txt)
    p.close()
    p.join()
    # apply_async stores any exception raised in the worker on the result
    # object; get() re-raises it here so failures are not silently lost.
    result.get()
# Author's note: written while still learning Python, so the code is rough; it will be improved as I keep learning.