#-*-coding:utf-8-*-
# Biquge (笔趣阁) novel chapter scraper
import requests
from lxml import etree
def url_processing(url, timeout=10):
    """Fetch the page at *url* and return its HTML text.

    The page is requested exactly once, with a browser-like User-Agent.
    Any status outside the 2xx range is treated as a failure.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text on success, or an empty list when the
        server answers with a non-2xx status (the failure value callers of
        this function already receive).

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
    # One request only: the status check and the body come from the same
    # response, so the check describes the page that is actually returned.
    res = requests.get(url=url, headers=headers, timeout=timeout)
    if not 200 <= res.status_code < 300:
        print('网址输入错误请重新输入,返回的状态码为%s' % (res.status_code))
        return []
    print('正在打开', url)
    return res.text
def extract(html):
    """Return the href of every ``<dd><a>`` link found in *html*.

    Args:
        html: HTML text of a page, or an empty value (``''``, ``[]``,
            ``None``) when the download failed.

    Returns:
        A list of href attribute values; empty when *html* is empty or the
        parser produces no document tree.
    """
    # url_processing returns [] on failure; there is nothing to parse then.
    if not html:
        return []
    tree = etree.HTML(html)
    if tree is None:
        return []
    return tree.xpath('//dd/a/@href')
def urls_cl(urs, skip=9, timeout=10):
    """Download each chapter linked in *urs* into its own text file.

    Every link after the first *skip* entries is fetched. The chapter title
    and all content paragraphs are written to ``第<n>章.txt`` in the current
    directory, where ``n`` counts the downloaded links starting at 1. Files
    are opened in write mode, so re-running replaces a chapter file rather
    than appending a second copy to it.

    Args:
        urs: Sequence of chapter page URLs.
        skip: Number of leading links to ignore (default 9, the offset the
            script has always used).
        timeout: Seconds to wait for each chapter page.

    Returns:
        The status string '爬取全本完成' once all links have been processed.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    # Built once; the headers are the same for every chapter request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.15 Safari/537.36'}
    for number, chapter_url in enumerate(urs[skip:], start=1):
        print('开始爬取第', str(number), '章')
        res = requests.get(url=chapter_url, headers=headers, timeout=timeout)
        tr = etree.HTML(res.text)
        if tr is None:
            titles, paragraphs = [], []
        else:
            titles = tr.xpath('//div[@class="bookname"]/h1/text()')  # title
            paragraphs = tr.xpath('//div[@id="content"]/p/text()')  # body
        if not titles or not paragraphs:
            # A page without the expected markup is skipped instead of
            # aborting the whole download with an IndexError.
            print('第', str(number), '章解析失败,已跳过')
            continue
        # Keep every paragraph, not just the first one.
        text = titles[0] + '\n' + '\n'.join(paragraphs)
        filename = '第' + str(number) + '章.txt'
        with open(filename, 'w', encoding='utf-8') as fp:
            fp.write(text)
        print('第', str(number), '章爬取完成')
    return '爬取全本完成'
if __name__ == '__main__':
    # Script entry point: ask for a book id, open that book's page on the
    # site, collect its chapter links and download them.
    base_url = 'https://www.biquge5200.cc/'
    book_id = input('请输入书号')  # e.g. 0_844
    page_html = url_processing(base_url + book_id)
    chapter_links = extract(page_html)
    urls_cl(chapter_links)