# -*- coding: utf-8 -*-
#@Time :2021/10/23 19:33
#@File :爬虫练习_爬取小说剑来
#@Project :
import time
from urllib.parse import urljoin

import parsel
import requests
# Address of a single chapter page of the novel.
url = 'https://www.shuquge.com/txt/8659/2324752.html'
# HTTP headers sent with each request; the User-Agent is a desktop Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.159 Safari/537.36'
    ),
}
# 1. Send the request
# 2. Get the source data
# 3. Parse it and save the data
# 1. Send the request and get the source data
def down_book(url, book_name, p):
    """Download one chapter page and append its text to a local file.

    The page at ``url`` is fetched with the module-level ``headers``, decoded
    as UTF-8 and parsed with ``parsel`` CSS selectors. The chapter title is
    printed, and the chapter body is appended to ``<book_name><p>.txt``.

    Args:
        url: Address of the chapter page to fetch.
        book_name: Prefix of the output file name.
        p: Chapter counter appended to ``book_name`` to form the file name.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
        OSError: If the output file cannot be opened or written.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=30)
    # Decode the body as UTF-8 (set explicitly rather than using
    # response.apparent_encoding).
    response.encoding = 'utf-8'

    # Parse with parsel CSS selectors; an lxml/XPath query over the same
    # markup would be an alternative.
    selector = parsel.Selector(response.text)
    # get() returns the first match (or None); getall() returns every match.
    title_name = selector.css('.reader h1::text').get()
    content_text = selector.css('.showtxt ::text').getall()
    print(title_name)

    # Strip surrounding whitespace from each text node, one node per line.
    chapter_text = '\n'.join(fragment.strip() for fragment in content_text)

    # Append mode: an existing file with the same name is extended, not
    # overwritten. The context manager closes the file even if a write fails.
    with open(f'{book_name}{p}.txt', mode='a', encoding='utf-8') as file:
        file.write('\n')
        file.write(chapter_text)
# Example chapter page addresses; they sit next to the index page:
# https://www.shuquge.com/txt/8659/2324752.html
# https://www.shuquge.com/txt/8659/2324753.html
url = 'https://www.shuquge.com/txt/8659/index.html'
# A timeout keeps a stalled connection from blocking the script forever.
new = requests.get(url=url, headers=headers, timeout=30)
# Decode the index page as UTF-8.
new.encoding = 'utf-8'
new1 = parsel.Selector(new.text)
# Prefix used for every output file name.
book_name = input('bookname:')
# p is the 1-based chapter counter; it stays 0 when no links are found.
p = 0
# href attribute of every chapter link in the table of contents.
xh = new1.css('.listmain dl dd a::attr(href)').getall()
for p, i in enumerate(xh, start=1):
    print(i)
    # Resolve the link against the index URL instead of slicing off a fixed
    # number of characters, so absolute and root-relative hrefs also work.
    url_new = urljoin(url, i)
    down_book(url_new, book_name, p)
    # Pause one second between chapter downloads.
    time.sleep(1)