python应用——爬取起点小说
小编最近找到了爬取小说内容的办法,思来想去,照着输入之后(本来想爬取一个小说的内容),却发现爬取不到章链接和章名。
无奈。。。
import requests
from lxml import etree
import os
# Define a spider class (keeps the code organized).
class Spider(object):
    """Crawl qidian.com: book listing -> per-book chapter list -> chapter text on disk.

    index_request() is the entry point; it creates one directory per book and
    saves each chapter as ``<book>/<chapter>.txt``.
    """

    def index_request(self):
        """Fetch the "all books" listing, extract book titles/links, create folders.

        NOTE(review): no User-Agent header is sent, so qidian may serve a page
        whose markup does not match the XPath below — a likely reason the
        original author got empty chapter lists; confirm against the live site.
        """
        response = requests.get("https://www.qidian.com/all?")
        # Parse the raw HTML into an element tree for XPath queries.
        html = etree.HTML(response.text)
        # Plain ASCII quotes are required: the blog paste had curly quotes
        # (“ ” / ’ ‘), which are a SyntaxError in Python source.
        big_src = html.xpath('//li[@data-rid="1"]/div[@class="book-mid-info"]/h4/a/@href')
        big_tit = html.xpath('//li[@data-rid="1"]/div[@class="book-mid-info"]/h4/a/text()')
        for bigtit, bigsrc in zip(big_tit, big_src):
            # Idiomatic existence check (was "== False"); mkdir only once per book.
            if not os.path.exists(bigtit):
                os.mkdir(bigtit)
            self.detail_request(bigtit, bigsrc)

    def detail_request(self, Bigtit, Bigsrc):
        """Fetch one book's catalog page and extract chapter titles and links.

        Bigtit: book title (also the directory name created by index_request).
        Bigsrc: protocol-relative book URL ("//book.qidian.com/..."), hence the
                "https:" prefix below — TODO confirm hrefs are protocol-relative.
        """
        response = requests.get("https:" + Bigsrc)
        html = etree.HTML(response.text)
        littit_list = html.xpath('//div[@class="volume"]/ul/li/a/text()')
        litsrc_list = html.xpath('//div[@class="volume"]/ul/li/a/@href')
        print(littit_list, litsrc_list)
        for littit, litsrc in zip(littit_list, litsrc_list):
            self.content_request(littit, litsrc, Bigtit)

    def content_request(self, littit, litsrc, Bigtit):
        """Fetch one chapter page, extract its paragraphs, and save them as text.

        littit: chapter title (used as the file name).
        litsrc: protocol-relative chapter URL.
        Bigtit: book title / target directory.
        """
        response = requests.get("https:" + litsrc)
        html = etree.HTML(response.text)
        content = "\n".join(html.xpath('//div[@class="read-content j_readContent"]/p/text()'))
        # os.path.join replaces the original broken 'Bigtit + "\" + ...' (an
        # unterminated string) and is portable across OSes.
        filename = os.path.join(Bigtit, littit + ".txt")
        print("正在保存文件 %s" % filename)
        # Explicit UTF-8 so Chinese text saves correctly regardless of the
        # platform's default locale encoding.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
最后的代码实现:
# Script entry point: run the crawl only when executed directly,
# not as a side effect of importing this module.
if __name__ == "__main__":
    spider = Spider()
    spider.index_request()
但是小编在输入之后,却没有访问到章链接和章名,怎么办呢?
哭哭了。。。。