起点中文网爬虫实战:requests库以及xpath的应用
知识梳理:
本次爬虫是一次简单的复习应用,需要用到requests库以及xpath.
在开始爬虫之前,首先需要导入这两个库
import requests
from lxml import etree
首先制定爬虫框架
import requests
from lxml import etree
# Request headers: a desktop Chrome User-Agent so the site serves the normal
# page instead of rejecting the default python-requests UA string.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
def parse_url(url):  # fetch the chapter-list page
    pass

def parse_list(response):  # parse the chapter list and collect each chapter's URL
    pass

def parse_detail(url):  # fetch one chapter page and extract its title and text
    pass

def save_text(ls,title,base_title):  # save the final result to a .txt file
    pass
def main():
    # Entry point: ask for the novel's index-page URL, collect every
    # chapter URL from it, then download and save each chapter in turn.
    base_url = input("请输入小说首页的IP地址:")
    response = parse_url(base_url)
    detail_urls = parse_list(response)
    base_title = input("请输入小说名称:")
    for detail_url in detail_urls:
        text_ls,title = parse_detail(detail_url)
        save_text(text_ls,title,base_title)

if __name__ == '__main__':
    main()
页面内容分析
因为我要爬取一部小说的全部章节的内容,首先随便打开一部小说,进入到主要章节目录,用Chrome浏览器的网页分析查看网页结构内容,
分析之后可以发现,所有的章节URL都在 li 标签之下的 a 标签的 href 属性里面,并且所有的 li 标签又都在class="cf"的 ul 标签下面,搜索发现,整个网页只有一个这样的 ul 标签,接下来可以开始处理章节列表内容
def parse_url(url):
    """Fetch *url* and return the raw response body as bytes."""
    response = requests.get(url,headers=headers)
    return response.content
def parse_list(response):
    """Parse the chapter-list HTML and return a list of absolute chapter URLs."""
    ls = []
    html = etree.HTML(response)
    # Every chapter link is an <a> inside a <li> of the single <ul class="cf">.
    lis = html.xpath('//ul[@class="cf"]/li')
    for li in lis:
        href = li.xpath('.//a/@href')[0]
        # hrefs are protocol-relative ("//read.qidian.com/..."), so add the scheme.
        href = "https:"+href
        ls.append(href)
    return ls
首先分析详情章节页面的title,也就是章节的标题,到时候可以拿下来作为文件存取时候的章节区分,分析页面可以看到
章节标题位于class="content-wrap"的 span 标签下面,再次分析详细内容,发现详细内容位于class="read-content j_readContent"的div标签下的p标签下,分析完成之后可以编写接下来的过程了。
def parse_detail(url):
    """Fetch one chapter page; return (list of paragraph strings, chapter title)."""
    ls = []
    response = requests.get(url,headers=headers).content
    html = etree.HTML(response)
    # Paragraphs live under <div class="read-content j_readContent">,
    # the chapter title under <span class="content-wrap">.
    texts = html.xpath('//div[@class="read-content j_readContent"]//p/text()')
    title = html.xpath('//span[@class="content-wrap"]/text()')[0]
    for text in texts:
        # Normalize each paragraph: strip surrounding whitespace, end with '\n'.
        text = text.strip()+'\n'
        ls.append(text)
    return ls,title
最后是文件保存,
def save_text(ls,title,base_title):
    """Write one chapter (title, blank line, paragraphs) to "<base_title><title>.txt"."""
    file_name = base_title+title+".txt"
    with open(file_name,'w',encoding='utf-8') as fp:
        fp.write(title+'\n\n')
        for i in ls:
            fp.write(i)
    print("file {} complete".format(file_name))
编写完成之后尝试运行,
效果还是可以的,不过仍有很大的改进空间,随着进一步学习可以把它做得更好。完整代码如下:
import requests
from lxml import etree
import time
# https://read.qidian.com/chapter/_O7kFXJAAns8kjk6dUsm_A2/Y_96UVhhOa6aGfXRMrUjdw2
# https://read.qidian.com/chapter/_O7kFXJAAns8kjk6dUsm_A2/BqHrIuEKr7bM5j8_3RRvhw2
# Request headers: a desktop Chrome User-Agent so the site serves the normal
# page instead of rejecting the default python-requests UA string.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
def parse_url(url):
    """Fetch *url* and return the raw response body as bytes.

    Raises requests.HTTPError on a non-2xx status so a bad chapter URL
    fails loudly instead of silently parsing an error page, and
    requests.Timeout if the server stalls.
    """
    # timeout prevents the crawler from hanging forever on a dead connection.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.content
def parse_list(response):
    """Parse the chapter-list HTML and return a list of absolute chapter URLs.

    Every chapter link is an <a> inside a <li> of the single <ul class="cf">.
    A <li> without a link is skipped instead of raising IndexError.
    """
    html = etree.HTML(response)
    links = []
    for li in html.xpath('//ul[@class="cf"]/li'):
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            # Decorative / malformed list item with no chapter link: skip it
            # (the original indexed [0] unconditionally and would crash here).
            continue
        # hrefs are protocol-relative ("//read.qidian.com/..."): add the scheme.
        links.append("https:" + hrefs[0])
    return links
def parse_detail(url):
    """Fetch one chapter page; return (paragraphs, title).

    paragraphs is a list of whitespace-stripped paragraph strings, each
    terminated with '\n'; title is the chapter heading, or "" when the
    page has no <span class="content-wrap"> (instead of raising IndexError).
    """
    body = requests.get(url, headers=headers, timeout=10).content
    html = etree.HTML(body)
    # Paragraphs live under <div class="read-content j_readContent">,
    # the chapter title under <span class="content-wrap">.
    texts = html.xpath('//div[@class="read-content j_readContent"]//p/text()')
    titles = html.xpath('//span[@class="content-wrap"]/text()')
    title = titles[0] if titles else ""
    paragraphs = [text.strip() + '\n' for text in texts]
    return paragraphs, title
def save_text(ls, title, base_title):
    """Write one chapter to "<base_title><title>.txt" in UTF-8.

    The file contains the chapter title, a blank line, then every string
    in *ls* verbatim (parse_detail already newline-terminates each one).
    Prints a completion message with the file name.
    """
    file_name = base_title + title + ".txt"
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.write(title + '\n\n')
        # writelines batches the paragraph writes instead of one call per line.
        fp.writelines(ls)
    print("file {} complete".format(file_name))
def main():
    """Drive the crawl: read the index-page URL and the novel's name from
    stdin, then download and save every chapter in order."""
    # Fixed prompt: the program needs the index-page URL, not an IP address
    # (the original prompt said "IP地址").
    base_url = input("请输入小说首页的网址:")
    response = parse_url(base_url)
    detail_urls = parse_list(response)
    base_title = input("请输入小说名称:")
    for detail_url in detail_urls:
        text_ls, title = parse_detail(detail_url)
        save_text(text_ls, title, base_title)
        # time.sleep(1)  # uncomment to rate-limit requests and be polite to the server

if __name__ == '__main__':
    main()