#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape chapters 1-9 of novel 5108 from www.fhxsw.cc into per-chapter .txt files.

The site serves GBK-encoded HTML; each response is decoded explicitly before
parsing with lxml. Output files are written UTF-8 under ./xinjingu/.
"""
import os

import requests
from lxml import etree

if __name__ == "__main__":
    # 1. Index page that lists the chapter links.
    get_url = 'http://www.fhxsw.cc/read/5108.html'
    # 2. Spoof a browser User-Agent so the site serves us normally.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.116 Safari/537.36'
    }
    print('开始第一次抓取过程----------------------------')
    # 3. Fetch the index page.  Pass `headers`: the original defined the
    #    UA dict but never used it, defeating the spoofing.
    get_response = requests.get(url=get_url, headers=headers)
    # 4. The site is GBK-encoded; decode the raw bytes explicitly.
    page_text = get_response.content.decode('gbk')
    tree = etree.HTML(page_text)
    # 5. Ensure the output directory exists before writing chapters.
    if not os.path.exists('./xinjingu'):
        os.mkdir('./xinjingu')
    # 6. Chapters live at dd[1]..dd[9] inside the chapter list (XPath
    #    positions are 1-based), so iterate 1..9 directly instead of the
    #    original `for i in range(9): i += 1` dance.
    for i in range(1, 10):
        name_basic = '//dl[@class="chapterlist"]/dd[%d]/a/text()' % i
        url_basic = '//dl[@class="chapterlist"]/dd[%d]/a/@href' % i
        print('name_basic', name_basic)  # fixed: original printed url_basic here
        print('url_basic', url_basic)
        chap_name = tree.xpath(name_basic)[0]
        chap_url = tree.xpath(url_basic)[0]
        # Fixed labels/escapes: the original used raw strings (literal
        # backslashes) and said "chap_url" for both lines.
        print(type(chap_name), chap_name, "\nchap_name抓取成功!!\n")
        print(type(chap_url), chap_url, "\nchap_url抓取成功!!\n")
        # 7. The index holds relative hrefs; prepend the site root.
        chap_all_url = 'http://www.fhxsw.cc' + str(chap_url)
        print(type(chap_all_url), "内容页面URL生成成功!!\n")
        # 8. Fetch and decode the chapter page, then pull the body text
        #    nodes out of the #BookText container.
        chap_response = requests.get(url=chap_all_url, headers=headers)
        chap_text = chap_response.content.decode('gbk')
        tree2 = etree.HTML(chap_text)
        chap_text_1 = tree2.xpath('//div[@id="BookText"]/text()')
        # 9. Write the chapter to <chapter name>.txt (UTF-8).
        save_filename = chap_name + '.txt'
        with open('./xinjingu/%s' % save_filename, 'w', encoding='utf-8') as file:
            for data in chap_text_1:
                file.write(str(data))
        print('--------------', save_filename, "下载成功-----------")