参考了如下文档:
https://blog.csdn.net/hubing_hust/article/details/128295216
导入模块
import time
from selenium import webdriver
import os
from selenium.webdriver.common.by import By
获取网页源码
# Launch Firefox and open the first chapter page.
driver = webdriver.Firefox()
# Implicit wait: element lookups keep polling for up to 5 seconds.
driver.implicitly_wait(5)
url = "http://m.560xs.com/lianqishiwannian/3685788.html"
driver.get(url)
# Poll once per second until the browser reports the document as loaded.
while driver.execute_script("return document.readyState") != "complete":
    time.sleep(1)
# Full HTML of the loaded page; the parsing steps below work on this string.
text = driver.page_source
源码解析
HTML 格式的网页源码可以用多种解析引擎来解析,例如下面使用 lxml 的 XPath:
from lxml import etree

# Parse the page source captured from the browser (`text`) into an element tree.
html = etree.HTML(text)
# xpath() returns a list of matches; take the first one, and fall back to an
# empty string so that a missing heading does not raise IndexError.
name_nodes = html.xpath('/html/body/div[1]/div[3]/div[1]/h1/text()')
novel_name = name_nodes[0] if name_nodes else ""
# List of the href attributes of the links matched inside the <ul>.
novel_directory = html.xpath('/html/body/div[1]/div[4]/ul/li[*]/a/@href')
# Visiting pages too quickly tends to cause errors, so pause before continuing.
time.sleep(5)
我只是想下载小说,不想搞得太麻烦,所以直接用字符串分割网页源码:用 split 把源码分割成列表,再按下标取出需要的部分。
# Chapter title: the text between <h1 class="title"> and the following </h1>.
TempList1 = text.split("<h1 class=\"title\">")
if len(TempList1) > 1:
    TempList2 = TempList1[1].split("</h1>")
    if len(TempList2) > 1:
        title = TempList2[0]
        # print(title)

# Chapter body: the text between the first and the second </div> that follow
# the opening tag of the content <div>, with <br> used as the line separator.
TempList1 = text.split("<div class=\"content\" id=\"content\"")
if len(TempList1) > 1:
    TempList2 = TempList1[1].split("</div>")
    # Same bounds check as the complete script: only index piece [1] when
    # both closing tags were actually found.
    if len(TempList2) > 2:
        content = TempList2[1].split("<br>")
        for line in content:
            stripped = line.strip()
            # Skip pieces that contain only whitespace.
            if stripped:
                # `file` must already be open for writing (see the save step).
                file.write(stripped + "\n")
获取内容保存
# Write the text to the output file; the with-block closes the file even if
# the write raises.
with open(TxtFilePath, 'w', encoding='utf-8') as file:
    file.write(line.strip() + "\n")
章节地址计算
获取小说的下一个章节时我取了个巧:直接点击网页上的“下一页”或“下一章”链接。因为该网站把一章小说分成几个网页输出,每个网页的链接不好直接计算。
# Locate the next-page / next-chapter link by its absolute XPath, then click it.
next_link = driver.find_element(By.XPATH, '/html/body/div[4]/div/div/div[3]/div[1]/a[3]')
next_link.click()
效果
我把网页源码和小说的内容都保存了,效果如下
完整源码
import time
from selenium import webdriver
import os
from selenium.webdriver.common.by import By
# Directory that receives the raw HTML of every visited page.
HtmlSaveDir = r"d:\test\html"
# Directory that receives the extracted chapter text.
TxtSaveDir = r"d:\test\txt"
# First page to download; later pages are reached by clicking the next link.
url = "http://m.560xs.com/lianqishiwannian/3685788.html"

# Absolute XPath of the "next page / next chapter" link on a chapter page.
NEXT_LINK_XPATH = '/html/body/div[4]/div/div/div[3]/div[1]/a[3]'
# Maps every character that Windows forbids in file names to an underscore.
_FILENAME_TABLE = str.maketrans({char: "_" for char in '\\/:*?"<>|'})


def extract_title(html):
    """Return the text inside the first ``<h1 class="title">`` element.

    Returns an empty string when the opening or the closing tag is missing.
    """
    _, opening, rest = html.partition('<h1 class="title">')
    if not opening:
        return ""
    title, closing, _ = rest.partition("</h1>")
    return title if closing else ""


def extract_content_lines(html):
    """Return the non-empty, stripped text lines of the chapter body.

    The body is the text between the first and the second ``</div>`` that
    follow the opening tag of the content ``<div>``; lines are separated by
    ``<br>``. Returns an empty list when that structure is not found.
    """
    _, marker, rest = html.partition('<div class="content" id="content"')
    if not marker:
        return []
    # maxsplit=2 yields three pieces exactly when two closing tags exist.
    pieces = rest.split("</div>", 2)
    if len(pieces) < 3:
        return []
    stripped = (part.strip() for part in pieces[1].split("<br>"))
    return [part for part in stripped if part]


def build_txt_name(title, html_name):
    """Return the text file name ``<title>-<html_name>.txt``.

    Characters that are invalid in Windows file names are replaced with
    underscores. When the title is empty, the name is ``<html_name>.txt``.
    """
    safe_title = title.strip().translate(_FILENAME_TABLE)
    if not safe_title:
        return f"{html_name}.txt"
    return f"{safe_title}-{html_name}.txt"


def save_page(page_source, page_url, html_dir=HtmlSaveDir, txt_dir=TxtSaveDir):
    """Save the raw HTML and the extracted chapter text of one page.

    The HTML file is named after the last path segment of ``page_url``.
    Returns the name of the text file that was written.
    """
    html_file = page_url.split("/")[-1]
    html_name = html_file.split(".")[0]
    with open(os.path.join(html_dir, html_file), "w", encoding="utf-8") as fh:
        fh.write(page_source)

    txt_file = build_txt_name(extract_title(page_source), html_name)
    with open(os.path.join(txt_dir, txt_file), "w", encoding="utf-8") as fh:
        fh.writelines(line + "\n" for line in extract_content_lines(page_source))
    return txt_file


def wait_until_loaded(driver):
    """Block until the browser reports ``document.readyState == "complete"``."""
    while driver.execute_script("return document.readyState") != "complete":
        time.sleep(1)


def main():
    """Download page after page, following the next link until it runs out."""
    # Imported here because only this entry point needs it.
    from selenium.common.exceptions import NoSuchElementException

    os.makedirs(HtmlSaveDir, exist_ok=True)
    os.makedirs(TxtSaveDir, exist_ok=True)

    driver = webdriver.Firefox()
    try:
        driver.implicitly_wait(5)
        driver.get(url)
        wait_until_loaded(driver)
        # Shorter implicit wait from here on, so a missing next link is
        # detected quickly.
        driver.implicitly_wait(1)
        while True:
            current_url = driver.current_url
            txt_file = save_page(driver.page_source, current_url)
            print(txt_file, current_url)
            time.sleep(1)
            try:
                driver.find_element(By.XPATH, NEXT_LINK_XPATH).click()
            except NoSuchElementException:
                # No next link on this page: nothing left to download.
                break
            wait_until_loaded(driver)
            # Extra pause so the site is not hit too quickly.
            time.sleep(3)
            if driver.current_url == current_url:
                # The click did not navigate anywhere; stop instead of
                # saving the same page forever.
                break
    finally:
        # Always shut the browser down, even when a page fails.
        driver.quit()


if __name__ == "__main__":
    main()