# 废话不多说,直接上代码,欢迎留言提问。(Not much talk — straight to the code; questions welcome in the comments.)
#在爬虫爬取的过程中,需要判断传入的链接对应的是静态网页还是动态网页,然后选择不同的代码对其进行爬取
#因为静态爬取比较快,所以当网页为静态网页时就不需要调用动态爬取的代码了,
from bs4 import BeautifulSoup,UnicodeDammit
from urllib.request import Request,urlopen
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Ask the user for a novel title, search it on biquge with selenium, and
# resolve the chosen book's detail-page URL into the global `url`.
xiao_shuo_ming = input("请输入小说名:")

chrome_options = Options()
# Uncomment the next two lines to run Chrome headless:
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"')
driver = webdriver.Chrome(options=chrome_options)
# Implicit wait: every element lookup retries for up to 1.5 s before failing.
driver.implicitly_wait(1.5)
driver.get("https://www.biquge.com.cn/")

# Type the title into the search box and click the search button.
shu_ru = driver.find_element_by_xpath('//*[@id="keyword"]')
shu_ru.send_keys(xiao_shuo_ming)
driver.find_element_by_xpath('//*[@id="wrapper"]/div[2]/div[2]/span/input').click()

# Collect the result anchors; their `title` attribute is the book's full name.
result = driver.find_elements_by_xpath('//div[@class="result-item result-game-item"]/div[2]/h3/a')
nu = len(result)
print("搜索得到的小说为:")
for n in range(nu):
    result[n] = result[n].get_attribute('title')
    print(result[n])

xiao_shuo = input("请输入列表中的任意一本书的全名:")
# BUG FIX: the original tested `m == nu` after `for m in range(nu)`, which is
# never true (the loop variable stops at nu-1), so a failed match was silently
# ignored and `url` stayed undefined, crashing later with NameError.
# Use a None sentinel and test it explicitly instead.
url = None
for m in range(nu):
    if xiao_shuo == result[m]:
        # Re-query the anchors here: `result` now holds plain title strings,
        # not WebElements, so the hrefs must be fetched from the live DOM.
        url = driver.find_elements_by_xpath('//div[@class="result-item result-game-item"]/div[2]/h3/a')[m].get_attribute('href')
        break
if url is None:
    # Exit cleanly instead of crashing with NameError further down.
    raise SystemExit('未找到链接!')
print('链接已找到:', url)
driver.implicitly_wait(5)
# Fetch the book's table-of-contents page statically (much faster than
# selenium) and extract every chapter's title and absolute URL.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
request = Request(url, headers=headers)
raw = urlopen(request).read()
# Let UnicodeDammit decide between gbk and utf-8 before parsing.
data = UnicodeDammit(raw, ['gbk', 'utf-8']).unicode_markup
soup = BeautifulSoup(data, 'lxml')
# Each chapter is an <a> inside the #list definition list.
anchors = soup.select('#list > dl > dd > a')
num = len(anchors)
# Chapter titles, in page order.
zhang_jie_name_s = [a.get_text() for a in anchors]
# Absolute chapter links, parallel to zhang_jie_name_s.
zhang_jie_s = ["https://www.biquge.com.cn" + a['href'] for a in anchors]
# At this point every chapter title and its full link have been collected.
#以下开始定义下载函数,将每一个章节的链接传进去下载
#以下开始定义下载函数,将每一个章节的链接传进去下载
def download(url):
    """Fetch one chapter page at *url* and append its text to the global
    open file handle ``f``.

    The chapter body lives in the page's ``#content`` element; leading
    whitespace padding is stripped before writing.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    }
    req = Request(url, headers=headers)
    data = urlopen(req).read()
    # FIX: decode via UnicodeDammit (gbk or utf-8) exactly like the TOC fetch
    # above, instead of a hard-coded strict data.decode() that raised
    # UnicodeDecodeError on gbk-encoded chapter pages.
    # (Also removed the unused `global k` declaration.)
    data = UnicodeDammit(data, ['gbk', 'utf-8']).unicode_markup
    soup = BeautifulSoup(data, 'lxml')
    for wen_ben_tag in soup.select('#content'):
        wen_ben = wen_ben_tag.get_text()
        # Strip the whitespace padding the site uses for paragraph indents.
        wen_ben = wen_ben.replace(' ', '')
        f.write(wen_ben)
# Download every chapter into its own UTF-8 text file, one file per chapter.
for j in range(num):
    # FIX: `with` guarantees the file is closed even if download() raises;
    # the original `open()/f.close()` pair leaked the handle on any exception.
    # At module level `as f` binds the global `f` that download() writes to.
    with open('E:\\Desktop\\武逆\\{}.txt'.format(zhang_jie_name_s[j]), 'w+', encoding='utf-8') as f:
        print('开始下载{}!'.format(zhang_jie_name_s[j]))
        download(zhang_jie_s[j])
    print('{}下载完毕'.format(zhang_jie_name_s[j]))