Using the download of a txt (plain-text novel) as the example, three approaches are covered:
1. The site's URLs are consecutive: compute each page's URL in a loop and crawl it
2. Crawl from the table-of-contents page
3. Follow the "next chapter" link in a loop
1. Consecutive URLs: compute each page's URL in a loop and crawl it
Determine the range of page URLs to crawl in advance, then concatenate each URL in a loop inside run() and call the SaveText() function to download the page.
Note: if the output file should be emptied before each run, call open('file.txt', 'w').close() inside run().
import requests
from bs4 import BeautifulSoup

class DownloadText(object):
    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
        }

    def get_html(self, start_url):
        response = requests.get(start_url, headers=self.headers)
        html = response.content.decode('utf-8')
        return html

    def SaveText(self, start_url):
        html = self.get_html(start_url)
        soup = BeautifulSoup(html, 'lxml')
        file = open(r'F:\pythonTest\红楼梦-爬虫测试.txt', 'a', encoding='utf-8')
        # opened in 'a' (append) mode because the file is written to repeatedly in the loop
        title = soup.find("title")
        titleText = title.text.strip()
        # strip() removes leading/trailing characters (whitespace and newlines by default)
        file.write(titleText)
        file.write('\n')
        for d in soup.find_all("div", class_="grap"):
            Text = d.text.strip()
            file.write(Text)
        file.close()

    def run(self):  # loop over the pages
        url_base = 'https://hongloumeng.5000yan.com/hlm'
        for x in range(1127, 1247):
            url = url_base + str(x) + '.html'
            self.SaveText(url)

if __name__ == '__main__':
    TextTxt = DownloadText()
    TextTxt.run()
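If the note above is applied (emptying the output file at the start of each run) and a short pause is added between requests, as the later examples do with time.sleep(1), run() could be adapted along these lines. This is only a sketch: the one-second delay is an arbitrary choice, and it assumes "import time" has been added at the top of the script.

    def run(self):  # variant of run() above; assumes "import time" at the top of the file
        open(r'F:\pythonTest\红楼梦-爬虫测试.txt', 'w', encoding='utf-8').close()  # empty the output file first
        url_base = 'https://hongloumeng.5000yan.com/hlm'
        for x in range(1127, 1247):
            self.SaveText(url_base + str(x) + '.html')
            time.sleep(1)  # short pause so the site is not hit too quickly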
2. Crawl from the table-of-contents page
Pass in the URL of the table-of-contents page, extract every chapter's URL from it, and crawl each chapter's content in a loop.
# Crawler exercise 2 - crawl from the table-of-contents page
import requests
import re
from bs4 import BeautifulSoup

class DownloadTxt(object):
    def __init__(self):
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
        }

    def gethtml(self, url):
        response = requests.get(url, headers=self.header)
        html = response.content.decode('utf-8')
        return html

    def txtdownload(self, url):
        html = self.gethtml(url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find('div', class_="pull-left panel-heading-title")
        titleTxt = title.text
        file = open(r'F:\pythonTest\从目录多页爬虫.txt', 'a', encoding='utf-8')
        file.write(titleTxt)
        for d in soup.find_all('div', class_="book-content"):
            # strip the HTML tags and the site watermark from the prettified chapter HTML
            Text = re.sub('<(.*)>|本站网站:www.kuaishuku.net', '', d.prettify())
            file.write(Text)
        file.close()

    def run(self):
        url_base = 'https://www.kuaishuku.net/178798/'
        # url_base is the table-of-contents page URL
        html_con = self.gethtml(url_base)
        soup_con = BeautifulSoup(html_con, 'lxml')
        text_con = soup_con.find(class_="list-group list-charts", id="stylechapter")
        # every chapter URL sits in an <a> tag under the element with
        # class "list-group list-charts" and id "stylechapter"
        url_all = text_con.find_all('a')
        for url_html in url_all:
            url_con = url_base + re.sub('/178798/', '', url_html.get("href"))
            self.txtdownload(url_con)

if __name__ == '__main__':
    TextTxt = DownloadTxt()
    TextTxt.run()
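A small, self-contained illustration of the tag-stripping step above (the HTML snippet is made up). It works because prettify() puts every tag on its own line and the dot in the regex does not match newlines, so the greedy pattern <(.*)> removes whole tag lines while leaving the text lines between them untouched:

import re
from bs4 import BeautifulSoup

sample = '<div class="book-content">第一段<br/>第二段 本站网站:www.kuaishuku.net</div>'
d = BeautifulSoup(sample, 'lxml').find('div', class_="book-content")
print(re.sub('<(.*)>|本站网站:www.kuaishuku.net', '', d.prettify()))
# prints the two text lines, with the tag lines and the watermark removed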
3. Follow the "next chapter" link in a loop
After the first page's URL is passed in, the next-chapter URL is extracted automatically and crawling loops until the last page.
3.1 Example: the pages only have a "下一章" (next chapter) link, which is followed until the last page
# Crawler exercise - crawl by following the "下一章" (next chapter) link
import requests
import time
import re
from bs4 import BeautifulSoup

class TextDownload(object):
    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
            'cookie': 'the_client=2023-03-03_15:46:19@58.251.218.136@ec352388b656372eaa078da9320ac727; __51uvsct__JdBTiaEl1LjhvDDc=1; __51vcke__JdBTiaEl1LjhvDDc=fb533ae2-4b01-57aa-a8e8-b5a7a4f5f6aa; __51vuft__JdBTiaEl1LjhvDDc=1677829505180; Hm_lvt_6f62fd198e73d61caae82815813bc058=1677829505; __vtins__JdBTiaEl1LjhvDDc={"sid": "02ab6bbf-ffcb-5a7d-8c87-92c6cf281c02", "vd": 3, "stt": 73659, "dr": 70566, "expires": 1677831378827, "ct": 1677829578827}; Hm_lpvt_6f62fd198e73d61caae82815813bc058=1677829579'
        }

    def get_html(self, start_url):
        response = requests.get(start_url, headers=self.headers)
        html = response.content.decode('utf-8')
        return html

    def save_text(self, start_url):
        html = self.get_html(start_url)
        soup = BeautifulSoup(html, 'lxml')
        file = open(r'F:\pythonTest\二分之一剧透.txt', 'a', encoding='utf-8')
        title = soup.find('title')
        titleText = title.text.strip()
        print(titleText)
        file.write(titleText)
        file.write('\n')
        for d in soup.find_all("div", class_="book-content"):
            # prettify() pretty-prints the tag; re.sub then strips the remaining tags
            # and the site watermark strings
            Text = re.sub('<br/>|</div>|本站网站:www.kuaishuku.net|<div class="book-content">|快书库_kuaishuku.net', '', d.prettify())
            file.write(Text)
        file.close()

    def run(self):
        url_base = 'https://www.kuaishuku.net/178798/61665209.html'
        url = url_base
        open(r'F:\pythonTest\二分之一剧透.txt', 'w', encoding='utf-8').close()  # empty the output file before the run
        flag = True
        while flag:
            self.save_text(url)
            time.sleep(1)  # pause for one second between requests
            try:
                html = self.get_html(url)
                # the capture group keeps only the href value in front of "下一章"
                next_url = re.findall('href="(.*?)">下一章', html)[0]
                url = 'https://www.kuaishuku.net' + next_url
                print("Next-chapter URL extracted automatically: %s" % url)
            except IndexError:
                # no "下一章" link on the page: the last chapter has been reached
                flag = False

if __name__ == '__main__':
    TextTxt = TextDownload()
    TextTxt.run()
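The loop above stops by letting the [0] index raise an IndexError once no next-chapter link is found. An equivalent sketch that checks the result list explicitly instead of relying on the exception (same pattern and URL prefix as above) would be:

# inside the while loop, in place of the try/except block:
matches = re.findall('href="(.*?)">下一章', html)
if matches:
    url = 'https://www.kuaishuku.net' + matches[0]
else:
    flag = False  # no "下一章" link found: last chapter reached, stop the loop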
3.2 Example: the pages contain both a "下一页" (next page) and a "下一章" (next chapter) link
Code logic: the work is split into two functions, download_title(), which downloads the chapter title, and download_txt(), which downloads the chapter body. run() first checks for a "next page" link and, when one is found, calls download_txt(); when it instead finds a "next chapter" link, it calls download_title() and then download_txt().
import requests
import re
import time
from bs4 import BeautifulSoup

class DownloadTxt(object):
    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
        }

    def get_html(self, url):
        response = requests.get(url, headers=self.headers)
        html = response.content.decode(encoding='utf-8')
        return html

    def get_soup(self, url):
        html = self.get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        return soup

    def download_title(self, url):
        soup = self.get_soup(url)
        title = soup.find('title')
        # strip the site suffix from the <title> text, keeping only the chapter title
        titletxt = re.sub(r'_维持女配的尊严\(淅和\)最新章节-神木小说网', '', title.text.strip())
        print(titletxt)
        title_file = open(r'F:\pythonTest\维持女配的尊严.txt', 'a', encoding='utf-8')
        title_file.write(titletxt)
        title_file.close()

    def download_txt(self, url):
        soup = self.get_soup(url)
        file = open(r'F:\pythonTest\维持女配的尊严.txt', 'a', encoding='utf-8')
        for d in soup.find_all('div', id="booktxt"):
            # strip the HTML tags from the prettified chapter body
            Text = re.sub('<(.*)>', '', d.prettify())
            file.write(Text)
        file.close()

    def run(self):
        url_first = 'https://m.shenmuxsw.cc/show/187200/55252171.html'
        url_base = 'https://m.shenmuxsw.cc'
        url = url_first
        flag = True
        open(r'F:\pythonTest\维持女配的尊严.txt', 'w', encoding='utf-8').close()  # empty the output file before downloading
        # self.download_title(url)
        self.download_txt(url)
        while flag:
            html = self.get_html(url)
            # findall returns only the text matched by the capture group (.*?)
            next_url1 = re.findall('href="(.*?)" rel="next" id="next_url">下一页', html)
            next_url2 = re.findall('href="(.*?)" rel="next" id="next_url">下一章', html)
            time.sleep(1)
            if len(next_url1) > 0:
                # the pattern can match more than one link; take the first one
                url = url_base + next_url1[0]
                print('下一页 (next page): %s' % url)
                self.download_txt(url)
            elif len(next_url2) > 0:
                url = url_base + next_url2[0]
                print('下一章 (next chapter): %s' % url)
                # self.download_title(url)
                # the chapter body already contains the title, so it is not downloaded separately
                self.download_txt(url)
            else:
                flag = False

if __name__ == '__main__':
    Txt = DownloadTxt()
    Txt.run()
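A minimal, runnable illustration of how the two findall patterns each pick out only their own link (the anchor tag below is made up, modelled on the attributes the patterns expect):

import re

html = '<a href="/show/187200/55252171_2.html" rel="next" id="next_url">下一页</a>'
print(re.findall('href="(.*?)" rel="next" id="next_url">下一页', html))  # ['/show/187200/55252171_2.html']
print(re.findall('href="(.*?)" rel="next" id="next_url">下一章', html))  # [] - a next-page link is never mistaken for a next-chapter link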