Python code for crawling multi-page websites

Taking downloading a novel as a .txt file as an example, three methods are covered:

  1. The page URLs are sequential: compute each URL in a loop

  2. Crawl from a table-of-contents page

  3. Crawl by following the "next chapter" link in a loop

  1. The page URLs are sequential: compute each URL in a loop

First determine the range of page URLs to crawl; in run(), build each URL inside the loop and call the SaveText function to download that page.

Note: if the output file should be cleared before each run, add open('file.txt', 'w').close() to run().
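As a minimal sketch of the URL pattern used in run() below (same url_base and chapter range as the full script; note that the end of range() is exclusive):

url_base = 'https://hongloumeng.5000yan.com/hlm'
urls = [url_base + str(x) + '.html' for x in range(1127, 1247)]   # range end is exclusive
print(urls[0])    # https://hongloumeng.5000yan.com/hlm1127.html
print(urls[-1])   # https://hongloumeng.5000yan.com/hlm1246.html

The full script: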

import requests
from bs4 import BeautifulSoup
class DownloadText(object):
    def __init__(self):
        self.headers = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
        }
    def get_html(self,start_url):
        url =start_url
        response = requests.get(start_url,headers = self.headers)
        html = response.content.decode('utf-8')
        return html
    def SaveText(self,start_url):
        html = self.get_html(start_url)
        soup = BeautifulSoup(html,'lxml')
        file = open(r'F:\pythonTest\红楼梦-爬虫测试.txt','a',encoding = 'utf-8')
        # the file is written to repeatedly inside the loop, so it is opened in append ('a') mode
        title = soup.find("title")
        titleText = title.text.strip()
        # strip() removes leading/trailing whitespace (or the given characters) from the string
        file.write(titleText)
        file.write('\n')
        for d in soup.find_all("div",class_="grap"):
            Text = d.text.strip()
            #print(Text)
            file.write(Text)
            #file.write('\n')
        file.close()
        
    def run(self):   # loop over all the pages
        url_base = 'https://hongloumeng.5000yan.com/hlm'
        for x in range(1127,1247):   # range end is exclusive: pages hlm1127.html .. hlm1246.html
            url = url_base + str(x)+'.html'
            self.SaveText(url)

            
if __name__ == '__main__':
    
    TextTxt = DownloadText()
    TextTxt.run()
  2. Crawl from a table-of-contents page

Pass in the URL of the table-of-contents page, extract the URL of each chapter from it, and crawl every chapter in a loop.

# Crawler exercise 2: crawl based on the table-of-contents page
import requests
import re
import time
from bs4 import BeautifulSoup

class DownloadTxt(object):
    def __init__(self):
        self.header = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
        }
    def gethtml(self,url):
        response = requests.get(url,headers = self.header)
        html = response.content.decode('utf-8')
        return html
    def txtdownload(self,url):
        html = self.gethtml(url)
        soup = BeautifulSoup(html,'lxml')
        title = soup.find('div',  class_="pull-left panel-heading-title")
        titleTxt = title.text
        file = open(r'F:\pythonTest\从目录多页爬虫.txt','a',encoding ='utf-8')
        file.write(titleTxt)
        for d in soup.find_all('div',class_ = "book-content"):
            # strip HTML tags and the site's watermark text from the prettified markup
            Text = re.sub('<(.*)>|本站网站:www.kuaishuku.net','',d.prettify())
            file.write(Text)
        file.close()
    def run(self):
        url_base = 'https://www.kuaishuku.net/178798/'
        # url_base is the URL of the table-of-contents page
        html_con = self.gethtml(url_base)
        soup_con = BeautifulSoup(html_con,'lxml')
        text_con = soup_con.find(class_="list-group list-charts" ,id="stylechapter")
        # every chapter URL is stored in an <a> tag under the element with class="list-group list-charts" and id="stylechapter"
        url_all = text_con.find_all('a')
        for url_html in url_all:
            url_con = url_base + re.sub('/178798/','',url_html.get("href"))
            self.txtdownload(url_con)
if __name__ == '__main__':
    TextTxt = DownloadTxt()
    TextTxt.run()
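A design note on building the chapter URLs: instead of stripping '/178798/' from each href with re.sub and prepending url_base, the standard library's urllib.parse.urljoin resolves the link whether the href is root-relative or just a file name. A minimal sketch, assuming hrefs of the form '/178798/61665209.html' (which is what the re.sub in run() suggests):

from urllib.parse import urljoin

url_base = 'https://www.kuaishuku.net/178798/'
# both href forms resolve to the same absolute chapter URL
print(urljoin(url_base, '/178798/61665209.html'))   # https://www.kuaishuku.net/178798/61665209.html
print(urljoin(url_base, '61665209.html'))           # https://www.kuaishuku.net/178798/61665209.html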
            
  3. Crawl by following the "next chapter" link in a loop

After the URL of the first page is passed in, the URL of the next chapter is extracted automatically and the loop continues until the last page.

3.1 Example: only the "下一章" (next chapter) link has to be followed until the last page

# Crawler exercise 3: crawl in a loop by following the "下一章" (next chapter) link text
import requests
import time
import re
from bs4 import BeautifulSoup
class TextDownload(object):
    def __init__(self):
        self.headers = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
            'cookie':'the_client=2023-03-03_15:46:19@58.251.218.136@ec352388b656372eaa078da9320ac727; __51uvsct__JdBTiaEl1LjhvDDc=1; __51vcke__JdBTiaEl1LjhvDDc=fb533ae2-4b01-57aa-a8e8-b5a7a4f5f6aa; __51vuft__JdBTiaEl1LjhvDDc=1677829505180; Hm_lvt_6f62fd198e73d61caae82815813bc058=1677829505; __vtins__JdBTiaEl1LjhvDDc={"sid": "02ab6bbf-ffcb-5a7d-8c87-92c6cf281c02", "vd": 3, "stt": 73659, "dr": 70566, "expires": 1677831378827, "ct": 1677829578827}; Hm_lpvt_6f62fd198e73d61caae82815813bc058=1677829579'
        }
    def get_html(self,start_url):
        url = start_url
        response = requests.get(start_url,headers = self.headers)
        #response = requests.get(url,headers = self.headers)
        html = response.content.decode('utf-8')
        return html
    def save_text(self,start_url):
        html = self.get_html(start_url)
        soup = BeautifulSoup(html,'lxml')
        file = open(r'F:\pythonTest\二分之一剧透.txt','a',encoding = 'utf-8')
        title = soup.find('title')
        titleText = title.text.strip()
        print(titleText)
        file.write(titleText)
        file.write('\n')
        for d in soup.find_all("div", class_="book-content"):
            # d.prettify() re-serializes the tag as indented markup before the tags are stripped below
            Text = re.sub('<br/>|</div>|本站网站:www.kuaishuku.net|<div class="book-content">|快书库_kuaishuku.net','',d.prettify())
            #Text = d.text.strip()
            #print(re.sub('<br/>','',Text))
            file.write(Text)
        file.close()
    
        
    def run(self):
        url_base = 'https://www.kuaishuku.net/178798/61665209.html'
        url = url_base
        open(r'F:\pythonTest\二分之一剧透.txt','w',encoding = 'utf-8').close()   # clear the output file before crawling
        flag = True
        while flag:
            self.save_text(url)
            time.sleep(1)   # pause for 1 second between requests
            try:
                html = self.get_html(url)
                next_url = re.findall('href="(.*?)">下一章', html)[0]   # raises IndexError when there is no 下一章 link
                if next_url:
                    url = 'https://www.kuaishuku.net' + next_url
                    print("自动获取下一章的网址: %s" % url)
            except:
                # no next-chapter link was found (or the request failed): stop the loop
                flag = False



if __name__ == '__main__':
    TextTxt = TextDownload()
    TextTxt.run()
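As an alternative to the regex above, the next-chapter link can also be located with BeautifulSoup itself. A minimal sketch, assuming the anchor text on this site is exactly "下一章":

from bs4 import BeautifulSoup

def find_next_chapter(html, base='https://www.kuaishuku.net'):
    # sketch only: return the absolute URL of the next chapter, or None when no such link exists
    soup = BeautifulSoup(html, 'lxml')
    link = soup.find('a', string='下一章')   # assumes the link text is exactly "下一章"
    if link and link.get('href'):
        return base + link['href']
    return None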

3.2 Example: the page has both a "下一页" (next page) and a "下一章" (next chapter) link

Code logic: the work is split into two functions, download_title() (downloads the chapter title) and download_txt() (downloads the chapter body). run() first looks for a "下一页" (next page) link and calls download_txt(); when it finds a "下一章" (next chapter) link instead, it calls download_title() and download_txt().

import requests
from bs4 import BeautifulSoup
import time
import re
class DownloadTxt(object):
    def __init__(self):
        self.headers = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
        }
    def get_html(self,url):
        response = requests.get(url,headers = self.headers)
        html = response.content.decode(encoding = 'utf-8')
        return html
    def get_soup(self,url):
        html = self.get_html(url)
        soup = BeautifulSoup(html,'lxml')
        return soup
    def download_title(self,url):
        soup = self.get_soup(url)
        title = soup.find('title')
        titletxt = re.sub('_维持女配的尊严\(淅和\)最新章节-神木小说网','',title.text.strip())
        print(titletxt)
        title_file = open(r'F:\pythonTest\维持女配的尊严.txt','a',encoding = 'utf-8')
        title_file.write(titletxt)
        title_file.close()
    def download_txt(self,url):
        soup = self.get_soup(url)
        file = open(r'F:\pythonTest\维持女配的尊严.txt','a',encoding = 'utf-8')
        for d in soup.find_all('div',id="booktxt"):
            Text =  re.sub('<(.*)>','',d.prettify())
            #print(Text)
            file.write(Text)
        file.close()
    def run(self):
        url_first = 'https://m.shenmuxsw.cc/show/187200/55252171.html'
        url_base = 'https://m.shenmuxsw.cc'
        url = url_first
        flag = True
        open(r'F:\pythonTest\维持女配的尊严.txt','w',encoding = 'utf-8').close()   # clear the output file before the first download, not after it
        #self.download_title(url)
        self.download_txt(url)   # download the first page before entering the loop
        while flag:
            html = self.get_html(url)
            next_url1 = re.findall('href="(.*?)" rel="next" id="next_url">下一页', html)
            # findall keeps only the text captured by (.*?)
            next_url2 = re.findall('href="(.*?)" rel="next" id="next_url">下一章', html)
            time.sleep(1)
            if len(next_url1) > 0:
                url = url_base+next_url1[0]
                # the pattern can match more than once; take the first hit
                print('下一页 %s' % url)
                self.download_txt(url)
            elif len(next_url2) > 0:
                url = url_base+next_url2[0]
                print('下一章 %s' % url)
                #self.download_title(url)
                # the chapter body already contains the title, so it is not downloaded separately
                self.download_txt(url)
            else:
                flag = False
if __name__ == '__main__':
    Txt = DownloadTxt()
    Txt.run()
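Since the two patterns in run() differ only in the link text, they could also be folded into a single search and the caller could branch on the captured text. A minimal sketch under the same markup assumption (rel="next" id="next_url"):

import re

def find_next(html, url_base='https://m.shenmuxsw.cc'):
    # sketch only: return (absolute URL, link text) or (None, None) when neither link exists
    m = re.search('href="(.*?)" rel="next" id="next_url">(下一页|下一章)', html)
    if m is None:
        return None, None
    return url_base + m.group(1), m.group(2)

run() would then branch on the returned link text instead of checking two separate lists.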
