import re
import sys


class downloader(object):
    """Scrape the serialized novel《针锋对决》from 52shuku8.com.

    The first page lives at .../424.html; pages 2 through 256 follow the
    .../424_{n}.html pattern. ``urls`` holds the links for pages 2+.
    """

    # First page has its own URL shape; later pages share one template.
    FIRST_PAGE = 'https://www.52shuku8.com/xiandaidushi/424.html'
    PAGE_TEMPLATE = 'https://www.52shuku8.com/xiandaidushi/424_{}.html'

    def __init__(self):
        # Links for every page after the first; filled by get_remainpage_link().
        self.urls = []

    def _fetch_content(self, target):
        """Download *target* and return the text of the first element whose
        CSS class contains '-content' (the chapter body on this site).

        Raises ``requests.RequestException`` on network failure and
        ``IndexError`` if no matching element is found.
        """
        # Imported lazily so URL building and file writing remain usable
        # (and testable) without the third-party requests/bs4 packages.
        import requests
        from bs4 import BeautifulSoup

        # Timeout so one dead server cannot hang the whole crawl.
        rep = requests.get(url=target, timeout=30)
        # The site mis-declares its charset; force utf-8 to avoid mojibake.
        rep.encoding = 'utf-8'
        # Explicit parser: BeautifulSoup(html) alone guesses and warns.
        soup = BeautifulSoup(rep.text, 'html.parser')
        matches = soup.find_all(class_=re.compile('-content'))  # search by CSS class
        return matches[0].text

    def get_pageone_content(self):
        """Return the scraped text of the first page."""
        return self._fetch_content(self.FIRST_PAGE)

    def get_remainpage_link(self):
        """Populate ``self.urls`` with the links for pages 2 through 256."""
        for each_number in range(2, 257):
            self.urls.append(self.PAGE_TEMPLATE.format(each_number))

    def get_remainpage_content(self, target):
        """Return the scraped text of the later page at *target*."""
        return self._fetch_content(target)

    def writer(self, path, text):
        """Append one page's *text* to the file at *path*.

        Opened in 'a' (append) mode so successive pages accumulate
        into a single output file.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(text)    # page content
            f.write('\n\n')  # blank line between pages


if __name__ == '__main__':
    d1 = downloader()
    d1.get_remainpage_link()  # build the link list for pages 2+
    print('《针锋对决》开始下载:')
    d1.writer('针锋对决.txt', d1.get_pageone_content())
    # BUG FIX: the original looped range(253) and silently skipped the
    # last two of the 255 queued pages; iterate over every queued URL.
    for page_url in d1.urls:
        d1.writer('针锋对决.txt', d1.get_remainpage_content(page_url))
    print('《针锋对决》下载完成')
# TODO: Known flaw — some invalid boilerplate text (ads/navigation) is still not stripped from the scraped content.