import re
import sys


class downloader(object):
    """Scrape the serialized novel《针锋对决》from 52shuku8.com.

    The first page lives at .../424.html; pages 2 through 256 follow the
    .../424_{n}.html pattern. ``urls`` holds the links for pages 2+.
    """

    # First page has its own URL shape; later pages share one template.
    FIRST_PAGE = 'https://www.52shuku8.com/xiandaidushi/424.html'
    PAGE_TEMPLATE = 'https://www.52shuku8.com/xiandaidushi/424_{}.html'

    def __init__(self):
        # Links for every page after the first; filled by get_remainpage_link().
        self.urls = []

    def _fetch_content(self, target):
        """Download *target* and return the text of the first element whose
        CSS class contains '-content' (the chapter body on this site).

        Raises ``requests.RequestException`` on network failure and
        ``IndexError`` if no matching element is found.
        """
        # Imported lazily so URL building and file writing remain usable
        # (and testable) without the third-party requests/bs4 packages.
        import requests
        from bs4 import BeautifulSoup

        # Timeout so one dead server cannot hang the whole crawl.
        rep = requests.get(url=target, timeout=30)
        # The site mis-declares its charset; force utf-8 to avoid mojibake.
        rep.encoding = 'utf-8'
        # Explicit parser: BeautifulSoup(html) alone guesses and warns.
        soup = BeautifulSoup(rep.text, 'html.parser')
        matches = soup.find_all(class_=re.compile('-content'))  # search by CSS class
        return matches[0].text

    def get_pageone_content(self):
        """Return the scraped text of the first page."""
        return self._fetch_content(self.FIRST_PAGE)

    def get_remainpage_link(self):
        """Populate ``self.urls`` with the links for pages 2 through 256."""
        for each_number in range(2, 257):
            self.urls.append(self.PAGE_TEMPLATE.format(each_number))

    def get_remainpage_content(self, target):
        """Return the scraped text of the later page at *target*."""
        return self._fetch_content(target)

    def writer(self, path, text):
        """Append one page's *text* to the file at *path*.

        Opened in 'a' (append) mode so successive pages accumulate
        into a single output file.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(text)    # page content
            f.write('\n\n')  # blank line between pages


if __name__ == '__main__':
    d1 = downloader()
    d1.get_remainpage_link()  # build the link list for pages 2+
    print('《针锋对决》开始下载:')
    d1.writer('针锋对决.txt', d1.get_pageone_content())
    # BUG FIX: the original looped range(253) and silently skipped the
    # last two of the 255 queued pages; iterate over every queued URL.
    for page_url in d1.urls:
        d1.writer('针锋对决.txt', d1.get_remainpage_content(page_url))
    print('《针锋对决》下载完成')
# TODO: Known flaw — some invalid boilerplate text (ads/navigation) is still not stripped from the scraped content.