# 爬取三国演义 (crawl "Romance of the Three Kingdoms")
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os
class SanGuoSpider():
    """Scrape the novel "Romance of the Three Kingdoms" from shicimingju.com.

    Fetches the table-of-contents page, follows every chapter link, and
    saves each chapter's text as ``<self.dir>/<title>.txt``.
    """

    def __init__(self):
        # Pretend to be a desktop Chrome browser to avoid trivial bot blocking.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        }
        self.proxies = {
        }
        # Table-of-contents page listing every chapter of the book.
        self.url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
        self.dir = 'sg'
        # makedirs(..., exist_ok=True) is race-free, unlike exists() + mkdir().
        os.makedirs(self.dir, exist_ok=True)

    def send_request(self, full_url):
        """GET ``full_url``; return the Response on HTTP 200, else None.

        The timeout keeps the crawl from hanging forever on a dead server.
        """
        response = requests.get(full_url, proxies=self.proxies,
                                headers=self.headers, timeout=10)
        if response.status_code == 200:
            return response
        print('出错了~')
        return None

    def parse_content(self, response):
        """Parse the table of contents and crawl every chapter it links to."""
        bs = BeautifulSoup(response.content, 'lxml')
        a_list = bs.select('.book-mulu li a')
        for a in a_list:
            href = a.get('href')
            # get_text() is safer than .string, which is None whenever the
            # tag contains more than one child node.
            title = a.get_text(strip=True)
            full_url = urllib.parse.urljoin(self.url, href)
            detail_response = self.send_request(full_url)
            if detail_response:
                self.parse_detail_content(detail_response, title)

    def parse_detail_content(self, response, title):
        """Extract one chapter's body text and hand it to save_content()."""
        bs = BeautifulSoup(response.content, 'lxml')
        chapter_div = bs.find('div', class_='chapter_content')
        if chapter_div is None:
            # Page layout changed or the chapter page is empty; skip it
            # instead of crashing with AttributeError on None.
            print('出错了~')
            return
        self.save_content(chapter_div.get_text(), title)

    def save_content(self, content, title):
        """Write one chapter to ``<self.dir>/<title>.txt`` in UTF-8."""
        file_name = self.dir + '/' + title + '.txt'
        print('正在写入%s' % file_name)
        # Drop the spaces left over from the page's inline markup.
        content = content.replace(' ', '')
        with open(file_name, 'w', encoding='utf8') as f:
            f.write(content)

    def start(self):
        """Entry point: fetch the table of contents, then crawl all chapters."""
        response = self.send_request(self.url)
        if response:
            self.parse_content(response)
if __name__ == '__main__':
    # Run the crawl only when executed as a script, not when imported.
    spider = SanGuoSpider()
    spider.start()