# 99作文网 (www.99zuowen.com) essay-site crawler
import requests
from lxml import etree
import urllib.parse
import os
class ZuoWenSpider():
def __init__(self):
    """Set the crawl start URL, the request headers and the output directory.

    Attributes set:
        url: start URL; links found by ``parse_content`` are resolved
            against it.
        headers: HTTP headers passed to ``requests.get`` in
            ``send_request`` (a desktop-browser User-Agent).
        dir: directory path relative to the current working directory;
            created here if it does not exist yet.
            NOTE(review): presumably where ``save_content`` writes its
            files -- that method is not visible here, confirm.

    Raises:
        FileExistsError: if ``self.dir`` exists but is not a directory.
    """
    self.url = 'https://www.99zuowen.com/xiaoxuezuowen/'
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    self.dir = '99zuowen/'
    # exist_ok=True replaces the os.path.exists() + os.mkdir() pair, which
    # could fail if the directory appeared between the check and the create.
    os.makedirs(self.dir, exist_ok=True)
def send_request(self, full_url, timeout=10):
    """GET ``full_url`` using the spider's headers.

    Args:
        full_url: URL to request.
        timeout: seconds ``requests`` waits for the server before raising
            a timeout error. Without a timeout a stalled server would
            block the crawl indefinitely.

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None``. Callers must handle the ``None`` case.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            when the timeout expires (propagated from ``requests.get``).
    """
    response = requests.get(full_url, headers=self.headers, timeout=timeout)
    if response.status_code == 200:
        return response
    # Any non-200 status is reported to the caller as "no response".
    return None
def parse_content(self, response):
    """Follow every link in the ``dl.type_list2`` block of the start page.

    For each ``<a>`` under ``dl[@class="type_list2"]/dd`` the link text is
    used as the title, the href is resolved against ``self.url``, the
    page is fetched and handed to ``parse_list``.

    Args:
        response: response for the start page as returned by
            ``send_request``; ``None`` (a failed fetch) is a no-op.

    Raises:
        IndexError: if the page contains no ``dl`` element with class
            ``type_list2``.
    """
    # send_request() returns None for non-200 pages; nothing to parse then.
    if response is None:
        return

    html = etree.HTML(response.text)
    cat = html.xpath('//dl[@class="type_list2"]')[0]
    for a in cat.xpath('./dd/a'):
        titles = a.xpath('./text()')
        hrefs = a.xpath('./@href')
        # An anchor without direct text or without an href cannot be
        # crawled; skip it instead of aborting the whole run.
        if not titles or not hrefs:
            continue

        full_url = urllib.parse.urljoin(self.url, hrefs[0])
        list_response = self.send_request(full_url)
        # A page that failed to load must not reach parse_list(), which
        # reads ``.text`` from the response.
        if list_response is None:
            continue
        self.parse_list(list_response, titles[0])
def parse_list(self, response, title):
    """Fetch every detail page linked from a list page.

    Collects the hrefs matching
    ``//li[@class="lis"]//span[@class="right"]/a/@href``, requests each
    one and passes the result to ``parse_detail`` together with ``title``.

    Args:
        response: response for the list page as returned by
            ``send_request``; ``None`` (a failed fetch) is a no-op.
        title: title forwarded unchanged to ``parse_detail``.
    """
    # send_request() returns None for non-200 pages; nothing to parse then.
    if response is None:
        return

    html = etree.HTML(response.text)
    href_list = html.xpath('//li[@class="lis"]//span[@class="right"]/a/@href')
    for href in href_list:
        # NOTE(review): hrefs are requested as-is, so they are assumed to
        # be absolute URLs -- confirm against the site's markup.
        detail_response = self.send_request(href)
        # Skip detail pages that failed to load rather than crashing in
        # parse_detail() on ``None.text``.
        if detail_response is None:
            continue
        self.parse_detail(detail_response, title)
def parse_detail(self, response, title):
html = etree.HTML(response.text)
name = html.xpath('//h1/text()')[0]
content = "".join(html.xpath('//div[@class="content"]//p//text()'))
self.save_content(title, name, content