一、需求
爬取网址:https://www.gushiwen.org/(代码中实际的起始地址为 https://so.gushiwen.cn/shiwen/)
需求:
(1)获取侧边栏【类型】信息;
(2)获取每个类型中古诗文详情页信息;
(3)提取详情页数据:古诗文名、作者、朝代、类型、内容、译文及注释;
(4)将数据保存到 csv 文件;
二、代码实现
import requests
import csv
from lxml import etree
# Root of the site being scraped.
base_url = "https://so.gushiwen.cn"

# Page the crawl starts from.
start_url = "https://so.gushiwen.cn/shiwen/"

# Headers sent with every request made by parse_url; the User-Agent
# identifies the client as a desktop Chrome browser.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    ),
}

# Module-level list, initially empty.
# NOTE(review): presumably accumulates scraped records -- usage is outside this view.
items = []
def parse_url(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the body decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            the timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server, which would stall the whole crawl.
    # NOTE: no delay is inserted between requests; throttle in the caller
    # if the target site needs it.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # The HTTP status code is not checked: the body is returned as-is,
    # whatever the status.
    return response.content.decode("utf-8")
def parse_html(html):
    """Parse an HTML string with lxml and return the resulting element.

    The returned object is whatever ``etree.HTML`` produces; callers can
    run XPath queries against it.
    """
    return etree.HTML(html)
def get_first_type():
"""获取所有的一级类型"""
first_type_list = []
html