Preface
With some time to kill, I tried scraping CSDN articles with requests, and along the way added a feature that saves each article in three formats (HTML, plain text, and markdown).
Approach
Whatever else we do, step one is to pull down the HTML.
import os
from html import unescape

import requests
from lxml import etree


def crawl(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    print("crawl...")
    # a browser User-Agent in the headers gets past the basic anti-scraping check
    response = requests.get(url, headers=headers)
    # continue only on HTTP 200
    if response.status_code == 200:
Next, parse out the title and the body with XPath and store them in variables. The serialized body also goes through unescape, which decodes HTML entities (including numeric character references) so the saved text displays correctly.
        html = response.content.decode("utf8")
        # print(html)
        tree = etree.HTML(html)
        print("look for text...")
        # locate the HTML nodes we need
        title = tree.xpath('//*[@id="articleContentId"]/text()')[0]
        block = tree.xpath('//*[@id="content_views"]')
        # the article body as HTML, with entities decoded
        ohtml = unescape(etree.tostring(block[0]).decode("utf8"))
        # the article body as plain text
        text = block[0].xpath('string(.)').strip()
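As a quick sanity check of the parsing step, here is a tiny self-contained demo on a made-up snippet (the sample HTML is hypothetical, not a real CSDN page). It shows that entities like &gt; stay escaped after serialization, and that unescape turns them back into readable characters:

from html import unescape
from lxml import etree

# Hypothetical sample, standing in for a real CSDN article body
sample = '<div id="content_views"><p>Hello &gt; World</p></div>'
tree = etree.HTML(sample)
block = tree.xpath('//*[@id="content_views"]')[0]

print(etree.tostring(block).decode("utf8"))
# <div id="content_views"><p>Hello &gt; World</p></div>   (entity still escaped)
print(unescape(etree.tostring(block).decode("utf8")))
# <div id="content_views"><p>Hello > World</p></div>      (readable again)
print(block.xpath('string(.)').strip())
# Hello > World                                           (plain text only)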
Then create the output folders:
        if "output" not in os.listdir():
            # create the output folder tree on first run
            os.mkdir("output")
            os.mkdir("output/html")
            os.mkdir("output/text")
            os.mkdir("output/markdown")
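As an aside, the same setup can be done without the listdir check: os.makedirs creates intermediate directories too, and with exist_ok=True it won't raise if the script is run twice. A minimal equivalent:

import os

# Create output/html, output/text and output/markdown in one loop;
# exist_ok=True makes repeated runs a no-op instead of a FileExistsError.
for fmt in ("html", "text", "markdown"):
    os.makedirs(f"output/{fmt}", exist_ok=True)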
Finally, save the HTML and the plain text:
        with open(f"output/html/{title}.html", 'w', encoding='utf8') as html_file:
            # save the full page HTML
            print("write html...")
            html_file.write(html)
        with open(f"output/text/{title}.txt", 'w', encoding='utf8') as text_file:
            # save the plain-text version of the article body
            print("write text...")
            text_file.write(text)
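That takes care of HTML and plain text. The markdown step isn't shown above; here is a sketch of how it could continue inside the same if block, assuming the html2text library (the original doesn't say which converter it uses, so treat this as one option), with import html2text added at the top of the file:

        # Assumed markdown step: convert the article body's HTML to markdown
        # with html2text and save it alongside the other two formats.
        with open(f"output/markdown/{title}.md", 'w', encoding='utf8') as md_file:
            print("write markdown...")
            md_file.write(html2text.html2text(ohtml))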