# List page: parse the site, find article URLs, and crawl each one.
import os
import re

import requests
from lxml import etree

# Mobile User-Agent so CSDN serves the simpler mobile page layout.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Mobile Safari/537.36',
}

OUTPUT_ROOT = 'D:/lf/day12/md'  # output directory root


def _safe_filename(name):
    """Replace characters that are illegal in Windows file names with '-'."""
    cleaned = re.sub(r'[\\/:*?"<>|]', '-', name).strip()
    return cleaned or 'untitled'


def _save_article(url):
    """Fetch one article page and save its body HTML under OUTPUT_ROOT/<id>/.

    Best-effort: skips (with a message) pages where the body div is missing.
    """
    article_id = url.split('/')[-1]
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    root = etree.HTML(resp.text)

    # Article title, used as the file name; fall back to the article id.
    # NOTE(review): the original built the title from the whole body text
    # (//div[@id="content_views"]//text()), which produced an over-long,
    # slash-filled name — assumed to be a typo for the <h1> title. Confirm
    # against the live page structure.
    title_parts = root.xpath('//h1[@class="title-article"]//text()')
    title = _safe_filename(''.join(title_parts)) if title_parts else article_id

    body_nodes = root.xpath('//div[@id="content_views"]')  # article body
    if not body_nodes:
        print(f'no body found for {url}, skipped')
        return
    content = etree.tostring(body_nodes[0], encoding='utf-8').decode('utf-8')

    folder = f'{OUTPUT_ROOT}/{article_id}/'
    os.makedirs(folder, exist_ok=True)  # race-free replacement for exists()+makedirs
    with open(folder + title + '.text', 'w', encoding='utf-8') as f:
        f.write(content)
    print('okok')


def main():
    """Walk the paginated article-list API; stop at the first empty page."""
    for page in range(100):
        list_url = (
            'https://blog.csdn.net/community/home-api/v1/get-business-list'
            f'?page={page}&size=20&businessType=lately&noMore=false'
            '&username=weixin_45195493'
        )
        listing = requests.get(list_url, headers=headers, timeout=10).text
        urls = re.findall('"url":"(.*?)"', listing)
        if not urls:  # empty page means no more articles
            break
        for url in urls:
            _save_article(url)


if __name__ == '__main__':
    main()