接上一文章 CSDN 文章导出
继续话题,获取cookie信息和 user-agent ( alert(navigator.userAgent) )
注意:这些 header 内容会变化——可能今天可用,明天就失效,导致 request 返回 403。我们先关闭系统代理(以 Windows 10 为例),避免一些错误
测试发现
dc_session_id、 uuid_tt_dd、Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac 、UserInfo、UserToken、UserNick、AU、BT、p_uid、没变
cookie 信息自行拼接(如右图):再打开浏览器,按 Ctrl + Shift + I 打开开发者工具,点击 Network,然后在页面登录;于是得到
"cookie": "uuid_tt_dd=10_27464470450-1558504101613-960211; dc_session_id=10_1611223080042.656536.420623; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1611218803,1611219167,1611219728,1611220363; dc_sid=04ae20224e8511583ba1508a79e6c379; __gads=ID=ID=ea1401c48f7c8e89-224ef746f8c4004c:T=1606792494:RT=1606792494:R:S=ALNI_MZnrqPFwsIvZeuQrQkgkybkzgFoLQ; c_ref=https%3A//blog.csdn.net/huang_ftpjh/article/list/2; c-toolbar-writeguide=1; UN=huang_ftpjh; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_27464470450-1558504101613-960211!5744*1*huang_ftpjh!1788*1*PC_VC; announcement-new=%7B%22isLogin%22%3Atrue%2C%22announcementUrl%22%3A%22https%3A%2F%2Fblog.csdn.net%2Fblogdevteam%2Farticle%2Fdetails%2F112280974%3Futm_source%3Dgonggao_0107%22%2C%22announcementCount%22%3A0%2C%22announcementExpire%22%3A3600000%7D; UserName=huang_ftpjh; UserInfo=3ece835bdaf24564aa037860b9f5b015; UserToken=3ece835bdaf24564aa037860b9f5b015; UserNick=huang_ftpjh; AU=DCC; BT=1598231188850; p_uid=U010000; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1611223095; TY_SESSION_ID=656a21c7-b70b-4b97-8212-221147973357; dc_tos=qna3p3"
草稿如下,最终版在文章最下面
# -*- coding: utf-8 -*-
import json
import uuid
import time
import requests
import datetime
from bs4 import BeautifulSoup
import re
# bug1:BeautifulSoup(reply.text, "lxml") 值为空 使用'html5lib'替代'lxml'解决问题
# bug2: html代码存在 \xa0\n\t\r & < > 应该是Unicode编码导致
# Shared request headers for the public blog-list pages.
# NOTE(review): CSDN may start answering 403 when these go stale — see the
# notes above about refreshing cookie/user-agent values.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
# Site host header
'Host': 'blog.csdn.net'
}
def request_blog_list(page):
    """Fetch one page of the public article list.

    Scrapes https://blog.csdn.net/huang_ftpjh/article/list/<page> and
    returns a list of [blog_id, date, read_num] entries (at most 40).

    NOTE(review): CSDN's markup changes over time (the read counter moved
    from class 'num' to 'read-num'); the selectors may need updating again.
    """
    print("进入request_blog_list 获取博客列表")
    url = f'https://blog.csdn.net/huang_ftpjh/article/list/{page}'
    reply = requests.get(url, headers=headers, timeout=10)
    print("url==", url)
    print("reply==", reply)
    # html5lib instead of lxml: lxml produced an empty tree for this page
    # (see the bug note at the top of the file).
    parse = BeautifulSoup(reply.text, "html5lib")
    spans = parse.find_all('div', attrs={'class': 'article-item-box csdn-tracking-statistics'})
    blogs = []
    print("进入request_blog_list 参数准备")
    for span in spans[:40]:
        href = None  # pre-bind so the except branch cannot hit an unbound name
        try:
            href = span.find('a', attrs={'target': '_blank'})['href']
            # Read count; CSDN renamed this class from 'num' to 'read-num'.
            read_num = span.find('span', attrs={'class': 'read-num'}).get_text()
            # Last-edited timestamp.
            date = span.find('span', attrs={'class': 'date'}).get_text()
            blog_id = href.split("/")[-1]
            blogs.append([blog_id, date, read_num])
            print(href)
        except Exception:  # was a bare except: skip malformed entries, keep going
            print("request_blog_list循环错误")
            print('Wrong, ' + str(href))
    return blogs
def request_md(blog_id, date):
    """Fetch the console-API JSON (which embeds the markdown source) for one
    article and pass it to write_hexo_md.

    The console endpoint requires a valid logged-in cookie; the hard-coded
    values below must be refreshed when CSDN rotates them.
    (The original placed this docstring after the first print, where it was
    just a stray string expression.)
    """
    print("进入request_md")
    url = f"https://blog-console-api.csdn.net/v1/editor/getArticle?id={blog_id}"
    headers = {
        "cookie": "uuid_tt_dd=10_27464470450-1558504101613-960211; dc_session_id=10_1611223080042.656536.420623; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1611218803,1611219167,1611219728,1611220363; dc_sid=04ae20224e8511583ba1508a79e6c379; __gads=ID=ID=ea1401c48f7c8e89-224ef746f8c4004c:T=1606792494:RT=1606792494:R:S=ALNI_MZnrqPFwsIvZeuQrQkgkybkzgFoLQ; c_ref=https%3A//blog.csdn.net/huang_ftpjh/article/list/2; c-toolbar-writeguide=1; UN=huang_ftpjh; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_27464470450-1558504101613-960211!5744*1*huang_ftpjh!1788*1*PC_VC; announcement-new=%7B%22isLogin%22%3Atrue%2C%22announcementUrl%22%3A%22https%3A%2F%2Fblog.csdn.net%2Fblogdevteam%2Farticle%2Fdetails%2F112280974%3Futm_source%3Dgonggao_0107%22%2C%22announcementCount%22%3A0%2C%22announcementExpire%22%3A3600000%7D; UserName=huang_ftpjh; UserInfo=3ece835bdaf24564aa037860b9f5b015; UserToken=3ece835bdaf24564aa037860b9f5b015; UserNick=huang_ftpjh; AU=DCC; BT=1598231188850; p_uid=U010000; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1611223095; TY_SESSION_ID=656a21c7-b70b-4b97-8212-221147973357; dc_tos=qna3p3",
        "user-agent": "Mozilla/5.0(Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome / 72.0.3626.109 Safari/537.36"
    }
    # The article id already travels in the query string, so the old
    # `data={"id": blog_id}` GET request body was redundant and is dropped.
    reply = requests.get(url, headers=headers)
    reply.encoding = "utf-8"
    try:
        print("准备写入", blog_id)
        write_hexo_md(reply.json(), date)
    except Exception as e:
        # Best-effort: report and move on to the next article.
        print("***********************************")
        print(e)
        print(url)
def write_hexo_md(data, date):
    """Write one article's content to blogs/<title>.md.

    `data` is the JSON payload returned by the console getArticle API.
    `date` is currently unused but kept so the signature stays compatible
    with the call in request_md.
    """
    title = data["data"]["title"]
    # Square brackets clash with Markdown link syntax (and the [] pair in a
    # filename); swap them for full-width equivalents.
    title = title.replace("[", "【").replace("]", "】")
    # Either field may be None or absent for some articles; guard so the
    # concatenation cannot raise TypeError and abort the whole article.
    content = (data["data"].get("content") or "") + (data["data"].get("markdowncontent") or "")
    # Hand-written HTML→Markdown conversion (tomd dropped code blocks, so it
    # was abandoned — see the notes above this script).
    md = text_to_md(content)
    # Used for blog migration, so only the converted body is kept.
    with open(f"blogs/{title}.md", "w", encoding="utf-8") as f:
        f.write(md)
    print(f"写入 {title}")
def text_to_md(a):
    """Convert a fragment of CSDN blog HTML into rough Markdown.

    The exported HTML carries entity-escaped characters (&amp;, &lt;, ...)
    plus some layout characters with no Markdown meaning; they are decoded
    or stripped first, then common tags are rewritten.

    NOTE(review): the original replacement table was garbled by HTML-entity
    decoding of the source (a stray ''' opened an unterminated string); it
    is reconstructed here with stdlib html.unescape.
    """
    import html  # stdlib; decodes all HTML entities in one pass

    a = a.strip()
    # One-pass entity decoding replaces the old hand-written table for
    # &amp; &lt; &gt; &quot; &#39; etc.
    a = html.unescape(a)
    # Characters/markers with no Markdown equivalent.
    for src, dst in (('\xa0', ' '), ('\xa9', ' '), ('@[toc]', ''),
                     ('\r', ''), ('\t', '')):
        a = a.replace(src, dst)
    # Headings: first drop a leading "1.", "1.2", ... numbering inside the
    # tag, then turn the opening tag into the matching '#' prefix.
    a = re.sub(r'<h1>.*?\d*\. (?P<name>.*?)</h1>', r'<h1>\g<name>\n\n</h1>', a)
    a = re.sub(r'<h1.*?>', '# ', a)
    a = re.sub(r'<h2>.*?\d*\.\d* (?P<name>.*?)</h2>', r'<h2>\g<name>\n\n</h2>', a)
    a = re.sub(r'<h2.*?>', '## ', a)
    a = re.sub(r'<h3>.*?\d*\.\d*\.\d* (?P<name>.*?)</h3>', r'<h3>\g<name>\n\n</h3>', a)
    a = re.sub(r'<h3.*?>', '### ', a)
    a = re.sub(r'<h4>.*?\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h4>', r'<h4>\g<name>\n\n</h4>', a)
    a = re.sub(r'<h4.*?>', '#### ', a)
    a = re.sub(r'<h5>.*?\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h5>', r'<h5>\g<name>\n\n</h5>', a)
    a = re.sub(r'<h5.*?>', '##### ', a)
    a = re.sub(r'<h6>.*?\d*\.\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h6>', r'<h6>\g<name>\n\n</h6>', a)
    a = re.sub(r'<h6.*?>', '###### ', a)
    # Closing heading tags.  The original pattern ended with a stray '|',
    # creating an always-matching empty alternative; fixed here.
    a = re.sub(r'</h[1-6]>', '', a)
    # Fenced code blocks: CSDN wraps code in <pre class="has"><code ...>.
    # TODO: other language-* classes still leave tags behind.
    if '<pre class=' in a:
        a = re.sub(r'<pre class="has"><code class="language-bash">', '```', a)
        a = re.sub(r'</code></pre>', '\n```', a)
    a = re.sub(r'<strong>|</strong>', '**', a)
    a = re.sub(r'<span.*?>|</span>', '', a)    # inline styling
    a = re.sub(r'<pre.*?>|</pre>', '', a)      # leftover pre tags
    a = re.sub(r'<p.*?>|</p>', '', a)          # paragraphs
    a = re.sub(r'<br/>', '\n', a)              # line breaks
    a = re.sub(r'<ul.*?>|</ul>|</li>', '', a)  # lists
    a = re.sub(r'<li.*?>', '- ', a)
    return a
def main(total_pages=2):
    """Drive the export.

    Collects the article list page by page, then downloads each article's
    markdown and saves it in hexo format (one file per article).
    """
    all_blogs = []
    for page_no in range(1, total_pages + 1):
        all_blogs += request_blog_list(page_no)
    for article_id, stamp, _read_num in all_blogs:
        # stamp looks like "YYYY-MM-DD hh:mm:ss"; keep [year, month, day].
        request_md(article_id, stamp.split()[0].split("-"))
        time.sleep(1)  # be polite to the server between article fetches
# Script entry point: exports two pages of articles by default.
if __name__ == '__main__':
    main()
未解决问题:文章中的代码问题依然不能解决(自己写的有瑕疵 使用python包直接删掉了 )、依然只能获取公开部分(需进入控制台页面而不是https://blog.csdn.net/huang_ftpjh页面)
代码方面处理的逻辑如下,怎么优化呢?但是我修改完后还是存在问题
# 三个点 TODO 需要处理 主要是这里一些html标签没有删除
if '<pre class=' in a:
a = re.sub('<pre class="has"><code class="language-bash">', '```', a)
a = re.sub('</code></pre>', '\n```', a)
修改成:
if '<pre class=' in a:
a = re.sub('<pre.*?>', '', a)
a = re.sub('<code.*?>', '```', a)
a = re.sub('</code>', '\n```', a)
a = re.sub('</pre>', '\n', a)
最终版
# -*- coding: utf-8 -*-
import time
import requests
from bs4 import BeautifulSoup
import re
import tomd
import html2md
# 2020/1/25 https://github.com/davidcavazos/html2md
# Shared request headers for the public blog-list pages.
# NOTE(review): these values go stale over time and can trigger 403s.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
# Site host header
'Host': 'blog.csdn.net'
}
def request_blog_list(page):
    """Fetch one page of the public article list.

    Returns a list of [blog_id, date, read_num] entries (at most 40) from
    https://blog.csdn.net/huang_ftpjh/article/list/<page>.
    """
    print("进入request_blog_list 获取博客列表")
    url = f'https://blog.csdn.net/huang_ftpjh/article/list/{page}'
    reply = requests.get(url, headers=headers, timeout=10)
    print("url==", url)
    print("reply==", reply)
    # html5lib instead of lxml: lxml returned an empty tree for this page.
    parse = BeautifulSoup(reply.text, "html5lib")
    spans = parse.find_all('div', attrs={'class': 'article-item-box csdn-tracking-statistics'})
    blogs = []
    for span in spans[:40]:
        href = None  # pre-bind so the except branch cannot hit an unbound name
        try:
            href = span.find('a', attrs={'target': '_blank'})['href']
            # Read count; CSDN renamed this class from 'num' to 'read-num'.
            read_num = span.find('span', attrs={'class': 'read-num'}).get_text()
            # Last-edited timestamp.
            date = span.find('span', attrs={'class': 'date'}).get_text()
            blog_id = href.split("/")[-1]
            blogs.append([blog_id, date, read_num])
            print(href)
        except Exception:  # narrowed from a bare except; skip bad entries
            print('Wrong, ' + str(href))
    return blogs
def request_md(blog_id, date):
    """Fetch the console-API JSON (which embeds the markdown source) for one
    article and pass it to write_hexo_md.

    The console endpoint requires a valid logged-in cookie; the hard-coded
    values below must be refreshed when CSDN rotates them.
    """
    url = f"https://blog-console-api.csdn.net/v1/editor/getArticle?id={blog_id}"
    headers = {
        "cookie": "uuid_tt_dd=10_27464470450-1558504101613-960211; dc_session_id=10_1611538618913.836622; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1611540068,1611541339,1611541518,1611541902; dc_sid=a69bc3ceab3d7da75fdaeb27333b4dd9; __gads=ID=ea1401c48f7c8e89-224ef746f8c4004c:T=1606792494:RT=1606792494:R:S=ALNI_MZnrqPFwsIvZeuQrQkgkybkzgFoLQ; c_ref=https%3A//mp.csdn.net/console/home; UN=huang_ftpjh; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_27464470450-1558504101613-960211!5744*1*huang_ftpjh!1788*1*PC_VC; announcement-new=%7B%22isLogin%22%3Atrue%2C%22announcementUrl%22%3A%22https%3A%2F%2Fblog.csdn.net%2Fblogdevteam%2Farticle%2Fdetails%2F112280974%3Futm_source%3Dgonggao_0107%22%2C%22announcementCount%22%3A0%2C%22announcementExpire%22%3A3600000%7D; UserName=huang_ftpjh; UserInfo=3ece835bdaf24564aa037860b9f5b015; UserToken=3ece835bdaf24564aa037860b9f5b015; UserNick=huang_ftpjh; AU=DCC; BT=1598231188850; p_uid=U010000; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1611542781; TY_SESSION_ID=0199273e-097b-4900-8285-4e8bf54b8ca7; dc_tos=qngyd8",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    }
    # The article id already travels in the query string, so the old
    # `data={"id": blog_id}` GET request body was redundant and is dropped.
    reply = requests.get(url, headers=headers)
    reply.encoding = "utf-8"
    try:
        print("准备写入", blog_id)
        write_hexo_md(reply.json(), date)
    except Exception as e:
        # Best-effort: report and move on to the next article.
        print("***********************************")
        print(e)
        print(url)
def write_hexo_md(data, date):
    """Render one article's JSON payload to blogs/<title>.md.

    `data` is the JSON returned by the console getArticle API.  `date` is
    unused but retained so the signature matches request_md's call.
    Conversion is delegated to html2md
    (https://github.com/davidcavazos/html2md), which handled code blocks
    better than tomd or the hand-written text_to_md.
    """
    title = data["data"]["title"]
    # Square brackets clash with Markdown link syntax in the file name.
    title = title.replace("[", "【").replace("]", "】")
    # Guard both fields: either may be None/absent for some articles, which
    # previously made the concatenation raise TypeError.
    content = (data["data"].get("content") or "") + (data["data"].get("markdowncontent") or "")
    md = html2md.convert(content)
    # Write the generated markdown file.
    with open(f"blogs/{title}.md", "w", encoding="utf-8") as f:
        f.write(md)
    print(f"写入 {title}")
def text_to_md(a):
    """Convert a fragment of CSDN blog HTML into rough Markdown.

    Kept for reference — the final script delegates to html2md — but fixed
    so it at least parses and runs: the original replacement table was
    garbled by HTML-entity decoding of the source (a stray ''' opened an
    unterminated string), and is replaced by stdlib html.unescape.
    """
    import html  # stdlib; decodes all HTML entities in one pass

    a = a.strip()
    # One-pass entity decoding (&amp; &lt; &gt; &quot; &#39; ...).
    a = html.unescape(a)
    # Characters/markers with no Markdown equivalent.
    for src, dst in (('\xa0', ' '), ('\xa9', ' '), ('@[toc]', ''),
                     ('\r', ''), ('\t', '')):
        a = a.replace(src, dst)
    # Headings: drop a leading "1.", "1.2", ... numbering inside the tag,
    # then turn the opening tag into the matching '#' prefix.
    a = re.sub(r'<h1>.*?\d*\. (?P<name>.*?)</h1>', r'<h1>\g<name>\n\n</h1>', a)
    a = re.sub(r'<h1.*?>', '# ', a)
    a = re.sub(r'<h2>.*?\d*\.\d* (?P<name>.*?)</h2>', r'<h2>\g<name>\n\n</h2>', a)
    a = re.sub(r'<h2.*?>', '## ', a)
    a = re.sub(r'<h3>.*?\d*\.\d*\.\d* (?P<name>.*?)</h3>', r'<h3>\g<name>\n\n</h3>', a)
    a = re.sub(r'<h3.*?>', '### ', a)
    a = re.sub(r'<h4>.*?\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h4>', r'<h4>\g<name>\n\n</h4>', a)
    a = re.sub(r'<h4.*?>', '#### ', a)
    a = re.sub(r'<h5>.*?\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h5>', r'<h5>\g<name>\n\n</h5>', a)
    a = re.sub(r'<h5.*?>', '##### ', a)
    a = re.sub(r'<h6>.*?\d*\.\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h6>', r'<h6>\g<name>\n\n</h6>', a)
    a = re.sub(r'<h6.*?>', '###### ', a)
    # Closing heading tags (original pattern had a stray trailing '|').
    a = re.sub(r'</h[1-6]>', '', a)
    # Fenced code blocks: strip <pre>, map <code ...>...</code> to ``` fences.
    if '<pre class=' in a:
        a = re.sub(r'<pre.*?>', '', a)
        a = re.sub(r'<code.*?>', '```', a)
        a = re.sub(r'</code>', '\n```', a)
        a = re.sub(r'</pre>', '\n', a)
    a = re.sub(r'<strong>|</strong>', '**', a)
    a = re.sub(r'<span.*?>|</span>', '', a)    # inline styling
    a = re.sub(r'<pre.*?>|</pre>', '', a)      # leftover pre tags
    a = re.sub(r'<p.*?>|</p>', '', a)          # paragraphs
    a = re.sub(r'<br/>', '\n', a)              # line breaks
    a = re.sub(r'<ul.*?>|</ul>|</li>', '', a)  # lists
    a = re.sub(r'<li.*?>', '- ', a)
    return a
def main(total_pages=2):
    """Drive the export.

    Gathers the article list (id + timestamp) across pages, then fetches
    each article's markdown and saves it in hexo format.
    """
    collected = []
    page = 1
    while page <= total_pages:
        collected.extend(request_blog_list(page))
        page += 1
    for entry in collected:
        article_id, stamp = entry[0], entry[1]
        # stamp looks like "YYYY-MM-DD hh:mm:ss"; keep [year, month, day].
        request_md(article_id, stamp.split()[0].split("-"))
        time.sleep(1)  # throttle between article fetches
# Script entry point: exports two pages of articles by default.
if __name__ == '__main__':
    main()