CSDN Article Export 2: Python

Continuing from the previous article, CSDN Article Export.

Picking up the topic: we need the cookie and the user-agent (the user-agent can be read with alert(navigator.userAgent) in the browser).

Note that these headers change over time: values that work today may rotate tomorrow, and the request then starts returning 403. Also turn off the system proxy first (this is a Windows 10 setting) to avoid spurious errors.
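A quick way to catch this early is to probe the blog list page and bail out on 403 before running the full export. A minimal sketch (headers_still_valid is my own helper name; paste your real values where the placeholders are):

import requests

def headers_still_valid(headers):
    """Probe the blog list page; a 403 usually means the cookie has rotated."""
    url = "https://blog.csdn.net/huang_ftpjh/article/list/1"
    reply = requests.get(url, headers=headers, timeout=10)
    return reply.status_code != 403

test_headers = {
    # placeholders -- paste the values copied from DevTools / navigator.userAgent
    "cookie": "<your cookie string>",
    "user-agent": "<your user agent>",
}
if not headers_still_valid(test_headers):
    print("Headers are stale -- copy fresh values from the browser.")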

Testing shows that these cookie fields stay unchanged across sessions: dc_session_id, uuid_tt_dd, Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac, UserInfo, UserToken, UserNick, AU, BT, p_uid.

Assemble the cookie string yourself: open DevTools with Ctrl + Shift + I, switch to the Network tab, then log in on the page and copy the cookie from the request headers. The result looks like this:

"cookie": "uuid_tt_dd=10_27464470450-1558504101613-960211; dc_session_id=10_1611223080042.656536.420623; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1611218803,1611219167,1611219728,1611220363; dc_sid=04ae20224e8511583ba1508a79e6c379; __gads=ID=ID=ea1401c48f7c8e89-224ef746f8c4004c:T=1606792494:RT=1606792494:R:S=ALNI_MZnrqPFwsIvZeuQrQkgkybkzgFoLQ; c_ref=https%3A//blog.csdn.net/huang_ftpjh/article/list/2; c-toolbar-writeguide=1; UN=huang_ftpjh; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_27464470450-1558504101613-960211!5744*1*huang_ftpjh!1788*1*PC_VC; announcement-new=%7B%22isLogin%22%3Atrue%2C%22announcementUrl%22%3A%22https%3A%2F%2Fblog.csdn.net%2Fblogdevteam%2Farticle%2Fdetails%2F112280974%3Futm_source%3Dgonggao_0107%22%2C%22announcementCount%22%3A0%2C%22announcementExpire%22%3A3600000%7D; UserName=huang_ftpjh; UserInfo=3ece835bdaf24564aa037860b9f5b015; UserToken=3ece835bdaf24564aa037860b9f5b015; UserNick=huang_ftpjh; AU=DCC; BT=1598231188850; p_uid=U010000; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1611223095; TY_SESSION_ID=656a21c7-b70b-4b97-8212-221147973357; dc_tos=qna3p3"

The draft is below; the final version is at the bottom of the article.

# -*- coding: utf-8 -*-

import json
import uuid
import time
import requests
import datetime
from bs4 import BeautifulSoup
import re

# bug 1: BeautifulSoup(reply.text, "lxml") came back empty; using 'html5lib' instead of 'lxml' fixed it
# bug 2: the HTML contains \xa0 \n \t \r and entities like &amp; &lt; &gt;, presumably from Unicode/entity encoding
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
    # site Host header
    'Host': 'blog.csdn.net'
}


def request_blog_list(page):
    """获取博客列表
    主要包括博客的id以及发表时间等
    https://blog.csdn.net/huang_ftpjh/article/list/1  cookie信息自己复制
    "cookie":"uuid_tt_dd=10_20621362900-1586421086666-163599; dc_session_id=10_1586421086666.420623; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1586421618; dc_sid=d4ceee41911ac755c162110ff811aee3; __gads=ID=608336bee91baf3d:T=1586421689:S=ALNI_MZozulzITWLw3Hxzo3jrnu5fmz8CA; c_ref=https%3A//blog.csdn.net/pang787559613/article/list/2; c-toolbar-writeguide=1; SESSION=3b5e7c88-b27d-4fcc-a2d5-4e97c1438a3c; UN=pang787559613; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_20621362900-1586421086666-163599!5744*1*pang787559613; announcement=%257B%2522isLogin%2522%253Atrue%252C%2522announcementUrl%2522%253A%2522https%253A%252F%252Fblog.csdn.net%252Fblogdevteam%252Farticle%252Fdetails%252F105203745%2522%252C%2522announcementCount%2522%253A0%252C%2522announcementExpire%2522%253A3600000%257D; UserName=pang787559613; UserInfo=604f13922dc04f2d8071fe0834e95db3; UserToken=604f13922dc04f2d8071fe0834e95db3; UserNick=%E7%AC%91%E8%83%96%E4%BB%94; AU=5FE; BT=1586422795507; p_uid=U000000; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1586422842; TY_SESSION_ID=ffc735f4-f5ae-40ed-9b98-ff01e337bf76; dc_tos=q8ijsv",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"     浏览器输入chrome://version/  复制用户代理
    """
    print("进入request_blog_list  获取博客列表")
    url = f'https://blog.csdn.net/huang_ftpjh/article/list/{page}'
    reply = requests.get(url, headers=headers, timeout=10)
    print("url==", url)
    print("reply==", reply)
    # originally tried reply.content / reply.text
    parse = BeautifulSoup(reply.text, "html5lib")  # 'lxml' came back empty, see bug 1
    # print("parse==" , parse)
    spans = parse.find_all('div', attrs={'class': 'article-item-box csdn-tracking-statistics'})
    # print("spans==" , spans)
    blogs = []
    print("进入request_blog_list 参数准备")
    for span in spans[:40]:
        try:
            print("request_blog_list循环1")
            href = span.find('a', attrs={'target': '_blank'})['href']
            print("request_blog_list 循环2  获取阅读次数 CSDN该页面的class以前是num  现在改成read-num")
            read_num = span.find('span', attrs={'class': 'read-num'}).get_text()
            print("request_blog_list循环3   获取编辑时间")
            date = span.find('span', attrs={'class': 'date'}).get_text()
            print("request_blog_list循环4")
            blog_id = href.split("/")[-1]
            print("request_blog_list循环5")
            blogs.append([blog_id, date, read_num])
            print("request_blog_list循环6")
            print(href)
        except:
            print("request_blog_list循环错误")
            print('Wrong, ' + href)
    return blogs


def request_md(blog_id, date):
    """Fetch the JSON payload that contains the blog's markdown text."""
    print("Entering request_md")
    url = f"https://blog-console-api.csdn.net/v1/editor/getArticle?id={blog_id}"
    headers = {
        "cookie": "uuid_tt_dd=10_27464470450-1558504101613-960211; dc_session_id=10_1611223080042.656536.420623; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1611218803,1611219167,1611219728,1611220363; dc_sid=04ae20224e8511583ba1508a79e6c379; __gads=ID=ID=ea1401c48f7c8e89-224ef746f8c4004c:T=1606792494:RT=1606792494:R:S=ALNI_MZnrqPFwsIvZeuQrQkgkybkzgFoLQ; c_ref=https%3A//blog.csdn.net/huang_ftpjh/article/list/2; c-toolbar-writeguide=1; UN=huang_ftpjh; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_27464470450-1558504101613-960211!5744*1*huang_ftpjh!1788*1*PC_VC; announcement-new=%7B%22isLogin%22%3Atrue%2C%22announcementUrl%22%3A%22https%3A%2F%2Fblog.csdn.net%2Fblogdevteam%2Farticle%2Fdetails%2F112280974%3Futm_source%3Dgonggao_0107%22%2C%22announcementCount%22%3A0%2C%22announcementExpire%22%3A3600000%7D; UserName=huang_ftpjh; UserInfo=3ece835bdaf24564aa037860b9f5b015; UserToken=3ece835bdaf24564aa037860b9f5b015; UserNick=huang_ftpjh; AU=DCC; BT=1598231188850; p_uid=U010000; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1611223095; TY_SESSION_ID=656a21c7-b70b-4b97-8212-221147973357; dc_tos=qna3p3",
        "user-agent": "Mozilla/5.0(Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome / 72.0.3626.109 Safari/537.36"
    }
    data = {"id": blog_id}
    reply = requests.get(url, headers=headers, data=data)
    reply.encoding = "utf-8"

    # print(reply.json())
    try:
        print("准备写入", blog_id)
        write_hexo_md(reply.json(), date)
    except Exception as e:
        print("***********************************")
        print(e)
        print(url)
        # print(reply.json())


def write_hexo_md(data, date):
    """将获取的json数据解析为hexo的markdown格式"""
    title = data["data"]["title"]
    title = title.replace("[", "【")
    title = title.replace("]", "】")

    # content = content.replace("\xa0", " ")
    # content = content.replace("\xa9", " ")
    # content = content.replace("&", " ")
    # content = content.replace("@[toc]", "").replace('\n', '').replace('\r', '')
    # content = content.replace('&lt;', '<'+'')
    # content = content.replace('&gt;', '>'+'')
    # content = content.replace('lt;', '<'+'')
    # content = content.replace('gt;', '>'+'')
    # content = content.replace('&quot;', '\"')

    content = data["data"]["content"] + data["data"]["markdowncontent"]
    #  1. the hand-rolled converter still has some flaws
    #md = text_to_md(content)

    #  2. md = tomd.Tomd(content).markdown  # deletes the code blocks it cannot recognize; everything else converts fine

    #if '<pre class=' in content:
    #    md = tomd.Tomd(content).markdown
    #else:
    md = text_to_md(content)

    # this is for blog migration, so only the content is kept
    with open(f"blogs/{title}.md", "w", encoding="utf-8") as f:
        f.write(md)
    print(f"写入 {title}")




def text_to_md(a):
    # strip leading/trailing whitespace
    a = a.strip()

    member = ['\xa0', '\xa9', '&amp;', '&quot;','&#39;', '&lt;', 'lt;', '&gt;', 'gt;', '@[toc]', '\r', '\t']  # ,'\n'
    change = [' ', ' ', ' ', '\"', '\'','<', '<', '>', '>', '', '', '']  # , ''

    for inx, val in enumerate(change):
        a = a.replace(member[inx], val)

    # headings: strip numeric prefixes like "1.2 " and end each heading with a blank line
    a = re.sub(r'<h1>.*?\d*\. (?P<name>.*?)</h1>', r'<h1>\g<name>\n\n</h1>', a)
    a = re.sub(r'<h1.*?>', '# ', a)
    a = re.sub(r'<h2>.*?\d*\.\d* (?P<name>.*?)</h2>', r'<h2>\g<name>\n\n</h2>', a)
    a = re.sub(r'<h2.*?>', '## ', a)
    a = re.sub(r'<h3>.*?\d*\.\d*\.\d* (?P<name>.*?)</h3>', r'<h3>\g<name>\n\n</h3>', a)
    a = re.sub(r'<h3.*?>', '### ', a)
    a = re.sub(r'<h4>.*?\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h4>', r'<h4>\g<name>\n\n</h4>', a)
    a = re.sub(r'<h4.*?>', '#### ', a)
    a = re.sub(r'<h5>.*?\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h5>', r'<h5>\g<name>\n\n</h5>', a)
    a = re.sub(r'<h5.*?>', '##### ', a)
    a = re.sub(r'<h6>.*?\d*\.\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h6>', r'<h6>\g<name>\n\n</h6>', a)
    a = re.sub(r'<h6.*?>', '###### ', a)
    a = re.sub(r'</h1>|</h2>|</h3>|</h4>|</h5>|</h6>', "", a)  # the original trailing | matched the empty string

    # TODO: code fences (the three backticks) -- some HTML tags inside the code are still not removed
    if '<pre class=' in a:
        a = re.sub('<pre class="has"><code class="language-bash">', '```', a)
        a = re.sub('</code></pre>', '\n```', a)

    a = re.sub('<strong>|</strong>', '**', a)
    # span tags
    a = re.sub('<span.*?>|</span>', '', a)

    # leftover pre tags
    a = re.sub('<pre.*?>|</pre>', '', a)

    # p tags
    a = re.sub('<p.*?>|</p>', '', a)

    # br tags
    a = re.sub('<br/>', '\n', a)

    # ul and li tags
    a = re.sub('<ul.*?>|</ul>|</li>', '', a)
    a = re.sub('<li.*?>', '- ', a)
    return a


def main(total_pages=2):
    """
    获取博客列表,包括id,时间
    获取博客markdown数据
    保存hexo格式markdown
    """
    blogs = []
    for page in range(1, total_pages + 1):
        blogs.extend(request_blog_list(page))
    for blog in blogs:
        blog_id = blog[0]
        date = blog[1].split()[0].split("-")
        request_md(blog_id, date)
        time.sleep(1)


if __name__ == '__main__':
    main()

Unresolved issues: code blocks inside articles are still not handled properly (my hand-rolled converter has flaws, and the Python package simply deletes them), and only publicly visible content can be fetched (you have to go through the console page rather than the https://blog.csdn.net/huang_ftpjh page).
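A small debugging aid for that second problem (a sketch; it relies only on the reply.json() fields already used in write_hexo_md, so whatever status/message fields CSDN returns just get dumped as-is):

def inspect_reply(reply):
    """Show what the console API actually returned when an article cannot be exported."""
    payload = reply.json()
    data = payload.get("data") or {}
    if not data.get("markdowncontent"):
        # for a private article or a stale cookie, the raw payload is more
        # informative than the bare exception caught in request_md
        print("unexpected payload:", payload)
        return False
    return True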

The code-handling logic is shown below. How can it be improved? Even after my changes, problems remain.

# TODO: code fences -- some HTML tags inside the code are still not removed
    if '<pre class=' in a:
        a = re.sub('<pre class="has"><code class="language-bash">', '```', a)
        a = re.sub('</code></pre>', '\n```', a)
changed to:
    if '<pre class=' in a:
        a = re.sub('<pre.*?>', '', a)
        a = re.sub('<code.*?>', '```', a)
        a = re.sub('</code>', '\n```', a)
        a = re.sub('</pre>', '\n', a)
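A more robust direction (a sketch, not the version used in the final code below): let BeautifulSoup rewrite each <pre><code> pair into a fenced block before any regex pass, so the code text itself never meets the tag-stripping substitutions. The language-from-class guess assumes CSDN markup like class="language-bash" seen above.

from bs4 import BeautifulSoup

def fence_code_blocks(html):
    """Turn each <pre><code class="language-x">...</code></pre> into a fenced block."""
    # html.parser keeps fragments as fragments (html5lib would wrap them in <html><body>)
    soup = BeautifulSoup(html, "html.parser")
    for pre in soup.find_all("pre"):
        code = pre.find("code")
        if code is None:
            continue
        # e.g. class="language-bash" -> "bash"; no class -> plain fence
        classes = code.get("class") or []
        lang = next((c[len("language-"):] for c in classes
                     if c.startswith("language-")), "")
        pre.replace_with(f"\n```{lang}\n{code.get_text()}\n```\n")
    return str(soup)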

 

Final version

# -*- coding: utf-8 -*-

import time
import requests
from bs4 import BeautifulSoup
import re
import tomd
import html2md
# 2020/1/25 https://github.com/davidcavazos/html2md

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    # site Host header
    'Host': 'blog.csdn.net'
}


def request_blog_list(page):
    """获取博客列表
    主要包括博客的id以及发表时间等
    """
    print("进入request_blog_list  获取博客列表")
    url = f'https://blog.csdn.net/huang_ftpjh/article/list/{page}'
    reply = requests.get(url, headers=headers, timeout=10)
    print("url==", url)
    print("reply==", reply)
    parse = BeautifulSoup(reply.text, "html5lib")  # 'lxml' came back empty, see bug 1 in the draft
    spans = parse.find_all('div', attrs={'class': 'article-item-box csdn-tracking-statistics'})
    blogs = []
    for span in spans[:40]:
        try:
            href = span.find('a', attrs={'target': '_blank'})['href']
            # read count; the class on this page used to be 'num', CSDN changed it to 'read-num'
            read_num = span.find('span', attrs={'class': 'read-num'}).get_text()
            # edit date
            date = span.find('span', attrs={'class': 'date'}).get_text()
            blog_id = href.split("/")[-1]
            blogs.append([blog_id, date, read_num])
            print(href)
        except Exception:
            print('Wrong, ' + href)
    return blogs


def request_md(blog_id, date):
    """获取博客包含markdown文本的json数据"""
    url = f"https://blog-console-api.csdn.net/v1/editor/getArticle?id={blog_id}"
    headers = {
        "cookie": "uuid_tt_dd=10_27464470450-1558504101613-960211; dc_session_id=10_1611538618913.836622; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1611540068,1611541339,1611541518,1611541902; dc_sid=a69bc3ceab3d7da75fdaeb27333b4dd9; __gads=ID=ea1401c48f7c8e89-224ef746f8c4004c:T=1606792494:RT=1606792494:R:S=ALNI_MZnrqPFwsIvZeuQrQkgkybkzgFoLQ; c_ref=https%3A//mp.csdn.net/console/home; UN=huang_ftpjh; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_27464470450-1558504101613-960211!5744*1*huang_ftpjh!1788*1*PC_VC; announcement-new=%7B%22isLogin%22%3Atrue%2C%22announcementUrl%22%3A%22https%3A%2F%2Fblog.csdn.net%2Fblogdevteam%2Farticle%2Fdetails%2F112280974%3Futm_source%3Dgonggao_0107%22%2C%22announcementCount%22%3A0%2C%22announcementExpire%22%3A3600000%7D; UserName=huang_ftpjh; UserInfo=3ece835bdaf24564aa037860b9f5b015; UserToken=3ece835bdaf24564aa037860b9f5b015; UserNick=huang_ftpjh; AU=DCC; BT=1598231188850; p_uid=U010000; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1611542781; TY_SESSION_ID=0199273e-097b-4900-8285-4e8bf54b8ca7; dc_tos=qngyd8",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    }
    data = {"id": blog_id}
    reply = requests.get(url, headers=headers, data=data)
    reply.encoding = "utf-8"

    try:
        print("准备写入", blog_id)
        write_hexo_md(reply.json(), date)
    except Exception as e:
        print("***********************************")
        print(e)
        print(url)


def write_hexo_md(data, date):
    """将获取的json数据解析为hexo的markdown格式"""
    title = data["data"]["title"]
    title = title.replace("[", "【")
    title = title.replace("]", "】")

    content = data["data"]["content"] + data["data"]["markdowncontent"]
    #  1. the hand-rolled converter still has some flaws
    #md = text_to_md(content)

    #  2. md = tomd.Tomd(content).markdown  # deletes the code blocks it cannot recognize; everything else converts fine

    # 3. this is the one
    md = html2md.convert(content)

    # write the .md file
    with open(f"blogs/{title}.md", "w", encoding="utf-8") as f:
        f.write(md)
    print(f"写入 {title}")




def text_to_md(a):
    # strip leading/trailing whitespace
    a = a.strip()

    member = ['\xa0', '\xa9', '&amp;', '&quot;','&#39;', '&lt;', 'lt;', '&gt;', 'gt;', '@[toc]', '\r', '\t']  # ,'\n'
    change = [' ', ' ', ' ', '\"', '\'','<', '<', '>', '>', '', '', '']  # , ''

    for inx, val in enumerate(change):
        a = a.replace(member[inx], val)

    # headings: strip numeric prefixes like "1.2 " and end each heading with a blank line
    a = re.sub(r'<h1>.*?\d*\. (?P<name>.*?)</h1>', r'<h1>\g<name>\n\n</h1>', a)
    a = re.sub(r'<h1.*?>', '# ', a)
    a = re.sub(r'<h2>.*?\d*\.\d* (?P<name>.*?)</h2>', r'<h2>\g<name>\n\n</h2>', a)
    a = re.sub(r'<h2.*?>', '## ', a)
    a = re.sub(r'<h3>.*?\d*\.\d*\.\d* (?P<name>.*?)</h3>', r'<h3>\g<name>\n\n</h3>', a)
    a = re.sub(r'<h3.*?>', '### ', a)
    a = re.sub(r'<h4>.*?\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h4>', r'<h4>\g<name>\n\n</h4>', a)
    a = re.sub(r'<h4.*?>', '#### ', a)
    a = re.sub(r'<h5>.*?\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h5>', r'<h5>\g<name>\n\n</h5>', a)
    a = re.sub(r'<h5.*?>', '##### ', a)
    a = re.sub(r'<h6>.*?\d*\.\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h6>', r'<h6>\g<name>\n\n</h6>', a)
    a = re.sub(r'<h6.*?>', '###### ', a)
    a = re.sub(r'</h1>|</h2>|</h3>|</h4>|</h5>|</h6>', "", a)  # the original trailing | matched the empty string

    # TODO: code fences -- some HTML tags inside the code are still not removed
    if '<pre class=' in a:
        a = re.sub('<pre.*?>', '', a)
        a = re.sub('<code.*?>', '```', a)
        a = re.sub('</code>', '\n```', a)
        a = re.sub('</pre>', '\n', a)



    a = re.sub('<strong>|</strong>', '**', a)
    # span tags
    a = re.sub('<span.*?>|</span>', '', a)

    # leftover pre tags
    a = re.sub('<pre.*?>|</pre>', '', a)

    # p tags
    a = re.sub('<p.*?>|</p>', '', a)

    # br tags
    a = re.sub('<br/>', '\n', a)

    # ul and li tags
    a = re.sub('<ul.*?>|</ul>|</li>', '', a)
    a = re.sub('<li.*?>', '- ', a)
    return a


def main(total_pages=2):
    """
    获取博客列表,包括id,时间
    获取博客markdown数据
    保存hexo格式markdown
    """
    blogs = []
    for page in range(1, total_pages + 1):
        blogs.extend(request_blog_list(page))
    for blog in blogs:
        blog_id = blog[0]
        date = blog[1].split()[0].split("-")
        request_md(blog_id, date)
        time.sleep(1)


if __name__ == '__main__':
    main()
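For reference, the html2md conversion step in isolation (the sample HTML is made up; the exact markdown produced depends on the library version):

import html2md

sample = ('<h2>1.1 Install</h2><p>Run <strong>pip</strong>:</p>'
          '<pre class="has"><code class="language-bash">pip install requests</code></pre>')
print(html2md.convert(sample))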

 

 
