csdn文章分类修改脚本

以前写的文章分类太多,太乱了,所以决定来一次清理,把一些文章类别给替换掉。和LX同学讨论了一个下午,终于有一个方案了,搞了一晚上才弄好。

我是混合了python和js代码一起做的。js可以在浏览器的控制台下运行,不用再处理登录,所以很方便。python我用得比较习惯,所以用来处理数据比较方便,而且数据不会丢失。


步骤如下:

step 1:先把文章列表抓下来,知道有哪些文章。注意:markdown编写的文章不能处理!它们在后面的步骤里都会执行失败,id会被记进eid数组!

import urllib.request

import re
# Step 1: crawl the blog's article-list pages and dump every article id as a
# JavaScript array literal (js_array.txt) for the browser-console steps.

LIST_URL = "http://blog.csdn.net/firenet1/article/list/"
MAX_PAGES = 30  # number of list pages to try (estimated at 15 articles per page)
ARTICLE_LINK = re.compile(r"/firenet1/article/details/([0-9]{5,10})")


def extract_article_ids(html):
    """Return the id of every article link found in *html*.

    Ids are returned in document order and may repeat; the caller
    de-duplicates them. The whole page is scanned: the original passed regex
    flags to ``Pattern.findall``, whose second argument is the start position,
    so the first 26 characters of each page were silently skipped.
    """
    return ARTICLE_LINK.findall(html)


def build_id_array_js(article_ids):
    """Format *article_ids* as ``var id_array = new Array("id1",\\n"id2");``."""
    quoted = ['"%s"' % article_id for article_id in article_ids]
    return "var id_array = new Array(" + ",\n".join(quoted) + ");"


def collect_article_ids(max_pages=MAX_PAGES):
    """Fetch list pages 1..max_pages and return the unique ids in first-seen order."""
    ids = {}  # dict used as an insertion-ordered set of article ids
    for page in range(1, max_pages + 1):
        url = LIST_URL + str(page)
        try:
            raw = urllib.request.urlopen(url).read()
        except Exception as err:
            # Best effort, as before: a page that cannot be fetched is skipped.
            # Unlike a bare ``except`` this lets Ctrl-C stop the script.
            print("skip page %d: %s" % (page, err))
            continue
        found = extract_article_ids(raw.decode("UTF-8"))
        print(len(found))
        ids.update(dict.fromkeys(found, 1))
    return ids


def dump_article_ids():
    """Collect all article ids and write them to js_array.txt."""
    ids = collect_article_ids()
    print(len(ids))
    with open("js_array.txt", "w", encoding="utf-8") as out_file:
        out_file.write(build_id_array_js(ids))


if __name__ == "__main__":
    dump_article_ids()
'''
输出文件保存成js数组
var id_array = new Array("77187506",
"77073144",
"77046721",
"76766916",
"76642319",
"76195994")
'''
step 2: 打开一个csdn博客编辑页,任何一页都行。F12进入控制台,在控制台里执行以下代码,这一步就能获得大部分文章的id,tag,类别。代码粘贴进去之后,先调用 getdata() 开始抓取,等控制台打印出 finish 之后,再调用 output() 输出结果。

// Article ids produced by step 1 (contents of js_array.txt).
var id_array = new Array("77187506",
"77073144",
"77046721"
);
// Shared state for scraping each article's categories and tags.
var time_out_th = 500; // delay between polls, passed to setTimeout (milliseconds)
var i;   // index into id_array of the article currently being processed
var a;   // window handle returned by window.open for the current article
var id;  // id of the current article
var out; // collected rows of [id, txtTag value, d_tag2 innerHTML]
var eid; // ids of the articles whose scrape threw; print it to inspect them
// Entry point: reset the shared state, open the first article in a new
// window and schedule the first poll. doing() then handles one article at a
// time, re-arming the timer after each poll.
function getdata(){
    out = [];
    eid = [];
    i = 0;
    id = id_array[0];
    // The bare id is used as a relative URL, so it resolves against the page
    // the console is attached to (an editor page, per the step description).
    a = window.open(id, "_blank");
    setTimeout(doing, time_out_th);
}
// Poll the window opened for the current article. Once it has finished
// loading, record [id, txtTag value, d_tag2 innerHTML] in `out` and move on
// to the next article; if reading the page throws, record the id in `eid`
// and move on as well. Re-arms itself with setTimeout until every id has
// been handled, then logs "finish" and returns i.
function doing(){
    // Close the current window (when there is one) and open the next
    // article. Returns false once every id has been processed.
    function openNext(){
        // window.open yields null when the popup is blocked; calling close()
        // on it inside the catch handler used to abort the whole run.
        if(a) a.close();
        i++;
        if(i == id_array.length) {
            console.log("finish");
            return false;
        }
        id = id_array[i];
        a = window.open(id,"_blank");
        return true;
    }

    if(i == id_array.length) {
        console.log("finish");
        return i;
    }
    try{
        // Only read the page once it has finished loading; otherwise fall
        // through and poll again after time_out_th milliseconds.
        if(a != 0 && a.document.readyState == "complete"){
            let cla = a.document.getElementById("txtTag").value;
            let lable = a.document.getElementById("d_tag2").innerHTML;
            // output() emits one article per line, so every newline has to
            // go; replace("\n","") only removed the first one.
            lable = lable.replace(/\n/g,"");
            out.push(new Array(id,cla,lable));
            console.log("ok: "+i);
            if(!openNext()) return i;
        }
    }
    catch(err){
        eid.push(id);
        console.log("err: "+i);
        if(!openNext()) return i;
    }
    setTimeout(doing,time_out_th);
}
// Print every collected row as "id###,###categories###,###tagsHtml", one row
// per line, in a single console.log call so it can be copied in one go.
function output(){
    const lines = out.map(function(row){
        return row[0]+"###,###"+row[1]+"###,###"+row[2]+"\n";
    });
    console.log(lines.join(""));
}
/*
输出数据如下:复制粘贴到本地文本文件(step 4 里读取的 classify_lable.txt),用于后面的步骤
47311009###,###多校联合训练赛###,###<span title="单击删除该标签">hdu 5328</span><span title="单击删除该标签">hdu</span>
*/
step 3:打开自己的类别管理,F12进入控制台,执行以下代码

// List every category name, one per line, each followed by "###,###" so the
// replacement categories can be typed in after it.
var x = document.getElementsByClassName("tdleft");
var y = "";
// Index 0 is skipped — presumably the table's header cell (TODO confirm).
for(var i = 1;i < x.length; i++){
    y += (x[i].firstChild.innerHTML)+"###,###\n";
}
console.log(y);
/**
输出如下:每一行就是一个类别。在 ###,### 后面手动填上想把这个类别换成的新类别,多个新类别用英文逗号分隔;不想替换的类别后面可以留空。填好后保存成 step 4 里读取的 classfyMap.txt
动态规划###,###ACM-ICPC编程题,动态规划
数据结构###,###ACM-ICPC编程题,数据结构
字符串###,###ACM-ICPC编程题,字符串
模拟###,###ACM-ICPC编程题,模拟
*/

step 4:把所有文章的类别和tag都换成新的,我的tag是使用原来的tag,如果不够5个,会把原先的类别变成tag。python代码
import urllib.request

import re
# Step 4: give every article its new categories and tags, and print the result
# as a JavaScript array for step 5.
#
# Inputs (both UTF-8, fields separated by "###,###"):
#   classify_lable.txt  id###,###old categories###,###tag <span> HTML  (step 2)
#   classfyMap.txt      old category###,###new1,new2,...               (step 3)

FIELD_SEP = "###,###"
# One tag as rendered by the editor. Anything up to the closing tag is
# accepted, so tags with spaces or punctuation ("hdu 5328") are kept; the
# previous "[\w]+" pattern silently dropped them.
TAG_SPAN = re.compile(r'<span title="单击删除该标签">([^<]+)</span>')


def parse_article_line(line):
    """Parse one line of classify_lable.txt.

    Returns the split fields with the third one replaced by the list of tag
    names, or ``None`` for a line with fewer than three fields (for example a
    blank line), which used to raise IndexError.
    """
    fields = line.rstrip("\n").split(FIELD_SEP)
    if len(fields) < 3:
        return None
    fields[2] = TAG_SPAN.findall(fields[2])
    return fields


def parse_category_line(line):
    """Parse one line of classfyMap.txt into ``(old_name, [new_names])``.

    Returns ``None`` when the line names no replacement: nothing after the
    separator, no separator at all, or a blank line (the last two used to
    raise IndexError).
    """
    fields = line.replace("\n", "").split(FIELD_SEP)
    if len(fields) < 2 or not fields[1]:
        return None
    return fields[0], fields[1].split(",")


def remap_article(article, origin):
    """Return ``[id, tags, categories]`` for one parsed article.

    Tags are the article's existing tags followed by its old category names.
    Categories are the replacements of every old category listed in *origin*.
    Dicts are used as insertion-ordered sets, so neither list has duplicates.
    """
    # NOTE(review): the article text says old categories become tags only when
    # there are fewer than 5 tags; the code has always added them
    # unconditionally, and that behaviour is kept here.
    tags = {}
    categories = {}
    for tag in article[2]:
        if tag:
            tags[tag] = 1
    for old_name in article[1].split(","):
        if old_name:
            tags[old_name] = 1
        if old_name in origin:
            for new_name in origin[old_name]:
                categories[new_name] = 1
    if not categories:
        # Fallback kept from the original: a tag that is itself an old
        # category name is used as the category as-is.
        # NOTE(review): mapping it through origin[tag] may have been intended.
        for tag in tags:
            if tag in origin:
                categories[tag] = 1
    return [article[0], list(tags), list(categories)]


def build_arti_js(new_articles):
    """Format ``[id, tags, categories]`` rows as the ``var arti = [...]`` literal.

    Tags become editor ``<span>`` elements and categories are prefixed with
    "##". Every value is emitted through ``json.dumps`` so that a quote or
    backslash in a name cannot break the generated JavaScript; ordinary names
    produce exactly the same text as before.
    """
    import json

    rows = []
    for article_id, tags, categories in new_articles:
        tag_html = "".join(
            "<span title='单击删除该标签'>%s</span>" % tag for tag in tags
        )
        category_list = ",".join("##" + name for name in categories)
        values = (article_id, tag_html, category_list)
        rows.append(
            "[%s]" % ",".join(json.dumps(v, ensure_ascii=False) for v in values)
        )
    return "var arti = [" + ",\n".join(rows) + "];\n"


def load_articles(path="classify_lable.txt"):
    """Read the step-2 output and return the parsed articles."""
    with open(path, "r", encoding="utf-8") as f:
        parsed = (parse_article_line(line) for line in f)
        return [article for article in parsed if article is not None]


def load_category_map(path="classfyMap.txt"):
    """Read the category mapping.

    Returns ``(origin, now)``: *origin* maps an old category to its list of
    replacements, *now* holds every replacement name as a key.
    """
    origin = {}
    now = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            entry = parse_category_line(line)
            if entry is None:
                continue
            old_name, new_names = entry
            origin[old_name] = new_names
            for new_name in new_names:
                now[new_name] = 1
    return origin, now


def rebuild_categories():
    """Run step 4: load both files, remap every article, print the JS array."""
    articles = load_articles()
    print("articles: " + str(len(articles)))

    origin, now = load_category_map()
    print("old category: %d" % (len(origin)))
    print("now category: %d" % (len(now)))

    new_article = [remap_article(article, origin) for article in articles]
    nolable = sum(1 for article in new_article if not article[1])
    nocategory = sum(1 for article in new_article if not article[2])
    print("finale set: %d nolable: %d nocategory: %d"
          % (len(new_article), nolable, nocategory))

    print(build_arti_js(new_article))


if __name__ == "__main__":
    rebuild_categories()
step 5:这一步就把js代码放到控制台运行了,控制台还是要在编辑页面打开,这样就没有跨域的问题。因为url我没处理哦!代码粘贴进去之后调用 getdata() 开始执行。执行比较久,300+文章呢
// Update every article's categories and tags.
// Rows are [articleId, tagsHtml, categoryList] as printed by step 4.
var arti = [["77187506","<span title='单击删除该标签'>布隆过滤器</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
["77073144","<span title='单击删除该标签'>hyperloglog</span><span title='单击删除该标签'>基数计数</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
["77046721","<span title='单击删除该标签'>信号量</span><span title='单击删除该标签'>临界区</span><span title='单击删除该标签'>自旋锁</span><span title='单击删除该标签'>操作系统</span><span title='单击删除该标签'>我只想找工作</span>","##操作系统,##我只想找工作"],
["47438411","<span title='单击删除该标签'>2015多校联合训练赛</span><span title='单击删除该标签'>模拟</span>","##ACM-ICPC编程题,##模拟"]
];
var time_out_th = 500;  // delay between polls, passed to setTimeout (milliseconds)
var i;                  // index into arti of the article being processed
var a;                  // window handle returned by window.open for that article
var id;                 // current row of arti: [articleId, tagsHtml, categoryList]
var step = 0;           // 0 = waiting for the page to load, 1 = save clicked
var eid = new Array();  // ids of the articles whose update threw
// Entry point: reset the shared state, open the first article in a new
// window and schedule the first poll of doing().
function getdata(){
    eid = [];
    step = 0;
    i = 0;
    id = arti[0];
    // id[0] is the article id; it is opened as a relative URL, so it
    // resolves against the page the console is attached to.
    a = window.open(id[0], "_blank");
    setTimeout(doing, time_out_th);
}
// Drive the update of the current article through two phases, polling every
// time_out_th milliseconds:
//   step 0 - once the page has loaded, write the new categories and tags
//            into it and click the btnDraft button;
//   step 1 - on a later poll, once the page no longer reports `saving`,
//            close it and open the next article.
// An article whose update throws is recorded in `eid` and skipped. Logs
// "finish" and returns i after the last article.
function doing(){
    // Close the current window (when there is one) and open the next
    // article. Returns false once every row of arti has been processed.
    function openNext(){
        // window.open yields null when the popup is blocked; an unguarded
        // close() inside the catch handler used to abort the whole run.
        if(a) a.close();
        i++;
        step = 0;
        if(i == arti.length) {
            console.log("finish");
            return false;
        }
        id = arti[i];
        a = window.open(id[0],"_blank");
        return true;
    }

    if(i == arti.length) {
        console.log("finish");
        return i;
    }
    try{
        if(step == 0 && a != 0 && a.document.readyState == "complete"){
            a.document.getElementById("txtTag").value = id[2];
            a.document.getElementById("d_tag2").innerHTML = id[1];
            console.log("complete: "+i);
            a.document.getElementById("btnDraft").click();
            step = 1;
            // The save check is deliberately left to the next poll: testing
            // `saving` in the same tick as the click could close the window
            // before the page had started saving.
        }
        else if(step == 1 && !a.saving){
            // `saving` is presumably a flag the editor page keeps while a
            // save request is in flight — TODO confirm.
            console.log("ok: "+i);
            if(!openNext()) return i;
        }
    }
    catch(err){
        console.log(err);
        console.log("err: "+i);
        eid.push(id[0]);
        if(!openNext()) return i;
    }
    setTimeout(doing,time_out_th);
}
step 6:打开文章分类,把只有0篇的分类删除掉。alert手动确认,不然好像因为缓存的原因,删不掉后面的
// Delete every category that contains zero articles.
var a = document.getElementsByClassName("red");
for(var i = 0;i < a.length;i++){
    var b = a[i].href;
    // Assumes the category id is the last 7 characters of the link —
    // TODO confirm for ids of a different length.
    b = b.substring(b.length-7,b.length);
    var c = a[i].text;
    if(c == "0"){
        console.log(b+" "+c);
        // Sends a request of the form ...ory?t=del&id=1380215&r=83505
        $.get("?t=" + "del", { id: b, r: csdn.random() }, function (ret) {
            // The blocking alert is intentional: each deletion is confirmed by
            // hand, because without it later deletions did not seem to apply.
            alert(1);
        });
    }
}

源代码参考:https://gitlab.com/linyuwang/csdn-classfy-change/tree/master

step 7:按文章数对类型进行排序。在文章类别管理的console执行

// Sort the categories by article count, largest first, using only
// "move up by one" requests.
//
// The earlier version had to be run several times because it never updated
// the recorded position of the entry that a moved category jumps over, so
// displaced entries were later skipped as "already in place". Positions are
// now tracked for both entries of every swap. (The author also suspected
// page caching; that has not been verified.)
var a = document.getElementsByClassName("red");
var ids = new Array();  // rows of [categoryId, articleCount, currentPosition]
// Descending by article count.
var compare = function(x,y){
    if(x[1] > y[1]) return -1;
    if(x[1] == y[1]) return 0;
    return 1;
};
for(var i = 0;i < a.length;i++){
    var b = a[i].href;
    // Assumes the category id is the last 7 characters of the link —
    // TODO confirm for ids of a different length.
    b = b.substring(b.length-7,b.length);
    var c = a[i].text;
    ids.push(new Array(b,Number(c),i));
}
ids.sort(compare);
console.log(ids);

var i = 0;  // index in the sorted list of the next category to put in place
// Move ids[i] up one position per call until it sits at position i, then
// continue with the next entry. Re-arms itself every 500 ms and returns
// "finish" once every entry is in place.
function doing(){
    while(i != ids.length && ids[i][2] <= i){
        i++;
    }
    if(i == ids.length) return "finish";
    var pos = ids[i][2];
    // The entry directly above swaps places with ids[i]; record its new
    // position too. Entries before index i are already final, so only later
    // ones can be sitting there.
    for(var j = i + 1;j < ids.length;j++){
        if(ids[j][2] == pos - 1){
            ids[j][2] = pos;
            break;
        }
    }
    ids[i][2] = pos - 1;
    // doExec is provided by the category page; presumably it sends the "up"
    // request for this category id — TODO confirm.
    doExec("", ids[i][0], "up");
    console.log(i+","+ids[i][2]+","+ids[i][1]);
    setTimeout(doing,500);
}
doing();
















评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

GDRetop

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值