以前写的文章分类太多,太乱了,所以决定来一次清理,把一些文章类别给替换掉。和LX同学讨论了一个下午,终于有一个方案了,搞了一晚上才弄好。
我是混合了python和js代码一起做的。js可以在浏览器的控制台下运行,直接借用已经登录的会话,不用再处理登录,所以很方便。python我用的比较习惯,所以处理数据比较方便,而且不会丢失。
步骤如下:
step 1:先把文章列表抓下来,知道有哪些文章。markdown编写的文章不能处理!都进eid了!
-
"""Step 1: collect every article id of the blog and save them as a JS array.

The blog's article-list pages are downloaded one by one, every article link
found on them is reduced to its numeric id, and the de-duplicated ids are
written to ``js_array.txt`` as a JavaScript ``id_array`` declaration.
"""
import http.client
import re
import urllib.request

BLOG_LIST_URL = "http://blog.csdn.net/firenet1/article/list/"
# Number of list pages to visit; work out your own page count at 15 articles per page.
PAGE_COUNT = 30
# The capture group is the numeric article id inside an article link.
ARTICLE_LINK = re.compile(r"/firenet1/article/details/([0-9]{5,10})")


def extract_article_ids(html):
    """Return the id of every article link in *html* (duplicates included).

    The whole string is scanned.  Flags must not be passed to a compiled
    pattern's ``findall``: its second positional argument is the start
    position, so doing so silently skips the beginning of the page.
    """
    return ARTICLE_LINK.findall(html)


def build_js_array(ids):
    """Render *ids* as ``var id_array = new Array("..",\\n"..");``."""
    quoted = ('"%s"' % article_id for article_id in ids)
    return "var id_array = new Array(" + ",\n".join(quoted) + ");"


def collect_ids(pages=PAGE_COUNT):
    """Download list pages 1..*pages* and return the unique ids as dict keys.

    A page that cannot be downloaded is skipped.  The number of links found on
    each downloaded page is printed as a progress indicator.
    """
    ids = {}  # used as an insertion-ordered set of article ids
    for page in range(1, pages + 1):
        url = BLOG_LIST_URL + str(page)
        try:
            with urllib.request.urlopen(url) as response:
                raw = response.read()
        except (OSError, http.client.HTTPException):
            # Best effort: an unreachable or missing page is simply skipped.
            continue
        found = extract_article_ids(raw.decode("UTF-8"))
        print(len(found))
        ids.update(dict.fromkeys(found, 1))
    return ids


def main():
    """Collect all ids, report how many there are and write ``js_array.txt``."""
    ids = collect_ids()
    print(len(ids))
    with open("js_array.txt", "w", encoding="utf-8") as out_file:
        out_file.write(build_js_array(ids))


if __name__ == "__main__":
    main()

# The output file holds a JS array declaration such as:
#
# var id_array = new Array("77187506",
# "77073144",
# "77046721",
# "76766916",
# "76642319",
# "76195994");
step 2: 打开一个csdn博客编辑页,任何一页都行。F12进入控制台,在控制台里执行以下代码,然后调用 getdata() 开始抓取;等控制台打印 finish 之后,再调用 output() 把结果打印出来。这一步就能获得大部分文章的id,tag,类别
-
// Article ids produced by step 1 (paste the complete generated array here).
var id_array = new Array("77187506",
    "77073144",
    "77046721"
);

// Scrape the tags and categories of every article.
var time_out_th = 500; // delay in ms between two polls of the opened page
var i;                 // index into id_array of the article being processed
var a;                 // handle of the window opened for the current article
var id;                // id of the current article
var out;               // collected rows: [id, categories, tags HTML]
var eid;               // ids of the articles that failed; print it to inspect them
-
/**
 * Entry point: resets the result and error lists, opens the edit page of the
 * first article in a new window and schedules the first poll. Articles are
 * processed strictly one after another, driven by the timeout.
 */
function getdata(){
    out = [];
    eid = [];
    i = 0;
    id = id_array[0];
    a = window.open(id, "_blank"); // open the edit page in a new window
    setTimeout(doing, time_out_th);
}
-
/**
 * Moves on to the next article: advances the index and opens its edit page.
 * Returns false (after logging "finish") when there is no article left.
 */
function openNextArticle(){
    i++;
    if(i == id_array.length){
        console.log("finish");
        return false;
    }
    id = id_array[i];
    a = window.open(id, "_blank");
    return true;
}

/**
 * Polling step. Once the opened page has finished loading, reads the
 * category field ("txtTag") and the tag markup ("d_tag2"), stores them in
 * `out`, closes the window and opens the next article. An article whose page
 * cannot be read is recorded in `eid` and skipped. Re-schedules itself until
 * every article has been visited.
 */
function doing(){
    if(i == id_array.length){
        console.log("finish");
        return i;
    }
    try{
        // Only read the page once it has finished loading.
        if(a != 0 && a.document.readyState == "complete"){
            let cla = a.document.getElementById("txtTag").value;
            let lable = a.document.getElementById("d_tag2").innerHTML;
            // Strip every newline (not just the first one): output() emits
            // one line per article, so a leftover newline would split a row.
            lable = lable.replace(/\n/g, "");
            out.push([id, cla, lable]);
            a.close();
            console.log("ok: "+i);
            if(!openNextArticle()){
                return i;
            }
        }
    }
    catch(err){
        eid.push(id);
        console.log("err: "+i);
        // window.open returns null when the popup is blocked; closing it
        // unguarded would throw here and silently stop the whole run.
        if(a){
            a.close();
        }
        if(!openNextArticle()){
            return i;
        }
    }
    setTimeout(doing, time_out_th);
}
-
/**
 * Prints every collected row as "id###,###categories###,###tags HTML",
 * one row per line, in a single console.log call.
 */
function output(){
    const rows = out.map(function(row){
        return row[0] + "###,###" + row[1] + "###,###" + row[2] + "\n";
    });
    console.log(rows.join(""));
}

/*
Sample output; copy it into a local text file for the following steps:

47311009###,###多校联合训练赛###,###<span title="单击删除该标签">hdu 5328</span><span title="单击删除该标签">hdu</span>
*/
step 3:打开自己的类别管理,F12进入控制台,执行以下代码
-
// List all existing article categories, one per line.
var x = document.getElementsByClassName("tdleft");
var y = "";
// The element at index 0 is skipped (presumably the table header; verify on the page).
for(var i = 1; i < x.length; i++){
    y += x[i].firstChild.innerHTML + "###,###\n";
}
console.log(y);

/**
Sample output. Each line is one category. Whatever you write after "###,###"
is the comma-separated list of categories that should replace it; it may be
left empty.

动态规划###,###ACM-ICPC编程题,动态规划
数据结构###,###ACM-ICPC编程题,数据结构
字符串###,###ACM-ICPC编程题,字符串
模拟###,###ACM-ICPC编程题,模拟
*/
step 4:把所有文章的类别和tag都换成新的,我的tag是使用原来的tag,如果不够5个,会把原先的类别变成tag。python代码
-
"""Step 4: compute the new tags and categories of every article.

Reads ``classify_lable.txt`` (the output of step 2) and ``classfyMap.txt``
(the edited output of step 3), replaces the old categories by the new ones
and prints a JavaScript ``arti`` array that step 5 consumes.
"""
import json
import re
import urllib.request

FIELD_SEP = "###,###"
# One tag as rendered by the editor.  The tag text is everything up to the
# closing tag, so tags containing spaces or hyphens ("hdu 5328") are kept.
TAG_SPAN = re.compile(r'<span title="单击删除该标签">([^<]+)</span>')


def parse_article_line(line):
    """Parse one ``id###,###categories###,###tag-html`` line.

    Returns ``[article_id, categories_string, [tag, ...]]``, or ``None`` when
    the line does not contain three fields (e.g. a blank line).
    """
    fields = line.rstrip("\n").split(FIELD_SEP, 2)
    if len(fields) < 3:
        return None
    article_id, categories, tag_html = fields
    return [article_id, categories, TAG_SPAN.findall(tag_html)]


def parse_category_map(lines):
    """Parse ``old###,###new1,new2`` lines.

    Returns ``(origin, now)``: ``origin`` maps an old category to the list of
    categories replacing it, ``now`` has every new category as a key.  Lines
    without a replacement part are ignored.
    """
    origin = {}
    now = {}
    for line in lines:
        old, _, replacement = line.rstrip("\n").partition(FIELD_SEP)
        if not replacement:
            continue
        targets = replacement.split(",")
        origin[old] = targets
        now.update(dict.fromkeys(targets, 1))
    return origin, now


def remap_article(article, origin):
    """Return ``[article_id, tags, categories]`` for one parsed article.

    Tags are the original tags followed by the old categories; categories are
    the replacements of the old categories.  Dicts are used as ordered sets so
    neither list contains duplicates.
    """
    article_id, old_categories, tags = article
    labels = {}
    categories = {}
    for tag in tags:
        if tag:
            labels[tag] = 1
    # NOTE(review): the accompanying write-up says old categories become tags
    # only when an article has fewer than 5 tags; this code adds them
    # unconditionally -- confirm which is intended.
    for old in old_categories.split(","):
        if old:
            labels[old] = 1
            for new in origin.get(old, ()):
                categories[new] = 1
    if not categories:
        # Fallback: a tag whose text is itself a mapped old category name is
        # used as the category (the name itself, not its replacements).
        for label in labels:
            if label in origin:
                categories[label] = 1
    return [article_id, list(labels), list(categories)]


def render_js_array(new_articles):
    """Render the articles as the ``var arti = [...]`` declaration for step 5.

    Every new category is prefixed with ``##``.  Strings are emitted with
    ``json.dumps`` so quotes and backslashes in a tag cannot break the
    generated JavaScript.
    """
    rows = []
    for article_id, labels, categories in new_articles:
        tag_html = "".join(
            "<span title='单击删除该标签'>%s</span>" % label for label in labels
        )
        category_text = ",".join("##" + category for category in categories)
        cells = (article_id, tag_html, category_text)
        rows.append(
            "[" + ",".join(json.dumps(cell, ensure_ascii=False) for cell in cells) + "]"
        )
    return "var arti = [" + ",\n".join(rows) + "];\n"


def main():
    """Run the whole conversion and print the statistics and the JS array."""
    # Original tags and categories of every article.
    with open("classify_lable.txt", "r", encoding="utf-8") as f:
        articles = [a for a in map(parse_article_line, f) if a is not None]
    print("articles: " + str(len(articles)))

    # Old category -> new categories mapping.
    with open("classfyMap.txt", "r", encoding="utf-8") as f:
        origin, now = parse_category_map(f)
    print("old category: %d" % (len(origin)))
    print("now category: %d" % (len(now)))

    new_article = [remap_article(a, origin) for a in articles]
    nolable = sum(1 for a in new_article if not a[1])
    nocategory = sum(1 for a in new_article if not a[2])
    print("finale set: %d nolable: %d nocategory: %d" % (len(new_article), nolable, nocategory))

    # Emit the JS array used by the next step.
    print(render_js_array(new_article))


if __name__ == "__main__":
    main()
step 5:这一步就把js代码放到控制台运行了(粘贴之后调用 getdata() 开始执行)。控制台还是要在编辑页面里打开,这样就没有跨域的问题,因为url我没处理哦!执行比较久,300+文章呢
-
// Update the categories and tags of every article.
// Rows produced by step 4: [article id, new tags HTML, new categories].
var arti = [
    ["77187506","<span title='单击删除该标签'>布隆过滤器</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
    ["77073144","<span title='单击删除该标签'>hyperloglog</span><span title='单击删除该标签'>基数计数</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
    ["77046721","<span title='单击删除该标签'>信号量</span><span title='单击删除该标签'>临界区</span><span title='单击删除该标签'>自旋锁</span><span title='单击删除该标签'>操作系统</span><span title='单击删除该标签'>我只想找工作</span>","##操作系统,##我只想找工作"],
    ["47438411","<span title='单击删除该标签'>2015多校联合训练赛</span><span title='单击删除该标签'>模拟</span>","##ACM-ICPC编程题,##模拟"]
];

var time_out_th = 500; // delay in ms between two polls of the opened page
var i;                 // index into arti of the article being processed
var a;                 // handle of the window opened for the current article
var id;                // current row of arti
var step = 0;          // 0: waiting for the page to load, 1: save has been clicked
var eid = new Array(); // ids of the articles that failed
-
/**
 * Entry point: resets the progress state and the error list, opens the edit
 * page of the first article in a new window and schedules the first poll.
 */
function getdata(){
    eid = [];
    step = 0;
    i = 0;
    id = arti[0];
    a = window.open(id[0], "_blank");
    setTimeout(doing, time_out_th);
}
-
/**
 * Moves on to the next article: advances the index, resets `step` and opens
 * the article's edit page. Returns false (after logging "finish") when there
 * is no article left.
 */
function openNextEdit(){
    i++;
    if(i == arti.length){
        console.log("finish");
        return false;
    }
    step = 0;
    id = arti[i];
    a = window.open(id[0], "_blank");
    return true;
}

/**
 * Polling step. Once the opened page has finished loading, writes the new
 * categories into "txtTag" and the new tag markup into "d_tag2" and clicks
 * "btnDraft" to save. When the page no longer reports `saving`, the window is
 * closed and the next article is opened. An article whose page cannot be
 * edited is recorded in `eid` and skipped. Re-schedules itself until every
 * article has been visited.
 */
function doing(){
    if(i == arti.length){
        console.log("finish");
        return i;
    }
    try{
        // Page loaded: fill in the new values and trigger the save.
        if(step == 0 && a != 0 && a.document.readyState == "complete"){
            a.document.getElementById("txtTag").value = id[2];
            a.document.getElementById("d_tag2").innerHTML = id[1];
            console.log("complete: "+i);
            step += 1;
            a.document.getElementById("btnDraft").click();
        }
    }
    catch(err){
        console.log(err);
        console.log("err: "+i);
        eid.push(id[0]);
        // window.open returns null when the popup is blocked; closing it
        // unguarded would throw here and silently stop the whole run.
        if(a){
            a.close();
        }
        if(!openNextEdit()){
            return i;
        }
    }
    // Save finished: continue with the next article.
    if(step == 1 && a && !a.saving){
        a.close();
        console.log("ok: "+i);
        if(!openNextEdit()){
            return i;
        }
    }
    setTimeout(doing, time_out_th);
}
step 6:打开文章分类,把只有0篇的分类删除掉。alert手动确认,不然好像因为缓存的原因,删不掉后面的
-
// Delete every category that contains 0 articles.
var a = document.getElementsByClassName("red");
for(var i = 0; i < a.length; i++){
    var b = a[i].href;
    // The last 7 characters of the link are taken as the category id.
    b = b.substring(b.length - 7, b.length);
    var c = a[i].text; // article count shown for the category
    if(c == "0"){
        console.log(b + " " + c);
        $.get("?t=" + "del", { id: b, r: csdn.random() }, function (ret) {
            // Manual confirmation after each delete; without it the later
            // deletions did not seem to go through.
            alert(1);
        });
        // The request looks like: ...ory?t=del&id=1380215&r=83505
    }
}
源代码参考:https://gitlab.com/linyuwang/csdn-classfy-change/tree/master
step 7:按文章数对类型进行排序。在文章类别管理的console执行
-
// Sort the categories by article count, largest first.
var a = document.getElementsByClassName("red");
var ids = new Array(); // rows: [category id, article count, current position]

// Orders rows by article count, descending.
var compare = function(x, y){
    if(x[1] > y[1]) return -1;
    if(x[1] == y[1]) return 0;
    return 1;
};

for(var i = 0; i < a.length; i++){
    var b = a[i].href;
    // The last 7 characters of the link are taken as the category id.
    b = b.substring(b.length - 7, b.length);
    var c = a[i].text;
    ids.push([b, Number(c), i]);
}
ids.sort(compare);
console.log(ids);

var i = 0;

/**
 * Performs one "move up" per call and re-schedules itself every 500 ms.
 * ids[k] is the category that belongs at position k; ids[k][2] is the
 * position it is currently tracked at.
 * Assumes "up" swaps a category with the one directly above it (verify).
 */
function doing(){
    // Skip the categories that are already where they belong.
    while(i != ids.length && ids[i][2] <= i){
        i++;
    }
    if(i == ids.length) return "finish";
    var from = ids[i][2];
    // Moving one row up pushes the row directly above it one slot down.
    // Track that too; otherwise displaced rows keep a stale position, look
    // "already in place" and are never moved.
    for(var k = 0; k < ids.length; k++){
        if(k != i && ids[k][2] == from - 1){
            ids[k][2] = from;
            break;
        }
    }
    ids[i][2] = from - 1;
    doExec("", ids[i][0], "up");
    console.log(i + "," + ids[i][2] + "," + ids[i][1]);
    setTimeout(doing, 500);
}

doing();