以前写的文章分类太多,太乱了,所以决定来一次清理,把一些文章类别给替换掉。和LX同学讨论了一个下午,终于有一个方案了,搞了一晚上才弄好。
我是混合了python和js代码一起做的。js可以在浏览器的控制台下运行,不用再处理登录,所以很方便。python我用得比较习惯,所以处理数据比较方便,而且数据不会丢失。
步骤如下:
step 1:先把文章列表抓下来,知道有哪些文章。注意:markdown编写的文章不能处理!它们在后面的步骤里都会进eid(执行不成功的文章id列表)!
import urllib.request
import re
LIST_URL = "http://blog.csdn.net/firenet1/article/list/"
# Upper bound on the number of list pages, counted at 15 articles per page.
PAGE_COUNT = 30
# One capture group so findall() hands back the numeric id directly.
ARTICLE_LINK = re.compile(r"/firenet1/article/details/([0-9]{5,10})")


def extract_article_ids(html):
    """Return the article ids linked from one list page, in page order.

    Duplicates are kept; the caller de-duplicates.  The whole string is
    scanned: the pattern is compiled, so no extra argument is passed to
    ``findall`` (its second positional argument is a start offset, not
    regex flags).
    """
    return ARTICLE_LINK.findall(html)


def format_js_array(article_ids):
    """Render ids as the ``var id_array = new Array(...);`` statement for step 2."""
    quoted = ",\n".join('"%s"' % article_id for article_id in article_ids)
    return "var id_array = new Array(" + quoted + ");"


def main():
    """Download every list page, collect the unique ids, write js_array.txt."""
    ids = {}  # article id -> 1; a dict de-duplicates and keeps first-seen order
    for page in range(1, PAGE_COUNT + 1):
        url = LIST_URL + str(page)
        try:
            with urllib.request.urlopen(url) as response:
                data = response.read()
        except OSError:
            # URLError/HTTPError are OSError subclasses: skip pages that
            # cannot be fetched instead of hiding every possible error.
            continue
        links = extract_article_ids(data.decode("UTF-8"))
        print(len(links))
        for article_id in links:
            ids[article_id] = 1
    print(len(ids))
    with open("js_array.txt", "w", encoding="utf-8") as file:
        file.write(format_js_array(ids))


if __name__ == "__main__":
    main()
'''
输出文件保存成js数组
var id_array = new Array("77187506",
"77073144",
"77046721",
"76766916",
"76642319",
"76195994")
'''
step 2: 打开一个csdn博客编辑页,任何一页都行。F12进入控制台,在控制台里执行以下代码,然后调用getdata()开始抓取,等控制台打印finish后再调用output()输出结果。这一步就能获得大部分文章的id,tag,类别
// Article ids produced by step 1 (only a few sample values are shown).
var id_array = [
    "77187506",
    "77073144",
    "77046721",
];

// Shared state for fetching each article's tags and categories.
var time_out_th = 500; // delay in milliseconds handed to setTimeout between polls
var i;   // index of the article currently being processed
var a;   // window opened for the current article
var id;  // id of the current article
var out; // collected [id, category, tag HTML] records
var eid; // ids of articles that could not be processed; print it to inspect them
/**
 * Entry point: clear the result and error lists, open the first article
 * in a new window and schedule the first poll.  Each following article is
 * only opened once the previous one has been handled.
 */
function getdata() {
    out = [];
    eid = [];
    i = 0;
    id = id_array[0];
    a = window.open(id, "_blank"); // opens a new edit page
    setTimeout(doing, time_out_th);
}
/**
 * Poll the window opened for the current article.
 *
 * Once its document has finished loading, record [id, category, tag HTML]
 * in `out`, close the window and open the next article.  If reading the
 * page throws, the id is recorded in `eid` and the next article is opened.
 * Re-schedules itself every `time_out_th` milliseconds until all ids are
 * handled, then logs "finish" and returns the number of processed ids.
 */
function doing() {
    if (i == id_array.length) {
        console.log("finish");
        return i;
    }

    // Close the current window (if one was opened) and move on to the next
    // article.  Returns false once every id has been handled.
    function openNext() {
        if (a) {
            // window.open yields null when the popup is blocked; calling
            // close() on it would throw and stop the whole polling chain.
            a.close();
        }
        i++;
        if (i == id_array.length) {
            console.log("finish");
            return false;
        }
        id = id_array[i];
        a = window.open(id, "_blank");
        return true;
    }

    try {
        // Only read the page after the new window has finished loading.
        if (a != 0 && a.document.readyState == "complete") {
            let cla = a.document.getElementById("txtTag").value;
            let lable = a.document.getElementById("d_tag2").innerHTML;
            // Strip every newline: output() writes one record per line, so
            // any newline left in the tag HTML would split the record.
            lable = lable.replace(/\n/g, "");
            out.push([id, cla, lable]);
            console.log("ok: " + i);
            if (!openNext()) {
                return i;
            }
        }
    } catch (err) {
        eid.push(id);
        console.log("err: " + i);
        if (!openNext()) {
            return i;
        }
    }
    setTimeout(doing, time_out_th);
}
/**
 * Print every collected record to the console, one record per line, with
 * the three fields separated by "###,###".
 */
function output() {
    const lines = out.map(function (record) {
        return record[0] + "###,###" + record[1] + "###,###" + record[2] + "\n";
    });
    console.log(lines.join(""));
}
/*
输出数据如下:复制黏贴到本地文本后用于后面的步骤
47311009###,###多校联合训练赛###,###<span title="单击删除该标签">hdu 5328</span><span title="单击删除该标签">hdu</span>
*/
step 3:打开自己的类别管理,F12进入控制台,执行以下代码
// Collect the names of all article categories, one per line, each followed
// by the "###,###" separator so a replacement can be typed in after it.
var x = document.getElementsByClassName("tdleft");
var y = "";
// Index 0 is skipped — presumably a header cell; confirm against the page.
for (var i = 1; i < x.length; i++) {
    y += x[i].firstChild.innerHTML + "###,###\n";
}
console.log(y);
/**
输出如下:每一行就是一个类别 ###,###以及后面的部分就是我想把这个类别换成其他类别,分割开,可以没有
动态规划###,###ACM-ICPC编程题,动态规划
数据结构###,###ACM-ICPC编程题,数据结构
字符串###,###ACM-ICPC编程题,字符串
模拟###,###ACM-ICPC编程题,模拟
*/
step 4:把所有文章的类别和tag都换成新的,我的tag是使用原来的tag,如果不够5个,会把原先的类别变成tag。python代码
import urllib.request
import re
SEPARATOR = "###,###"
# Each tag sits in one such span in the step-2 output.  Capture everything up
# to the closing tag so tags containing spaces or hyphens ("hdu 5328",
# "ACM-ICPC") are kept; ``\w+`` silently dropped them.
TAG_PATTERN = re.compile(r'<span title="单击删除该标签">([^<]+)</span>')


def parse_article_line(line):
    """Split one step-2 output line into ``[article_id, categories, tags]``.

    ``categories`` stays the raw comma-separated string; ``tags`` is the list
    of tag texts.  Returns ``None`` for blank or malformed lines.
    """
    fields = line.rstrip("\n").split(SEPARATOR)
    if len(fields) < 3:
        return None
    return [fields[0], fields[1], TAG_PATTERN.findall(fields[2])]


def load_articles(path):
    """Read every well-formed article record from *path*."""
    articles = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            article = parse_article_line(line)
            if article is not None:
                articles.append(article)
    return articles


def parse_category_map(lines):
    """Build ``(origin, now)`` from lines shaped ``old###,###new1,new2``.

    ``origin`` maps an old category to the list of categories replacing it;
    ``now`` holds every replacement category as a key.  Lines without a
    replacement part are skipped.
    """
    origin = {}
    now = {}
    for line in lines:
        fields = line.replace("\n", "").split(SEPARATOR)
        if len(fields) < 2 or not fields[1]:
            continue
        targets = fields[1].split(",")
        origin[fields[0]] = targets
        for target in targets:
            now[target] = 1
    return origin, now


def remap_article(article, origin):
    """Return ``[article_id, tags, categories]`` with categories remapped.

    The old tags and the old categories all become tags.  New categories come
    from ``origin``; when no old category has a mapping, any tag that is
    itself a key of ``origin`` is used as a category.  Dict keys de-duplicate
    both lists while keeping first-seen order.
    """
    article_id, old_categories, tags = article[0], article[1], article[2]
    label_set = {}
    category_set = {}
    for tag in tags:
        if tag:
            label_set[tag] = 1
    for old in old_categories.split(","):
        if old:
            label_set[old] = 1
        for new in origin.get(old, ()):
            category_set[new] = 1
    labels = list(label_set)
    if not category_set:
        for label in labels:
            if label in origin:
                category_set[label] = 1
    return [article_id, labels, list(category_set)]


def to_js_array(new_article):
    """Render the remapped articles as the ``var arti = [...]`` statement.

    New categories are prefixed with ``##``.
    """
    # Local import keeps this snippet self-contained.  json.dumps escapes
    # quotes and backslashes so an unusual tag cannot break the generated JS.
    import json

    rows = []
    for article_id, labels, categories in new_article:
        lab = "".join(
            "<span title='单击删除该标签'>%s</span>" % label for label in labels
        )
        cat = ",".join("##" + category for category in categories)
        rows.append(
            "[%s,%s,%s]"
            % tuple(
                json.dumps(field, ensure_ascii=False)
                for field in (article_id, lab, cat)
            )
        )
    return "var arti = [" + ",\n".join(rows) + "];\n"


def main():
    """Load the step-2 and step-3 files, remap, and print the JS for step 5."""
    articles = load_articles("classify_lable.txt")
    print("articles: " + str(len(articles)))

    with open("classfyMap.txt", "r", encoding="utf-8") as f:
        origin, now = parse_category_map(f)
    print("old category: %d" % (len(origin)))
    print("now category: %d" % (len(now)))

    new_article = [remap_article(article, origin) for article in articles]
    nolable = sum(1 for article in new_article if not article[1])
    nocategory = sum(1 for article in new_article if not article[2])
    print(
        "finale set: %d nolable: %d nocategory: %d"
        % (len(new_article), nolable, nocategory)
    )
    print(to_js_array(new_article))


if __name__ == "__main__":
    main()
step 5:这一步把上一步生成的js代码放到控制台运行。控制台仍然要在博客编辑页面打开,这样就没有跨域的问题(因为代码里直接把文章id当作相对url,没有另外处理)。执行比较久,有300多篇文章呢
// Update the categories and tags of every article.
// Each entry is [article id, tag HTML, comma-separated categories]; the
// array is the output of step 4 (only a few sample rows are shown).
var arti = [["77187506","<span title='单击删除该标签'>布隆过滤器</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
["77073144","<span title='单击删除该标签'>hyperloglog</span><span title='单击删除该标签'>基数计数</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
["77046721","<span title='单击删除该标签'>信号量</span><span title='单击删除该标签'>临界区</span><span title='单击删除该标签'>自旋锁</span><span title='单击删除该标签'>操作系统</span><span title='单击删除该标签'>我只想找工作</span>","##操作系统,##我只想找工作"],
["47438411","<span title='单击删除该标签'>2015多校联合训练赛</span><span title='单击删除该标签'>模拟</span>","##ACM-ICPC编程题,##模拟"],
];
var time_out_th = 500; // delay in milliseconds handed to setTimeout between polls
var i;        // index of the article currently being processed
var a;        // window opened for the current article
var id;       // current entry of arti
var step = 0; // 0: waiting for the page to load, 1: save has been clicked
var eid = new Array(); // ids of articles that could not be updated
/**
 * Entry point: clear the error list, reset the per-article step, open the
 * edit page of the first article in a new window and schedule the first poll.
 */
function getdata() {
    eid = [];
    step = 0;
    i = 0;
    id = arti[0];
    a = window.open(id[0], "_blank");
    setTimeout(doing, time_out_th);
}
/**
 * Poll the edit window of the current article.
 *
 * step 0: once the page has loaded, write the new categories and tags into
 *         it and click the save button.
 * step 1: on a later poll, once the page no longer reports `saving`, close
 *         the window and open the next article.
 *
 * If anything throws, the article id is recorded in `eid` and the next
 * article is opened.  Re-schedules itself every `time_out_th` milliseconds
 * until all entries are handled, then logs "finish" and returns the count.
 */
function doing() {
    if (i == arti.length) {
        console.log("finish");
        return i;
    }

    // Close the current window (if one was opened) and move on to the next
    // article.  Returns false once every entry has been handled.
    function openNext() {
        if (a) {
            // window.open yields null when the popup is blocked; calling
            // close() on it would throw and stop the whole polling chain.
            a.close();
        }
        i++;
        step = 0;
        if (i == arti.length) {
            console.log("finish");
            return false;
        }
        id = arti[i];
        a = window.open(id[0], "_blank");
        return true;
    }

    try {
        if (step == 0) {
            // After loading, modify the content and click save.
            if (a != 0 && a.document.readyState == "complete") {
                a.document.getElementById("txtTag").value = id[2];
                a.document.getElementById("d_tag2").innerHTML = id[1];
                console.log("complete: " + i);
                step = 1;
                a.document.getElementById("btnDraft").click();
            }
        } else if (!a.saving) {
            // Checked on a later poll, never in the same tick as the click,
            // so the save has time to start before the window is closed.
            // NOTE(review): `saving` is presumably a flag the edit page keeps
            // while a save is in flight — confirm against the page script.
            console.log("ok: " + i);
            if (!openNext()) {
                return i;
            }
        }
    } catch (err) {
        console.log(err);
        console.log("err: " + i);
        eid.push(id[0]);
        if (!openNext()) {
            return i;
        }
    }
    setTimeout(doing, time_out_th);
}
step 6:打开文章分类,把只有0篇的分类删除掉。每次删除后会弹出alert,需要手动确认;不这样做的话,好像因为缓存的原因,后面的分类删不掉
// Delete every category whose article count is 0.
var a = document.getElementsByClassName("red");
for (var i = 0; i < a.length; i++) {
    // The category id is taken from the last 7 characters of the link.
    // NOTE(review): assumes every category id has exactly 7 characters — confirm.
    var b = a[i].href;
    b = b.substring(b.length - 7, b.length);
    var c = a[i].text; // link text holds the article count
    if (c == "0") {
        console.log(b + " " + c);
        // The alert in the callback makes each deletion wait for a manual
        // confirmation.
        $.get("?t=" + "del", { id: b, r: csdn.random() }, function (ret) {
            alert(1);
        });
        // Resulting request looks like: ory?t=del&id=1380215&r=83505
    }
}
源代码参考:https://gitlab.com/linyuwang/csdn-classfy-change/tree/master
step 7:按文章数对类别进行排序。在文章类别管理页面的console执行
// Sort the categories by article count, largest first, by repeatedly moving
// one category up a single row every 500 ms.
var a = document.getElementsByClassName("red");
var ids = new Array(); // entries are [category id, article count, current row]

// Order by article count, largest first.  Ties keep their current page
// order, so equal counts never trigger needless moves.
var compare = function (x, y) {
    if (x[1] > y[1]) return -1;
    if (x[1] < y[1]) return 1;
    return x[2] - y[2];
};

for (var i = 0; i < a.length; i++) {
    // NOTE(review): assumes every category id has exactly 7 characters — confirm.
    var b = a[i].href;
    b = b.substring(b.length - 7, b.length);
    var c = a[i].text; // link text holds the article count
    ids.push(new Array(b, Number(c), i));
}
ids.sort(compare);
console.log(ids);

var i = 0; // rank being placed; ranks below i already sit in their final row

/**
 * Move the category of rank `i` up by one row, then re-schedule.
 * Returns "finish" once every category is in its target row.
 * NOTE(review): assumes doExec(..., "up") moves a category up by exactly
 * one row — confirm against the page script.
 */
function doing() {
    while (i != ids.length && ids[i][2] <= i) {
        i++;
    }
    if (i == ids.length) return "finish";

    var from = ids[i][2];
    // The category directly above swaps down one row.  Record that too;
    // otherwise its stored row goes stale and it is skipped later.
    for (var j = i + 1; j < ids.length; j++) {
        if (ids[j][2] == from - 1) {
            ids[j][2] = from;
            break;
        }
    }
    ids[i][2] = from - 1;
    doExec("", ids[i][0], "up");

    console.log(i + "," + ids[i][2] + "," + ids[i][1]);
    setTimeout(doing, 500);
}
doing();