昨天写Java写的有点累了,看着自己的博客分类乱七八糟的,遂心生了整理一下博客文章分类的想法。
手动整理了一会发现好累啊,重复的工作让程序来做就好了嘛。
于是就写了这个小脚本。
user_request_to_change.py
import requests
from lxml import etree
import re
# Collect the URL of every article on one listing page of a category.
def get_article_url_list(page_url, session_obj):
    """Return all article hrefs found on *page_url*.

    page_url: URL of one page of the CSDN post list for a category.
    session_obj: a requests.Session that already carries the login cookie.
    """
    html = session_obj.get(page_url).text
    tree = etree.HTML(html)
    return tree.xpath('//td[@class="tdleft"]/a/@href')
# Collect the URLs of articles whose link text contains a fixed keyword.
def get_spcify_article_url_list(page_url, session_obj):
    """Return hrefs of articles whose anchor text mentions 基于《PythonCookbook》.

    NOTE(review): the name keeps the original typo ("spcify") because the
    main script below calls it under this name.
    """
    tree = etree.HTML(session_obj.get(page_url).text)
    return tree.xpath('//a[contains(text(),"基于《PythonCookbook》")]/@href')
if __name__ == "__main__":
    session = requests.Session()
    initial_category = 'Django'
    target_categhory = '------Python------'
    # Authenticate by reusing a cookie copied from a logged-in browser.
    session.cookies.set('Cookie',
                        'yourCookie')
    # The <option> text of the category selector is generated by JS,
    # so the ids are looked up from the fetched page source instead.
    page_source_code = session.get('http://write.blog.csdn.net/postlist').text
    selector = etree.HTML(page_source_code)
    initial_category_id = selector.xpath('//option[contains(text(),"{}")]/@value'.format(initial_category))
    target_categhory_id = selector.xpath('//option[contains(text(),"{}")]/@value'.format(target_categhory))
    print({initial_category: initial_category_id, target_categhory: target_categhory_id})
    # Find the total number of pages in the source category.
    page_source_code = session.get('http://write.blog.csdn.net/postlist/{}/all'.format(initial_category_id)).text
    selector = etree.HTML(page_source_code)
    dirty_page_nums = selector.xpath('//div[@class="page_nav"]/span/text()')[0]
    # r'\d+': raw string avoids the invalid-escape-sequence warning.
    page_nums = int(re.findall(r'\d+', dirty_page_nums)[-1])
    print(dirty_page_nums)
    all_article_url_list = []  # URLs of every matching article in the category
    article_id_list = []       # numeric article ids parsed from those URLs
    # Walk every listing page and gather the article URLs.
    for i in range(page_nums):
        url = 'http://write.blog.csdn.net/postlist/{}/0/enabled/{}'.format(initial_category_id, i + 1)
        # Swap in get_article_url_list(url, session) to move ALL articles
        # of the category instead of only the keyword-matching ones.
        all_article_url_list += get_spcify_article_url_list(url, session)
    print(all_article_url_list)
    # The article id is the first run of digits in each URL.
    for article_url in all_article_url_list:
        article_id_list.append(int(re.findall(r'\d+', article_url)[0]))
    print(article_id_list)
    # NOTE(review): the ids looked up above are overwritten with hard-coded
    # values here — presumably because the xpath lookup returns lists, not
    # scalars. Confirm these ids are still correct before running.
    initial_category_id = 7324441
    target_categhory_id = 6897792
    # Re-tag every article: cats=<target>,<initial> keeps both categories;
    # pass only the target id to drop the original category instead.
    for article_id in article_id_list:
        url = 'http://write.blog.csdn.net/article/setcategory?id={}&cats={}%2C{}'.format(
            article_id, target_categhory_id, initial_category_id)
        session.get(url)
整理的效果如下图
变得整齐美观又大方了有木有!有木有!!
其实还有另外一个基于 Selenium 的失败的程序
csdnChangeClass.py
from selenium import webdriver
from xpath_map_dict import XPATH_MAP_DICT
from lxml import etree
import re
# Drive a real Chrome session to re-tag CSDN articles through the UI.
driver = webdriver.Chrome()
initial_category = 'Python基础'
target_categhory = '------Python------'
try:
    csdn_login_page = "https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"
    driver.get(csdn_login_page)
    driver.maximize_window()
    # Log the user in.
    username = "username"
    password = "password"
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_login_username_input']).clear()
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_login_username_input']).send_keys(username)
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_login_password_input']).clear()
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_login_password_input']).send_keys(password)
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_login_remberMe_input']).click()
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_login_button']).click()
    # Jump to the article-management page.
    driver.get('http://write.blog.csdn.net/postlist')
    # Grab the rendered page source for lxml parsing.
    page_source = driver.page_source
    selector = etree.HTML(page_source)
    # Select the category whose articles will be re-tagged.
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_select_category']).click()
    driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_option_cat_1'].format(initial_category)).click()
    # Number of articles on this page (minus the table header row).
    page_article_list = selector.xpath("//tbody/tr")
    page_article_nums = len(page_article_list) - 1
    # Total number of pages for this category.
    # BUG FIX: xpath() returns a list — take its first element before
    # applying the regex (the original passed the list itself, which
    # makes re.findall raise TypeError; the requests script above
    # already indexes [0] here).
    dirty_page_nums = selector.xpath('//div[@class="page_nav"]/span/text()')[0]
    page_nums = int(re.findall(r'\d+', dirty_page_nums)[-1])
    scroll_js = "var q=document.documentElement.scrollTop=10000"
    for page in range(page_nums):
        # Re-tag every article on the current page.
        for page_article in range(page_article_nums):
            if page_article == 8:
                # Scroll down so the lower rows become clickable.
                driver.execute_script(scroll_js)
            # NOTE(review): 'xpath_a_cat' has an index placeholder and
            # 'xpath_input_category' a text placeholder, but the original
            # passed them the other way round; swapped here — verify
            # against the live page.
            driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_a_cat'].format(page_article + 1)).click()
            driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_input_category'].format(target_categhory)).click()
            driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_input_close']).click()
        # Move to the next listing page.
        driver.find_element_by_xpath(XPATH_MAP_DICT['xpath_a_next_page']).click()
except Exception as exc:
    # Don't swallow failures silently — at least report what went wrong.
    print(exc)
finally:
    # quit() ends the session and closes every window; calling close()
    # afterwards (as the original did) raises on the dead session.
    driver.quit()
xpath_map_dict.py
# XPath selectors used by csdnChangeClass.py, keyed by purpose.
XPATH_MAP_DICT = {
    # Login form fields and submit button.
    'xpath_login_username_input': '//input[@id="username"]',
    'xpath_login_password_input': '//input[@id="password"]',
    'xpath_login_remberMe_input': '//input[@id="rememberMe"]',
    'xpath_login_button': '//input[@value="登 录"]',
    # Category <select> on the post-list page.
    'xpath_select_category': '//select[@value="selCat"]',
    # BUG FIX: 'contians' -> 'contains'; the misspelled XPath function made
    # this selector invalid, a likely cause of the script's failure.
    # ({} = category name)
    'xpath_option_cat_1': '//option[contains(text(),"{}")]',
    # Per-article "set category" link ({} = 1-based article index).
    'xpath_a_cat': '(//a[@class="cat"])[{}]',
    # Checkbox for the target category inside the popup ({} = category text).
    'xpath_input_category': '//div[@id="setcat_box"]/label/span[contains(text(),"{}")]/../input',
    # Close button of the category popup.
    'xpath_input_close': '//*[@id="setcat_div"]/div/input',
    # Pagination "next page" link.
    'xpath_a_next_page': '//a[contains(text(),"下一页")]',
}
失败原因:在测试程序的时候,总是报错说“目标计算机积极拒绝连接”之类的信息。