#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
import os
import GetCategoryAndMonth
import GetArticleList
import GetArticle
import urllib2
import httplib
def GetTypeList(host, blogName, list, type):
    '''
    Fetch the blog's home page and hand it to the category/month parser.

    host     -- HTTP host to connect to (the script uses "blog.csdn.net").
    blogName -- blog identifier; the page requested is "/" + blogName.
    list     -- output list passed to the parser, which is expected to fill
                it in place (the __main__ script reads item[0] as a URL and
                item[1] as a name).
    type     -- selector forwarded unchanged to
                CHYGetCategoryAndMonth.Parser; the __main__ script passes 1
                to obtain the category list.

    Returns None. Network and HTTP errors raised by httplib propagate to
    the caller.
    '''
    conn = httplib.HTTPConnection(host)
    try:
        # Pretend to be IE; CSDN rejects requests that carry Python's
        # default User-Agent.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headersP = {'User-Agent': user_agent}
        conn.request(method="GET", url="/" + blogName, headers=headersP)
        r1 = conn.getresponse()
        htmlByte = r1.read()
    finally:
        # Always release the socket, even when the request fails.
        conn.close()
    # The parser receives the raw response bytes.
    my = GetCategoryAndMonth.CHYGetCategoryAndMonth()
    my.Parser(htmlByte, type, list)
def GetTypeArticleList(host, articleListUrl, list):
    '''
    Fetch one listing page and hand it to the article-list parser.

    host           -- HTTP host to connect to.
    articleListUrl -- request path of the listing page (the __main__ script
                      passes item[0] of a GetTypeList entry).
    list           -- output list passed to CHYGetArticleList.Parser, which
                      is expected to fill it in place (the __main__ script
                      treats each item as an article URL path).

    Returns None. Network and HTTP errors raised by httplib propagate to
    the caller.
    '''
    conn = httplib.HTTPConnection(host)
    try:
        # Pretend to be IE; CSDN rejects requests that carry Python's
        # default User-Agent.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headersP = {'User-Agent': user_agent}
        conn.request(method="GET", url=articleListUrl, headers=headersP)
        r1 = conn.getresponse()
        htmlByte = r1.read()
    finally:
        # Always release the socket, even when the request fails.
        conn.close()
    # The parser receives the raw response bytes.
    my = GetArticleList.CHYGetArticleList()
    my.Parser(htmlByte, list)
def GetArticleFun(host, articleUrl, article):
    '''
    Fetch one article page and hand it to the article parser.

    host       -- HTTP host to connect to.
    articleUrl -- request path of the article page.
    article    -- output list passed to CHYGetArticle.Parser, which is
                  expected to fill it in place (the __main__ script passes
                  ["", ""] and afterwards uses article[0] as the title and
                  writes article[1] after it).

    Returns None. Network and HTTP errors raised by httplib propagate to
    the caller.
    '''
    conn = httplib.HTTPConnection(host)
    try:
        # Pretend to be IE; CSDN rejects requests that carry Python's
        # default User-Agent.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headersP = {'User-Agent': user_agent}
        conn.request(method="GET", url=articleUrl, headers=headersP)
        r1 = conn.getresponse()
        htmlByte = r1.read()
    finally:
        # Always release the socket; this function runs once per article.
        conn.close()
    # The parser receives the raw response bytes.
    my = GetArticle.CHYGetArticle()
    my.Parser(htmlByte, article)
def ValidFileName(fileName):
    '''
    Return fileName with every character that cannot appear in a Windows
    file or directory name removed.

    The characters \\ / : * ? " < > | are reserved by Windows; the single
    quote is removed as well. The result has the same string type as the
    input, and an empty input yields an empty result.
    '''
    invalidChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|', "'")
    validFileName = fileName
    for ch in invalidChars:
        validFileName = validFileName.replace(ch, "")
    return validFileName
if __name__ == '__main__':
    # Download every article of one CSDN blog into
    #   F:\<blogName>\<category>\<articleId>_<title>\article.txt
    host = "blog.csdn.net"
    blogName = "bagboy_taobao_com"
    blogDir = "F:" + os.sep + blogName  # F:\<blogName>
    # Tolerate directories left by an earlier run so the script can be
    # re-run or resumed.
    if not os.path.isdir(blogDir):
        os.mkdir(blogDir)

    # Fetch the category list; each item is used as (url, name).
    listType = []
    GetTypeList(host, blogName, listType, 1)

    # One directory per category.
    for listTypeItem in listType:
        # Sanitize the category name: one containing "/" (e.g. "C/C++")
        # would otherwise be read as a nested path and make mkdir fail.
        typeDir = blogDir + os.sep + ValidFileName(listTypeItem[1])
        if not os.path.isdir(typeDir):
            os.mkdir(typeDir)

        listArticle = []
        GetTypeArticleList(host, listTypeItem[0], listArticle)
        for listArticleItem in listArticle:
            article = ["", ""]
            GetArticleFun(host, listArticleItem, article)

            # Strip the fixed URL prefix so only the trailing article id
            # remains, then append the sanitized title.
            articleId = listArticleItem.replace("/" + blogName + "/article/details/", "")
            articleDir = typeDir + os.sep + articleId + "_" + ValidFileName(article[0])
            print(articleDir)
            if not os.path.isdir(articleDir):
                os.mkdir(articleDir)

            title = articleDir + os.sep + "article.txt"
            # The with-block closes the file even if encoding or writing fails.
            with open(title, 'w') as f:
                f.write(article[0].encode("utf8") + "\n")
                f.write(article[1].encode("utf8") + "\n")
[Python下载CSDN博客]2. 使用BeautifulSoup分析HTML(二)
最新推荐文章于 2024-01-15 14:08:20 发布