[Python下载CSDN博客]2. 使用BeautifulSoup分析HTML(二)

最新推荐文章于 2024-01-15 14:08:20 发布

小大小丑

最新推荐文章于 2024-01-15 14:08:20 发布

阅读量1.4k

点赞数

分类专栏： Python 文章标签： Python BeautifulSoup 分析HTML

本文链接：https://blog.csdn.net/bagboy_taobao_com/article/details/15810271

版权

Python 专栏收录该内容

51 篇文章 0 订阅

订阅专栏

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
import os
import GetCategoryAndMonth
import GetArticleList
import GetArticle

import urllib2
import httplib

def GetTypeList(host, blogName, list, type):
	'''
	Download the blog's front page and extract its type list.

	Requests "/<blogName>" from `host` over plain HTTP and hands the raw
	response body to GetCategoryAndMonth.CHYGetCategoryAndMonth.Parser.

	host     -- server name, e.g. "blog.csdn.net"
	blogName -- blog owner's name; used as the URL path
	list     -- passed through to Parser, which is expected to append its
	            results here (the script below reads item[0] as a URL and
	            item[1] as a name)
	type     -- passed through to Parser unchanged; presumably selects which
	            listing to extract -- confirm in GetCategoryAndMonth
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pretend to be IE: per the original author, CSDN does not accept
		# requests sent with Python's default client identity.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = "/" + blogName, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Always release the socket, even when the request or read fails.
		conn.close()
	# The parser has always been given the undecoded bytes; the former
	# decode("utf8") result was never used and could only raise on a stray
	# byte, so it is gone.
	my = GetCategoryAndMonth.CHYGetCategoryAndMonth()
	my.Parser(htmlByte, type, list)

def GetTypeArticleList(host, articleListUrl, list):
	'''
	Download one listing page and extract the articles it links to.

	Requests `articleListUrl` from `host` over plain HTTP and hands the raw
	response body to GetArticleList.CHYGetArticleList.Parser.

	host           -- server name, e.g. "blog.csdn.net"
	articleListUrl -- request path of the listing page
	list           -- passed through to Parser, which is expected to append
	                  its results here (the script below treats each item as
	                  an article URL path)
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pretend to be IE: per the original author, CSDN does not accept
		# requests sent with Python's default client identity.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = articleListUrl, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Always release the socket, even when the request or read fails.
		conn.close()
	# The parser has always been given the undecoded bytes; the former
	# decode("utf8") result was never used and could only raise on a stray
	# byte, so it is gone.
	my = GetArticleList.CHYGetArticleList()
	my.Parser(htmlByte, list)

def GetArticleFun(host, articleUrl, article):
	'''
	Download a single article page and extract its content.

	Requests `articleUrl` from `host` over plain HTTP and hands the raw
	response body to GetArticle.CHYGetArticle.Parser.

	host       -- server name, e.g. "blog.csdn.net"
	articleUrl -- request path of the article page
	article    -- passed through to Parser, which is expected to fill it in
	              (the script below passes a two-item list and then reads
	              article[0] as the title and writes article[1] after it)
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pretend to be IE: per the original author, CSDN does not accept
		# requests sent with Python's default client identity.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = articleUrl, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Always release the socket, even when the request or read fails.
		conn.close()
	# The parser has always been given the undecoded bytes; the former
	# decode("utf8") result was never used and could only raise on a stray
	# byte, so it is gone.
	my = GetArticle.CHYGetArticle()
	my.Parser(htmlByte, article)

def ValidFileName(fileName):
	'''
	Return `fileName` with characters unsafe in a file/directory name removed.

	Removes every character Windows reserves in path components
	(/ \\ ? : * " < > |) plus the single quote, which this function has
	always dropped. All other characters, including spaces, are kept.
	The result has the same string type as the input.
	'''
	validFileName = fileName
	# Repeated replace() (rather than translate/join) behaves the same for
	# byte strings and unicode strings.
	for badChar in '/\\?:*"<>|\'':
		validFileName = validFileName.replace(badChar, "")
	return validFileName
	
def _EnsureDir(path):
	'''Create directory `path` unless it already exists.'''
	if not os.path.isdir(path):
		os.mkdir(path)

if __name__ == '__main__':
	# Mirror the blog into F:\<blogName>\<type>\<articleId>_<title>\article.txt
	host = "blog.csdn.net"
	blogName = "bagboy_taobao_com"
	blogDir = "F:" + os.sep + blogName     # under F:\<blogName>
	# Tolerate existing directories so an interrupted run can be restarted.
	_EnsureDir(blogDir)
	# Fetch the type list; each item is used as (listing URL, display name).
	listType = []
	GetTypeList(host, blogName, listType, 1)
	# One sub-directory per type.
	for listTypeItem in listType:
		# Type names are sanitised like article titles, because a name
		# containing a path separator cannot be used as a directory name.
		typeDir = blogDir + os.sep + ValidFileName(listTypeItem[1])
		_EnsureDir(typeDir)
		listArticle = []
		GetTypeArticleList(host, listTypeItem[0], listArticle)
		for listArticleItem in listArticle:
			article = ["", ""]
			GetArticleFun(host, listArticleItem, article)
			# Strip the fixed URL prefix, leaving the trailing article id.
			articleId = listArticleItem.replace("/" + blogName + "/article/details/", "")
			# Directory name is "<articleId>_<sanitised title>".
			articleDir = typeDir + os.sep + articleId + "_" + ValidFileName(article[0])
			print(articleDir)
			_EnsureDir(articleDir)
			title = articleDir + os.sep + "article.txt"
			# "with" closes the file after every article; it used to stay
			# open for the rest of the run.
			with open(title, 'w') as f:
				# article[0] first, then article[1], each followed by a newline.
				f.write(article[0].encode("utf8") + "\n")
				f.write(article[1].encode("utf8") + "\n")