[Python下载CSDN博客]2. 使用BeautifulSoup分析HTML(二)

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
import os
import GetCategoryAndMonth
import GetArticleList
import GetArticle

import urllib2
import httplib

def GetTypeList(host, blogName, list, type):
	'''
	Fetch the blog's front page and pass it to the category/month parser.

	Issues "GET /<blogName>" against host and hands the raw response body,
	together with type and list, to
	GetCategoryAndMonth.CHYGetCategoryAndMonth().Parser.

	host     -- server name given to httplib.HTTPConnection
	blogName -- blog identifier; used as the request path
	list     -- caller-owned list forwarded to the parser (presumably filled
	            in place by it; confirm against GetCategoryAndMonth)
	type     -- selector forwarded to the parser unchanged
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pose as IE; per the original author CSDN refuses Python's default agent.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = "/" + blogName, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Release the socket even when the request or the read fails.
		conn.close()
	# The parser has always received the raw bytes. The original also decoded
	# them to UTF-8 into a variable nobody read; that dead step is dropped so a
	# single undecodable byte can no longer abort the download.
	my = GetCategoryAndMonth.CHYGetCategoryAndMonth()
	my.Parser(htmlByte, type, list)

def GetTypeArticleList(host, articleListUrl, list):
	'''
	Fetch one article-listing page and pass it to the article-list parser.

	Issues "GET articleListUrl" against host and hands the raw response body
	and list to GetArticleList.CHYGetArticleList().Parser.

	host           -- server name given to httplib.HTTPConnection
	articleListUrl -- request path of the listing page
	list           -- caller-owned list forwarded to the parser (presumably
	                  filled in place by it; confirm against GetArticleList)
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pose as IE; per the original author CSDN refuses Python's default agent.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = articleListUrl, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Release the socket even when the request or the read fails.
		conn.close()
	# The parser has always received the raw bytes. The original also decoded
	# them to UTF-8 into a variable nobody read; that dead step is dropped so a
	# single undecodable byte can no longer abort the download.
	my = GetArticleList.CHYGetArticleList()
	my.Parser(htmlByte, list)

def GetArticleFun(host, articleUrl, article):
	'''
	Fetch a single article page and pass it to the article parser.

	Issues "GET articleUrl" against host and hands the raw response body and
	article to GetArticle.CHYGetArticle().Parser.

	host       -- server name given to httplib.HTTPConnection
	articleUrl -- request path of the article page
	article    -- caller-owned two-element list forwarded to the parser; the
	              caller later reads article[0] as the title and article[1]
	              as the body (presumably filled in place; confirm against
	              GetArticle)
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pose as IE; per the original author CSDN refuses Python's default agent.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = articleUrl, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Release the socket even when the request or the read fails.
		conn.close()
	# The parser has always received the raw bytes. The original also decoded
	# them to UTF-8 into a variable nobody read; that dead step is dropped so a
	# single undecodable byte can no longer abort the download.
	my = GetArticle.CHYGetArticle()
	my.Parser(htmlByte, article)

def ValidFileName(fileName):
	'''
	Return fileName with every character that is illegal in a Windows file
	or directory name removed.

	Removed: the reserved characters \\ / : * ? " < > |, the single quote
	(kept for compatibility with earlier behaviour), and control characters
	(code points below 32). Trailing dots and spaces are trimmed as well,
	because Windows does not allow a name to end with either.

	The result may be an empty string when nothing valid remains.
	'''
	invalidChars = '\\/:*?"<>|\''
	validFileName = "".join(
		ch for ch in fileName
		if ch not in invalidChars and ord(ch) >= 32
	)
	return validFileName.rstrip(". ")
	
def _EnsureDir(path):
	'''Create directory path unless it already exists, so re-runs do not crash.'''
	if not os.path.isdir(path):
		os.mkdir(path)

if __name__ == '__main__':
	# Download every article of one CSDN blog into
	# F:\<blogName>\<category>\<articleId>_<title>\article.txt
	host = "blog.csdn.net"
	blogName = "bagboy_taobao_com"
	blogDir = "F:" + os.sep + blogName     # root folder: F:\<blogName>
	_EnsureDir(blogDir)
	# Fetch the category list
	listType = []
	GetTypeList(host, blogName, listType, 1)
	# One folder per category
	for listTypeItem in listType:
		# listTypeItem[0] is used as the listing URL, listTypeItem[1] as the
		# folder name; sanitise the name so e.g. a "/" in it cannot break mkdir.
		typeDir = blogDir + os.sep + ValidFileName(listTypeItem[1])
		_EnsureDir(typeDir)
		listArticle = []
		GetTypeArticleList(host, listTypeItem[0], listArticle)
		for listArticleItem in listArticle:
			article = ["", ""]
			GetArticleFun(host, listArticleItem, article)
			# Strip the fixed URL prefix; what remains identifies the article.
			articleId = listArticleItem.replace("/" + blogName + "/article/details/", "")
			articleDir = typeDir + os.sep + articleId + "_" + ValidFileName(article[0])
			print(articleDir)
			# The folder is named after the article title
			_EnsureDir(articleDir)
			title = articleDir + os.sep + "article.txt"
			# "with" closes the file for every article instead of leaking handles.
			with open(title, 'w') as f:
				f.write(article[0].encode("utf8") + "\n")
				f.write(article[1].encode("utf8") + "\n")

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值