使用python爬取csdn博客访问量

最新推荐文章于 2021-08-07 12:11:38 发布

better_huirong

最新推荐文章于 2021-08-07 12:11:38 发布

阅读量884

点赞数

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/mrzhang628/article/details/50696872

版权

Python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-


import urllib2
import re

# CSDN account whose blog list is crawled.
account = "mrzhang628"

# Home page of that account's blog; list-page URLs are built from it.
baseUrl = ''.join(('http://blog.csdn.net/', account))

# Number of the blog list page currently being fetched.
page_num = 1

# Loop flag: truthy while the page just fetched is not the final list page
# (a page that still carries a "last page" link is not the final one).
notLast = 1

# Crawl every list page of the blog and print each post's view count and title.

# Present a browser-like User-Agent; per the original author's note, CSDN
# refuses requests that do not look like they come from a browser.
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; windows NT)'}

# The patterns never change between pages, so compile them once up front.
# Captures the title text of each post.
title_re = re.compile('<span class="link_title"><a href=".*?">(.*?)</a></span>', re.S)
# Captures the number inside the parentheses after the "阅读" (views) link.
view_re = re.compile(r'<span class="link_view".*?><a href=".*?" title="阅读次数">阅读</a>\((.*?)\)</span>', re.S)
# Matches the "尾页" (last page) link, used to decide whether to keep paging.
last_page_re = re.compile('<a href=".*?">尾页</a>', re.S)

while notLast:
	# Build the URL of the list page to fetch from the current page number.
	myUrl = baseUrl + '/article/list/' + str(page_num)

	# Request the paginated URL. Requesting baseUrl here would download the
	# same home page on every iteration and the page number would have no effect.
	req = urllib2.Request(myUrl, headers=headers)

	# Any network error propagates to the caller unchanged; the response is
	# always closed once its body has been read.
	myResponse = urllib2.urlopen(req)
	try:
		myPage = myResponse.read()
	finally:
		myResponse.close()

	print('-----------------------------第%d页---------------------------------' % (page_num,))

	# Extract titles and view counts, trimming surrounding whitespace.
	titleList = [title.strip() for title in title_re.findall(myPage)]
	viewList = [view.strip() for view in view_re.findall(myPage)]

	# Pair the two lists positionally; zip stops at the shorter list, so a
	# page where the two patterns match different counts cannot raise IndexError.
	for view, title in zip(viewList, titleList):
		print('访问量:%s 标题:%s' % (view.zfill(4), title))

	# Advance to the next list page.
	page_num = page_num + 1

	# Keep looping only while the page just read still has a "尾页" link.
	notLast = last_page_re.search(myPage) is not None

有问题请及时交流学习，谢谢！

better_huirong

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
使用python爬取csdn博客访问量

# -*- coding: utf-8 -*-import urllib2import re#当前的博客列表页号page_num = 1#不是最后列表的一页，如果有尾页说明不是最后一页notLast = 1account = "mrzhang628"#首页地址baseUrl = 'http://blog.csdn.net/'+accountwhile notLast
复制链接

扫一扫

专栏目录