Counting the Number of Blog Posts on a Personal CSDN Account

This post shows how to use a Python crawler to count the blog posts under a personal CSDN account, starting from a version that can only handle the first page, then adding sorting, multi-page support, and saving the results to a local text file and an Excel file, and finally reworking the code for Python 3 with the data module separated from the processing module.

Version 1

The original version is quite simple.

It can only count the first page, and it does not sort the results.

 

# coding:utf-8
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage(): # masquerade as a browser and fetch the page source
	url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'

	totalList=[]
	contentList=[]
	headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}  
	req = urllib2.Request(url=url,headers=headers)
	try:
		html = urllib2.urlopen(req).read()
	except urllib2.HTTPError,e:
		print e.code
		print e.reason
	fd=open('counter.txt','w')
	page = BeautifulSoup(html,'lxml')
	mytimes=page.find(id='blog_rank')
	i =1
	for aa in mytimes.find_all('li'):
		if i<3:
			print aa.text
			fd.write(aa.text)
			fd.write('\n')
			totalList.append(aa.text)
		i +=1


	items = page.find_all('div',class_ ='list_item list_view')
	print 'Total articles: %d' % len(items)
	for item in items:
		content=item.find('a')
		read_time=item.find('span',class_ ='link_view')
		comments_time=item.find('span',class_ ='link_comments')
		
		totalcontent=content.text.strip()+read_time.text.strip()+comments_time.text.strip()
		print totalcontent
		contentList.append(totalcontent)
		fd.write(totalcontent)
		fd.write('\n')

	fd.close()
	return totalList,contentList
urls=getPage()
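To make the scraping part easier to follow, here is a minimal, self-contained sketch of the BeautifulSoup calls used above, run against a made-up HTML fragment shaped roughly like CSDN's article list (the fragment is only an illustration, not the real page markup):

from bs4 import BeautifulSoup

sample_html = '''
<div class="list_item list_view">
    <a href="/qiqiyingse/article/details/1">Sample post title</a>
    <span class="link_view">Read (123)</span>
    <span class="link_comments">Comments (4)</span>
</div>
'''

page = BeautifulSoup(sample_html, 'lxml')
for item in page.find_all('div', class_='list_item list_view'):
    title = item.find('a').text.strip()                                 # article title
    views = item.find('span', class_='link_view').text.strip()          # read-count text
    comments = item.find('span', class_='link_comments').text.strip()   # comment-count text
    print title, views, comments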

 

Version 2

 

Here is another version.

This one sorts the articles directly by view count.

The code was updated again on 2017-04-11. Changes in this update:

The statistics are now written into a separate folder created under the script's directory, into a text file named after the current time.

This avoids each run overwriting the data from the previous run.
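In isolation, that filename scheme is just a timestamped path inside a dedicated folder; a minimal sketch (folder name 'count' as used in the code below):

import os, datetime

path = 'count'
if not os.path.exists(path):
    os.makedirs(path)
# one output file per run, named after the current time, so runs never overwrite each other
fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
fd = open(fname, 'w')
fd.write('statistics for this run go here\n')
fd.close()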

Code for the second version:

 

# coding:utf-8
import urllib2,re,datetime,os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage(): # masquerade as a browser and fetch the page source
	url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
	baseurl='http://blog.csdn.net'
	totalList=[]
	contentList=[]
	sortlist=[]
	sortlist1=[]
	headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}  
	req = urllib2.Request(url=url,headers=headers)
	try:
		html = urllib2.urlopen(req).read()
	except urllib2.HTTPError,e:
		print e.code
		print e.reason
	path='count'
	if not os.path.exists(path):
		os.makedirs(path)
	fname=path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'
	fd=open(fname,'w')
	page = BeautifulSoup(html,'lxml')
	mytimes=page.find(id='blog_rank')
	i =1
	for aa in mytimes.find_all('li'):
		if i<3:
			print aa.text
			fd.write(aa.text)
			fd.write('\n')
			totalList.append(aa.text)
		i +=1


	items = page.find_all('div',class_ ='list_item list_view')
	print 'Total articles: %d' % len(items)
	fd.write('Total articles: %d' % len(items))
	fd.write('\n')
	for item in items:
		aa={}
		content=item.find('a')
		contemtUrl=baseurl+content.get('href')
		
		read_time=item.find('span',class_ ='link_view')
		tmp=str(read_time.text.strip())

		number = int(filter(str.isdigit, tmp))
		sortlist1.append(number)

		comments_time=item.find('span',class_ ='link_comments')
		aa['indexs']=number
		aa['content']=content.text.strip()
		aa['read_time']=tmp
		aa['comments_time']=comments_time.text.strip()
		aa['contemtUrl']=contemtUrl
		sortlist.append(aa)
	sortlist1.sort()
	print sortlist1
	
	for i in sortlist1:
		for a in sortlist:
			if int(i) == int(a['indexs']):
				totalcontent=a['content']+'\t'+a['read_time']+'\t'+a['comments_time']+'\t'+a['contemtUrl']
				print totalcontent
				fd.write(totalcontent)
				fd.write('\n')
				contentList.append(totalcontent)
	fd.close()
	return contentList
urls=getPage()
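One note on the ordering logic above: matching every value of the sorted count list back against every saved dict means that two articles sharing the same read count are each written more than once. A hedged alternative sketch (not the code used in this post) that sorts the packed dicts directly with sorted() and a key function, which avoids that:

# stand-in data shaped like the dicts built above
articles = [
    {'indexs': 120, 'content': 'post A'},
    {'indexs': 37,  'content': 'post B'},
    {'indexs': 120, 'content': 'post C'},  # same read count as post A
]
for a in sorted(articles, key=lambda d: d['indexs'], reverse=True):
    print a['content'], a['indexs']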

Version 3

This version is a bit more interesting: besides counting and sorting the articles, it collects their URLs and then repeatedly opens them in browser tabs.

 

#coding:utf-8  
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys  
reload(sys)  
sys.setdefaultencoding('utf-8')  
  
def getPage(): # masquerade as a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'  
    baseurl='http://blog.csdn.net' 
    contentList=[]  
    sortlist=[]
    sortlist1=[]
    urlList=[]
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}    
    req = urllib2.Request(url=url,headers=headers)  
    try:  
        html = urllib2.urlopen(req).read()  
    except urllib2.HTTPError,e:  
        print e.code  
        print e.reason  
    path=u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname=path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'	
    print fname	
    fd=open(fname,'w')
    page = BeautifulSoup(html,'lxml')
    items = page.find_all('div',class_ ='list_item list_view')  
    print u'Total articles: %d' % len(items)
    fd.write('Total articles: %d' % len(items))
    fd.write('\n')
    for item in items:  
        aa={}  
        content=item.find('a')
        
        contemtUrl=baseurl+content.get('href')
        #print contemtUrl
          
        read_time=item.find('span',class_ ='link_view')  
        readtime=str(read_time.text.strip())
        #print readtime
  
        readtimeNumber = int(filter(str.isdigit, readtime))
        #print readtimeNumber
        sortlist1.append(readtimeNumber)  
        #time.sleep(2)
        aa['indexs']=readtimeNumber  
        aa['content']=content.text.strip()  
        aa['read_time']=readtime   
        aa['contemtUrl']=contemtUrl  
        sortlist.append(aa)  
    sortlist1.sort()  
    print sortlist1  
      
    for i in sortlist1:  
        for a in sortlist:  
            if int(i) == int(a['indexs']):  
                totalcontent=a['content']+'\t'+a['read_time']+'\t'+a['contemtUrl']  
                print totalcontent  
                fd.write(totalcontent)  
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)  
    fd.close()  
    return urlList  

urls=getPage()

count=random.randint(10,50)
print u'Number of times the browser will be opened and closed:', count
for i in range(5):
	print urls[i]

j=0
while j< count:
    if j == 15:
        j=0
    for i in range(5):
        web.open_new_tab(urls[i+38])
        time.sleep(1)
        web.open_new_tab(urls[random.randint(1,44)])
        time.sleep(1)
    web.open_new_tab('http://blog.csdn.net/qiqiyingse/article/details/51801918')
    time.sleep(3)
    os.system('taskkill /f /IM Chrome.exe')
    j = j+1

Version 4

This update came about because once the blog has more than 50 posts, the article list may span two pages; note, though, that this version can still only count two pages.

So the code was updated again.

 

#coding:utf-8  
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys  
reload(sys)  
sys.setdefaultencoding('utf-8')  
  
def getPage(): # masquerade as a browser and fetch the page source
    url1 = 'http://blog.csdn.net/qiqiyingse/article/list/1?viewmode=contents'  
    url2 = 'http://blog.csdn.net/qiqiyingse/article/list/2?viewmode=contents'  
    baseurl='http://blog.csdn.net' 
    contentList=[]  
    sortlist=[]
    sortlist1=[]
    urlList=[]
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}    
    req1 = urllib2.Request(url=url1,headers=headers)  
    req2 = urllib2.Request(url=url2,headers=headers)  
    try:  
        html1 = urllib2.urlopen(req1).read()  
        html2 = urllib2.urlopen(req2).read()  
    except urllib2.HTTPError,e:  
        print e.code  
        print e.reason  
    path=u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname=path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'	
    print fname	
    fd=open(fname,'w')
    page1 = BeautifulSoup(html1,'lxml')
    page2 = BeautifulSoup(html2,'lxml')
    items1 = page1.find_all('div',class_ ='list_item list_view')  
    items2 = page2.find_all('div',class_ ='list_item list_view')  
    cont_print = u'Total articles: %d' % (len(items1)+len(items2))
    print cont_print
    fd.write(cont_print)  
    fd.write('\n')
    for item in items1:  
        aa={}  
        content=item.find('a')
        
        contemtUrl=baseurl+content.get('href')
        #print contemtUrl
          
        read_time=item.find('span',class_ ='link_view')  
        readtime=str(read_time.text.strip())
        #print readtime
  
        readtimeNumber = int(filter(str.isdigit, readtime))
        #print readtimeNumber
        sortlist1.append(readtimeNumber)  
        #time.sleep(2)
        aa['indexs']=readtimeNumber  
        aa['content']=content.text.strip()  
        aa['read_time']=readtime   
        aa['contemtUrl']=contemtUrl  
        sortlist.append(aa)
    for item in items2:  
        aa={}  
        content=item.find('a')
        
        contemtUrl=baseurl+content.get('href')
        #print contemtUrl
          
        read_time=item.find('span',class_ ='link_view')  
        readtime=str(read_time.text.strip())
        #print readtime
  
        readtimeNumber = int(filter(str.isdigit, readtime))
        #print readtimeNumber
        sortlist1.append(readtimeNumber)  
        #time.sleep(2)
        aa['indexs']=readtimeNumber  
        aa['content']=content.text.strip()  
        aa['read_time']=readtime   
        aa['contemtUrl']=contemtUrl  
        sortlist.append(aa)  		
    sortlist1.sort()  
    print sortlist1  
      
    for i in sortlist1:  
        for a in sortlist:  
            if int(i) == int(a['indexs']):  
                totalcontent=a['content']+'\t'+a['read_time']+'\t'+a['contemtUrl']  
                print totalcontent  
                fd.write(totalcontent)  
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)  
    fd.close()  
    return urlList  

urls=getPage()
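Handling each page with its own copy-pasted block does not scale past two pages. A hedged refactoring sketch (not the code of this post): build the list-page URLs in a loop and concatenate the items; the two-page count and the reduced fetch helper are assumptions for illustration only:

import urllib2
from bs4 import BeautifulSoup

def fetch(url):
    # the same browser-masquerading request as above, reduced to the essentials
    headers = {'User-Agent': 'Mozilla/5.0'}
    return urllib2.urlopen(urllib2.Request(url=url, headers=headers)).read()

items = []
for n in range(1, 3):  # assumes two list pages, as in this version
    url = 'http://blog.csdn.net/qiqiyingse/article/list/%d?viewmode=contents' % n
    page = BeautifulSoup(fetch(url), 'lxml')
    items += page.find_all('div', class_='list_item list_view')
print 'Total articles: %d' % len(items)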

 

Version 5

This version reorganizes the whole program:

1. Each part is easier to read.

2. It can now count every blog post under the account, no matter how many posts or pages there are.

3. The sorting algorithm was updated, which fixes a bug in the earlier versions.

The code is as follows:

 

#coding:utf-8
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)  
sys.setdefaultencoding('utf-8')

# custom print helper
def self_log(msg):
	print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)

# fetch the page source
def get_html(url):
	headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
	req = urllib2.Request(url=url,headers=headers)
	try:
		html = urllib2.urlopen(req).read()
	except urllib2.HTTPError,e:
		print e.code
		return None
	return html
	
# get the total number of blog list pages
def get_last_page(html,fd):
	if not html:
		self_log(u'Page error, stopping')
		return
	page = BeautifulSoup(html,'lxml')
	if page.find('div',class_ ='pagelist').find_all('a'):
		last_page=page.find('div',class_ ='pagelist').find_all('a')
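		# take the last <a> in the pagelist and keep only the final character of its href
		# as the page count, which assumes the number of pages is a single digit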
		last_page= last_page[len(last_page)-1].get('href')[-1:]
		self_log('There are %s pages of blog posts in total' % last_page)
		fd.write('There are %s pages of blog posts in total\n' % last_page)

		return last_page
	else:
		return 1
		
# get the points and rank info (the blog_rank block)
def get_rank(html,fd):
	if not html:
		self_log(u'Page error, stopping')
		return
	page = BeautifulSoup(html,'lxml')
	rank_list=[]
	if page.find(id='blog_rank'):
		
		rank_content=page.find(id='blog_rank')
		i =1
		for rank in rank_content.find_all('li'):
			if i<3:
				self_log(rank.text)
				fd.write(rank.text)
				fd.write('\n')
				rank_list.append(rank.text)
			i +=1
	return rank_list
	
# get the article list items from one page
def get_items(url):
	content_html=get_html(url)
	page = BeautifulSoup(content_html,'lxml')
	items = page.find_all('div',class_ ='list_item list_view')
	return items

# extract the needed fields from each list of items
def handle_items(items,content_list,read_num_for_sort):
	for item in items:
		temp={}	# temporary dict for this article
		
		title=item.find('a')	# title
		content_url='http://blog.csdn.net'+title.get('href')	# URL of this article
		read_times=item.find('span',class_ ='link_view').text.strip()	# read count
		comments_time=item.find('span',class_ ='link_comments')	# comment count
		
		read_number = int(filter(str.isdigit, str(read_times)))	# pull out the numeric read count, used for sorting later
		read_num_for_sort.append(read_number)

		# pack the data
		temp['indexs']=read_number
		temp['title']=title.text.strip()
		temp['read_times']=read_times
		temp['comments_time']=comments_time.text.strip()
		temp['content_url']=content_url
		content_list.append(temp)

# create the output folder
def mkdir_folder(path):
	if not os.path.exists(path):  
		os.makedirs(path) 

# main driver function
def run(url):
	read_num_for_sort=[]
	content_list=[]
	content_totle_list=[]
	
	# define the folder name and create the folder
	dir_path='count'
	mkdir_folder(dir_path)
	
	# define the output file name
	count_file_name=dir_path
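The listing breaks off at this point. As a hedged sketch only (not the post's original run() body), the helpers defined above could be wired together along these lines to cover every list page and sort once at the end, matching the goals listed for this version; the list-page URL pattern is taken from the fourth version:

def run_sketch(url):
	read_num_for_sort = []
	content_list = []

	# output folder and timestamped file, as in the earlier versions
	dir_path = 'count'
	mkdir_folder(dir_path)
	fname = dir_path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
	fd = open(fname, 'w')

	html = get_html(url)
	get_rank(html, fd)                          # points/rank block at the top of the page
	last_page = int(get_last_page(html, fd))    # number of list pages

	# walk every list page and collect the article entries
	for n in range(1, last_page + 1):
		page_url = 'http://blog.csdn.net/qiqiyingse/article/list/%d?viewmode=contents' % n
		handle_items(get_items(page_url), content_list, read_num_for_sort)

	self_log('Collected %d articles in total' % len(content_list))

	# sort the packed dicts directly by read count, highest first,
	# instead of matching against a separately sorted number list
	for a in sorted(content_list, key=lambda d: d['indexs'], reverse=True):
		line = a['title'] + '\t' + a['read_times'] + '\t' + a['comments_time'] + '\t' + a['content_url']
		fd.write(line + '\n')
	fd.close()

run_sketch('http://blog.csdn.net/qiqiyingse?viewmode=contents')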