Counting the Number of Articles on a Personal CSDN Blog
Version 1
The original version is fairly simple: it only counts the first page of articles, and it does not sort them.
# coding:utf-8
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pose as a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    totalList = []
    contentList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
        return totalList, contentList  # bail out here, otherwise html is unbound below
    fd = open('counter.txt', 'w')
    page = BeautifulSoup(html, 'lxml')
    mytimes = page.find(id='blog_rank')
    i = 1
    for aa in mytimes.find_all('li'):  # the first two <li> entries hold visits and rank
        if i < 3:
            print aa.text
            fd.write(aa.text)
            fd.write('\n')
            totalList.append(aa.text)
        i += 1
    items = page.find_all('div', class_='list_item list_view')
    print 'There are %d articles in total' % len(items)
    for item in items:
        content = item.find('a')  # title link
        read_time = item.find('span', class_='link_view')  # view count
        comments_time = item.find('span', class_='link_comments')  # comment count
        totalcontent = content.text.strip() + read_time.text.strip() + comments_time.text.strip()
        print totalcontent
        contentList.append(totalcontent)
        fd.write(totalcontent)
        fd.write('\n')
    fd.close()
    return totalList, contentList

urls = getPage()
Version 2
One more version. This one can sort the articles directly by view count.
Code updated on 2017-04-11. Changes in this update: the statistics are now written into a folder created under the script's directory, into a text file named after the current time, so each run no longer overwrites the previous run's data.
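A minimal standalone sketch of that timestamped-output pattern (the folder name matches the real code below; the payload line is a placeholder):

# coding:utf-8
import os, datetime

path = 'count'  # output folder created next to the script
if not os.path.exists(path):
    os.makedirs(path)
# one file per run, named after the current time, so runs never overwrite each other
fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
fd = open(fname, 'w')
fd.write('statistics go here\n')  # placeholder payload
fd.close()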
The full version 2 code:
# coding:utf-8
import urllib2, re, datetime, os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pose as a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    totalList = []
    contentList = []
    sortlist = []
    sortlist1 = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
        return contentList  # bail out here, otherwise html is unbound below
    path = 'count'
    if not os.path.exists(path):
        os.makedirs(path)
    # one output file per run, named after the current time
    fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    fd = open(fname, 'w')
    page = BeautifulSoup(html, 'lxml')
    mytimes = page.find(id='blog_rank')
    i = 1
    for aa in mytimes.find_all('li'):  # the first two <li> entries hold visits and rank
        if i < 3:
            print aa.text
            fd.write(aa.text)
            fd.write('\n')
            totalList.append(aa.text)
        i += 1
    items = page.find_all('div', class_='list_item list_view')
    print 'There are %d articles in total' % len(items)
    fd.write('There are %d articles in total' % len(items))
    fd.write('\n')
    for item in items:
        aa = {}
        content = item.find('a')  # title link
        contemtUrl = baseurl + content.get('href')  # full article URL
        read_time = item.find('span', class_='link_view')
        tmp = str(read_time.text.strip())
        number = int(filter(str.isdigit, tmp))  # pull the digits out of the view-count text for sorting
        sortlist1.append(number)
        comments_time = item.find('span', class_='link_comments')
        aa['indexs'] = number
        aa['content'] = content.text.strip()
        aa['read_time'] = tmp
        aa['comments_time'] = comments_time.text.strip()
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print sortlist1
    # print the articles in ascending view-count order by matching the sorted
    # numbers back against the dicts (equal view counts produce duplicate lines;
    # version 5 fixes this)
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['comments_time'] + '\t' + a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                contentList.append(totalcontent)
    fd.close()
    return contentList

urls = getPage()
Version 3
This version is more interesting: after counting and sorting, it also opens the article links in browser tabs over and over.
# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pose as a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    contentList = []
    sortlist = []
    sortlist1 = []
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
        return urlList  # bail out here, otherwise html is unbound below
    path = u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    print fname
    fd = open(fname, 'w')
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    print 'There are %d articles in total' % len(items)
    fd.write('There are %d articles in total' % len(items))
    fd.write('\n')
    for item in items:
        aa = {}
        content = item.find('a')  # title link
        contemtUrl = baseurl + content.get('href')  # full article URL
        read_time = item.find('span', class_='link_view')
        readtime = str(read_time.text.strip())
        readtimeNumber = int(filter(str.isdigit, readtime))  # numeric view count, for sorting
        sortlist1.append(readtimeNumber)
        aa['indexs'] = readtimeNumber
        aa['content'] = content.text.strip()
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print sortlist1
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)
    fd.close()
    return urlList

urls = getPage()
count = random.randint(10, 50)
print 'Number of times the browser will be opened and closed:', count
for i in range(5):
    print urls[i]
j = 0
while j < count:
    # every 15th round, open a fixed batch of five articles
    # (the original reset j to 0 here, which never terminates once count > 15)
    if j != 0 and j % 15 == 0:
        for i in range(5):
            web.open_new_tab(urls[i + 38])  # assumes the blog has at least 43 articles
            time.sleep(1)
    web.open_new_tab(urls[random.randint(1, 44)])  # assumes at least 45 articles
    time.sleep(1)
    web.open_new_tab('http://blog.csdn.net/qiqiyingse/article/details/51801918')
    time.sleep(3)
    os.system('taskkill /f /IM Chrome.exe')  # Windows-only; assumes Chrome is the default browser
    j = j + 1
Version 4
This update handles blogs that have grown past 50 articles, whose list spills onto a second page. The script now fetches two pages, although it is still hard-coded to exactly two; a generalized multi-page sketch follows, and the two-page code comes after it.
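The fetch-and-parse step generalizes to any number of list pages. A hedged sketch of that loop (page_count is assumed known in advance here, whereas version 5 below derives it from the pager element):

# coding:utf-8
import urllib2
from bs4 import BeautifulSoup

def get_list_items(page_count):
    # collect the article entries from list pages 1..page_count
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    items = []
    for n in range(1, page_count + 1):
        url = 'http://blog.csdn.net/qiqiyingse/article/list/%d?viewmode=contents' % n
        req = urllib2.Request(url=url, headers=headers)
        html = urllib2.urlopen(req).read()
        page = BeautifulSoup(html, 'lxml')
        items += page.find_all('div', class_='list_item list_view')
    return items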
# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pose as a browser and fetch the page source
    url1 = 'http://blog.csdn.net/qiqiyingse/article/list/1?viewmode=contents'
    url2 = 'http://blog.csdn.net/qiqiyingse/article/list/2?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    contentList = []
    sortlist = []
    sortlist1 = []
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req1 = urllib2.Request(url=url1, headers=headers)
    req2 = urllib2.Request(url=url2, headers=headers)
    try:
        html1 = urllib2.urlopen(req1).read()
        html2 = urllib2.urlopen(req2).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
        return urlList  # bail out here, otherwise html1/html2 are unbound below
    path = u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    print fname
    fd = open(fname, 'w')
    page1 = BeautifulSoup(html1, 'lxml')
    page2 = BeautifulSoup(html2, 'lxml')
    items1 = page1.find_all('div', class_='list_item list_view')
    items2 = page2.find_all('div', class_='list_item list_view')
    cont_print = 'There are %d articles in total' % (len(items1) + len(items2))
    print cont_print
    fd.write(cont_print)
    fd.write('\n')
    for item in items1 + items2:  # both pages share one loop body
        aa = {}
        content = item.find('a')  # title link
        contemtUrl = baseurl + content.get('href')  # full article URL
        read_time = item.find('span', class_='link_view')
        readtime = str(read_time.text.strip())
        readtimeNumber = int(filter(str.isdigit, readtime))  # numeric view count, for sorting
        sortlist1.append(readtimeNumber)
        aa['indexs'] = readtimeNumber
        aa['content'] = content.text.strip()
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print sortlist1
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)
    fd.close()
    return urlList

urls = getPage()
Version 5
This version restructures the whole script:
1. Each part is easier to read.
2. It counts every post under an account, no matter how many articles or pages the blog has.
3. The sorting logic is updated, which fixes a bug in the earlier versions (see the sketch below).
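For reference, the sort fix amounts to sorting the per-article dicts directly by their numeric view count, instead of matching a sorted list of numbers back against the dicts: the old nested loops printed an article once per matching value, so duplicates appeared whenever two articles had the same number of views. A minimal sketch (the data is made up):

articles = [
    {'indexs': 120, 'title': 'post a'},
    {'indexs': 35,  'title': 'post b'},
    {'indexs': 120, 'title': 'post c'},
]
# sort the records themselves by view count; equal counts no longer duplicate entries
for a in sorted(articles, key=lambda d: d['indexs']):
    print a['title'], a['indexs']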
The full code:
# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# custom print helper: prefix every message with a timestamp
def self_log(msg):
    print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)

# fetch the page source
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        return None  # callers check for an empty result
    return html

# get the total number of blog list pages
def get_last_page(html, fd):
    if not html:
        self_log('page error, stopping')
        return
    page = BeautifulSoup(html, 'lxml')
    if page.find('div', class_='pagelist').find_all('a'):
        last_page = page.find('div', class_='pagelist').find_all('a')
        # the href of the last pager link ends with the last page number
        # (note: [-1:] keeps only one character, so this breaks past page 9)
        last_page = last_page[len(last_page) - 1].get('href')[-1:]
        self_log('There are %s pages of posts in total' % last_page)
        fd.write('There are %s pages of posts in total\n' % last_page)
        return last_page
    else:
        return 1

# get the points/rank block (visit count and rank)
def get_rank(html, fd):
    if not html:
        self_log('page error, stopping')
        return
    page = BeautifulSoup(html, 'lxml')
    rank_list = []
    if page.find(id='blog_rank'):
        rank_content = page.find(id='blog_rank')
        i = 1
        for rank in rank_content.find_all('li'):  # the first two <li> entries hold visits and rank
            if i < 3:
                self_log(rank.text)
                fd.write(rank.text)
                fd.write('\n')
                rank_list.append(rank.text)
            i += 1
    return rank_list

# get the article list of one page
def get_items(url):
    content_html = get_html(url)
    if not content_html:  # guard against a failed fetch
        return []
    page = BeautifulSoup(content_html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    return items

# extract the needed fields from one page's item list
def handle_items(items, content_list, read_num_for_sort):
    for item in items:
        temp = {}  # per-article record
        title = item.find('a')  # title
        content_url = 'http://blog.csdn.net' + title.get('href')  # URL of the article behind the title
        read_times = item.find('span', class_='link_view').text.strip()  # view count
        comments_time = item.find('span', class_='link_comments')  # comment count
        read_number = int(filter(str.isdigit, str(read_times)))  # numeric view count, extracted for the later sort
        read_num_for_sort.append(read_number)
        # pack the record
        temp['indexs'] = read_number
        temp['title'] = title.text.strip()
        temp['read_times'] = read_times
        temp['comments_time'] = comments_time.text.strip()
        temp['content_url'] = content_url
        content_list.append(temp)

# create the output folder
def mkdir_folder(path):
    if not os.path.exists(path):
        os.makedirs(path)

# main function
def run(url):
    read_num_for_sort = []
    content_list = []
    content_totle_list = []
    # name and create the output folder
    dir_path = 'count'
    mkdir_folder(dir_path)
    # build the output file name
    count_file_name = dir_path