__author__ = 'change'
# coding=utf-8
"""
** Python Blog's Visit Count V2.0
** (V1.0 http://blog.csdn.net/change518/article/details/14108511)
** By change
** 2015.11.4
** http://blog.csdn.net/change518
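** First, walk the article list pages and extract the address of every blog post.
** Then build HTTP requests to visit those addresses, using threads to speed things up.
** Every post on the blog is visited in turn, which is how the visit count gets bumped.
** Because of caching, the visit count only updates after some time.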
"""
import urllib2
import re
import datetime
import Queue
import threading
# Time at which the run started (compared against the end time at exit)
startTime = datetime.datetime.now()
# Number of worker threads, and the list that will hold the Thread objects
threadNum = 10
threadList = []
# Relative paths of every article link found on the list pages
myList = []
# The same links as absolute URLs, consumed by the worker threads
myLinks = Queue.Queue()
# Request headers sent with every request: a desktop-browser User-Agent
requestHeader = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/46.0.2490.80 Safari/537.36"
    ),
}
# Compiled once, outside the loop. The capture group yields the relative
# article path from each title link, so a single findall() replaces the
# previous match / join / re-match sequence.
myPattern = re.compile(
    r'<span class="link_title"><a href="(/change518/article/details/\d{7,8})">')
# Read the paginated article list (pages 1 through 4)
for i in range(1, 5):
    url = "http://blog.csdn.net/change518/article/list/" + str(i) + "?viewmode=contents"
    request = urllib2.Request(url, headers=requestHeader)
    response = urllib2.urlopen(request)
    try:
        htmlResult = response.read()
    finally:
        # Release the connection even if reading the body fails
        response.close()
    myList += myPattern.findall(htmlResult)
# Put every collected link into the queue as an absolute URL
for linkAddress in myList:
    myLinks.put('http://blog.csdn.net' + linkAddress)
def mySpiderThread(j):
    """
    Worker loop: take URLs from the shared ``myLinks`` queue until it is
    empty, requesting each URL 10 times.

    :param j: worker index, used only to tag the progress output
    :return: None, once the queue has no more links
    """
    while True:
        # Checking empty() and then calling a blocking get() is racy: another
        # worker can take the last item in between, leaving this thread
        # blocked forever and the main thread stuck in join(). A non-blocking
        # get makes the "take one or stop" step atomic.
        try:
            singleLink = myLinks.get_nowait()
        except Queue.Empty:
            return
        request = urllib2.Request(singleLink, headers=requestHeader)
        print(singleLink + " :" + str(j))
        for _ in range(10):
            # Close each response right away so connections are not left open
            urllib2.urlopen(request).close()
# Create threadNum worker threads, each tagged with its own index
threadList.extend(
    threading.Thread(target=mySpiderThread, args=(workerId,))
    for workerId in range(threadNum)
)
# Start every worker
for worker in threadList:
    worker.start()
# Block until all workers have finished
for worker in threadList:
    worker.join()
"""
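If nothing has to happen after all the threads finish (for example, timing the whole run),
the thread creation/start/join code above can also be written as:
# Start threadNum threads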
for i in range(threadNum):
t = threading.Thread(target=mySpiderThread, args=(i,))
t.start()
"""
print('Done')
# Record the time at which the run finished
endTime = datetime.datetime.now()
# Report the elapsed time in whole seconds. timedelta.seconds holds only the
# sub-day remainder, so total_seconds() is used to stay correct for runs that
# last a day or longer; int() keeps the output an integer as before.
print(int((endTime - startTime).total_seconds()))
.