# -*- coding: utf-8 -*-
__author__ = 'wangjingyao'
import urllib
import urllib2
import re
import sys
import threading, Queue, time
import user_agents,random,time
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 hack: make implicit str<->unicode conversions use UTF-8
_DATA = []  # scraped author/commenter names; appended to by all worker threads
FILE_LOCK = threading.Lock()  # file-write lock; apparently unused in this file
SHARE_Q = Queue.Queue()  # unbounded work queue of page URLs to crawl
_WORKER_THREAD_NUM = 10  # size of the crawler thread pool
class MyThread(threading.Thread):
    """Thread that runs a single injected callable and then exits."""

    def __init__(self, func):
        # Initialise the underlying Thread machinery before storing state.
        threading.Thread.__init__(self)
        # The callable holding the thread's work logic.
        self.func = func

    def run(self):
        # All work is delegated to the injected callable.
        self.func()
def worker():
    """Drain SHARE_Q: download each queued URL and scrape its page.

    Runs until the queue is empty, then returns so the thread exits.

    BUG FIX: the original checked ``SHARE_Q.empty()`` and then called the
    blocking ``SHARE_Q.get()``.  With 10 workers the queue can drain
    between those two calls, leaving a thread blocked forever.  A
    non-blocking get with ``Queue.Empty`` handling removes the race.
    """
    while True:
        try:
            url = SHARE_Q.get_nowait()  # non-blocking; raises when drained
        except Queue.Empty:
            break
        try:
            my_page = get_page(url)
            getPageItems(my_page)  # collect author/commenter names from this page
            time.sleep(1)  # be polite to the server between requests
        finally:
            # Always balance get() with task_done() -- even if scraping
            # raised -- so SHARE_Q.join() in main() cannot deadlock.
            SHARE_Q.task_done()
def get_page(url):
    """Download ``url`` and return its HTML as a GBK-encoded byte string.

    Args:
        url: the page URL to fetch.
    Returns:
        The page body re-encoded to GBK (downstream code decodes with
        'gbk'), or None when the request fails.
    Raises:
        Nothing -- urllib2.URLError is caught and reported to stdout.
    """
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            raw = response.read()
        finally:
            response.close()  # don't leak the socket on success or error
        # BUG FIX: calling .encode() on a Python 2 byte string implicitly
        # ASCII-decodes it first; the original only worked because of the
        # sys.setdefaultencoding('utf8') hack.  Decode the page explicitly
        # (site presumably serves UTF-8 -- TODO confirm) and re-encode to
        # GBK as the rest of the pipeline expects.
        return raw.decode('utf-8', 'ignore').encode('gbk', 'ignore')
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print("The server couldn't fulfill the request.")
            print("Error code: %s" % e.code)
        elif hasattr(e, "reason"):
            print("We failed to reach a server. Please check your url and read the Reason")
            print("Reason: %s" % e.reason)
        return None
def getPageItems(pageCode):
    """Extract author and commenter names from one page's HTML.

    Every matched name is appended to the module-level _DATA list.

    Args:
        pageCode: GBK-encoded HTML of a topic page, or None when the
            download failed.
    Returns:
        None (returns early when pageCode is falsy).
    """
    if not pageCode:
        print('pageCode init error')
        return None
    # Authors: <span itemprop="author">NAME</span>
    author_pattern = re.compile('<span itemprop="author">(.*?)</span>')
    for item in author_pattern.findall(pageCode):
        _DATA.append(item)
    print("authorSpider------")
    # Commenters, skipping the site-owner account.
    comment_pattern = re.compile('<div class="comment-detail"><a href=".*?">(.*?)</a>')
    for itemcomment in comment_pattern.findall(pageCode):
        # BUG FIX: compare unicode to an explicit u'' literal.  The
        # original compared unicode against a UTF-8 byte string, which
        # only works via the fragile sys.setdefaultencoding('utf8') hack.
        if itemcomment.decode('gbk') != u'极客漫游者':
            _DATA.append(itemcomment)
    print("commentSpider------")
def main() :
global SHARE_Q
threads = []
gkgy_url ="http://www.geekpark.net/topics/{page}"
#向队列中放入任务, 真正使用时, 应该设置为可持续的放入任务
for index in xrange(210714,213394) :
SHARE_Q.put(gkgy_url.format(page = index))
for i in xrange(_WORKER_THREAD_NUM) :
thread = MyThread(worker)
thread.start() #线程开始处理任务
threads.append(thread)
for thread in threads :
thread.join()
SHARE_Q.join()
_DATAs=list(set(_DATA))
with open("outGKGY.txt", "w+") as my_file :
for page in _DATAs :
my_file.write(page + "\t")
print "Spider Successful!!!"
# Script entry point: run the spider only when executed directly.
if __name__ == '__main__': main()