# -*- coding: utf-8 -*-
__author__ = 'wangjingyao'
import urllib2
import re
import sys
import threading, Queue
import time
# reload() restores sys.setdefaultencoding, which the interpreter removes
# from the module at startup; then force the process-wide default codec.
reload(sys)
sys.setdefaultencoding('utf8')  # set the default string encoding
_DATA = []  # collected results, one 'url:<url>_____<title>' string per page
FILE_LOCK = threading.Lock()  # NOTE(review): acquired nowhere below -- confirm it is needed
SHARE_Q = Queue.Queue()  # unbounded queue of urls (one task per line of the input file)
_WORKER_THREAD_NUM = 10  # number of worker threads
URLS = []  # NOTE(review): never used below -- confirm before removing
class MyThread(threading.Thread):
    """Thin Thread subclass that runs an arbitrary callable.

    The callable is stored at construction time and invoked, with no
    arguments, when the thread starts.
    """

    def __init__(self, func):
        # Initialise the base Thread before stashing the work function.
        threading.Thread.__init__(self)
        self.func = func

    def run(self):
        # Delegate the thread body entirely to the stored callable.
        self.func()
def worker():
    """Drain SHARE_Q: fetch each queued url and record its page title.

    Runs until the queue is empty, sleeping one second between requests
    to throttle the crawl.  Uses a non-blocking get: the original
    empty()-then-get() pair let two threads race past the empty() check,
    after which the loser blocked forever in Queue.get() because the
    last item was already taken.
    """
    while True:
        try:
            url = SHARE_Q.get_nowait()  # never block: queue may have just drained
        except Queue.Empty:
            break
        my_page = get_page('http://' + url)
        getPageItems(my_page, url)  # extract the page's title into _DATA
        time.sleep(1)  # politeness delay between requests
        SHARE_Q.task_done()
def get_page(url) :
"""
根据所给的url爬取网页HTML
Args:
url: 表示当前要爬取页面的url
Returns:
返回抓取到整个页面的HTML(unicode编码)
Raises:
URLError:url引发的异常
"""
try :
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36'
headers={'User-Agent' : user_agent}
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
my_page = response.read().encode('gbk','ignore')
return my_page
except urllib2.URLError, e :
if hasattr(e, "code"):
print "The server couldn't fulfill the request."
print "Error code: %s" % e.code
return None
elif hasattr(e, "reason"):
print "We failed to reach a server. Please check your url and read the Reason"
print "Reason: %s" % e.reason
return None
def getPageItems(pageCode,url) :
"""
通过返回的整个网页HTML, 筛选title
Args:
my_page: 传入页面的HTML文本用于正则匹配
"""
if not pageCode:
print 'pageCode init error'
return None
# title爬取
# re.S 匹配多行
title = re.search(r'<title>(.*?)</title>',pageCode,re.S)
if title:
_title=title.group().decode('gbk')
_DATA.append('url:'+url+"_____"+_title)
else:
_DATA.append('url:'+url+"_____"+"title is none")
def main() :
global SHARE_Q
threads = []
f = open("D://test.txt", "r")
#向队列中放入任务
while True:
line = f.readline()
if line:
SHARE_Q.put(line) # do something here
else:
break
f.close()
for i in xrange(_WORKER_THREAD_NUM) :
thread = MyThread(worker)
thread.start() #线程开始处理任务
threads.append(thread)
for thread in threads :
thread.join()
SHARE_Q.join()
_DATAs=list(set(_DATA))
with open("D://title_script.txt", "w+") as my_file :
for i in range(len(_DATAs)):
my_file.write(_DATAs[i] + "\r\n")
my_file.write("\r\n")
print "Spider Successful!!!"
# Script entry point: only crawl when executed directly, not on import.
if __name__ == '__main__':
    main()
------------------------------------------------------------以上存在页面返回字符编码不是utf8的情况 解析不出title
程序员学习公众号:
# -*- coding: utf-8 -*- __author__ = 'wangjingyao' import urllib2 import re import sys import threading, Queue import time reload(sys) sys.setdefaultencoding('utf8')#设置默认编码 _DATA = [] FILE_LOCK = threading.Lock() SHARE_Q = Queue.Queue() #构造一个不限制大小的的队列 _WORKER_THREAD_NUM = 10 #设置线程的个数 class MyThread(threading.Thread) : def __init__(self, func) : super(MyThread, self).__init__() #调用父类的构造函数 self.func = func #传入线程函数逻辑 def run(self) : self.func() def worker() : global SHARE_Q while not SHARE_Q.empty(): url = SHARE_Q.get() #获得任务 my_page = get_page('http://'+url) getPageItems(my_page,url) #获得当前页面的title time.sleep(1) SHARE_Q.task_done() def encoding(data): types = ["ASCII","utf-8","gb2312","GBK","iso-8859-1"] #可以添加其他字符编码 for type in types: try: return unicode(data,type,'ignore')#data.decode(type) except: pass return None def get_page(url) : """ 根据所给的url爬取网页HTML Args: url: 表示当前要爬取页面的url Returns: 返回抓取到整个页面的HTML(unicode编码) Raises: URLError:url引发的异常 """ try : req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept':'text/html;q=0.9,*/*;q=0.8', 'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Connection':'close', 'Referer':None } request = urllib2.Request(url,headers = req_header) response = urllib2.urlopen(request) _re_data=response.read() # 判断返回字符编码 import chardet _charset = chardet.detect(_re_data)['encoding'] # g根据对应编码解码在转换成utf-8 if _charset: ssss = _re_data.decode(_charset,'ignore') return ssss.encode('utf-8') response.close() except Exception,e: print e return None def getPageItems(pageCode,url) : """ 通过返回的整个网页HTML, 筛选title Args: my_page: 传入页面的HTML文本用于正则匹配 """ if not pageCode: print 'pageCode init error' return None # title爬取 # re.S 匹配多行 title = re.search(r'<title>(.*?)</title>',pageCode,re.S) if title: _title=title.group().decode('utf-8') _DATA.append('url:'+url+"_____"+_title) else: _DATA.append('url:'+url+"_____"+"title is none") def main() : global SHARE_Q threads = [] f = 
open("D://url.txt", "r") #向队列中放入任务 while True: line = f.readline() if line: SHARE_Q.put(line) # do something here else: break f.close() for i in xrange(_WORKER_THREAD_NUM) : thread = MyThread(worker) thread.start() #线程开始处理任务 threads.append(thread) for thread in threads : thread.join() SHARE_Q.join() _DATAs=list(set(_DATA)) with open("D://title_script_new.txt", "w+") as my_file : for i in range(len(_DATAs)): my_file.write(_DATAs[i] + "\r\n") my_file.write("\r\n") print "Spider Successful!!!" if __name__ == '__main__': main()