After some searching on Baidu, I added a thread pool today; yesterday's problem was most likely that all of the threads were blocked.
The crawler now runs for four or five minutes before failing with [Errno 11004] getaddrinfo failed, a DNS lookup error that is probably triggered by firing too many concurrent requests.
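One mitigation worth trying next (a sketch of my own, not part of the script below): wrap each request in a small retry loop with exponential backoff, so that a transient DNS hiccup does not kill the whole job. The helper name open_with_retry and its parameters are hypothetical.

import time
import urllib.request
import urllib.error

def open_with_retry(opener, url, retries=3, delay=1.0):
    # Hypothetical helper: [Errno 11004] surfaces as urllib.error.URLError,
    # so sleep and retry a few times before giving up.
    for attempt in range(retries):
        try:
            return opener.open(url).read()
        except urllib.error.URLError:
            if attempt == retries - 1:
                raise  # still failing after the last attempt
            time.sleep(delay * (2 ** attempt))  # back off: 1s, 2s, 4s, ...

The full script, with the thread pool added, follows: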
import urllib.request
import urllib.parse
import time
import os
import threading
import queue
import bs4
from bs4 import BeautifulSoup
import shutil
import errno
import sys
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'gzip',
           'Connection': 'close',
           'Referer': None  # note: if fetching still fails, set this to the target site's host
           }
#############################################################
# working thread
class Worker(threading.Thread):
    worker_count = 0
    def __init__(self, workQueue, resultQueue, timeout=1, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        self.daemon = True  # setDaemon() is deprecated; assign the attribute instead
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.timeout = timeout
        self.start()
    def run(self):
        ''' the get-some-work, do-some-work main loop of worker threads '''
        while True:
            try:
                # get() without a timeout would block forever, so queue.Empty
                # could never fire
                func, args, kwds = self.workQueue.get(timeout=self.timeout)
                res = func(*args, **kwds)  # renamed from 'callable' to avoid shadowing the builtin
                #print("worker[%2d]: %s" % (self.id, str(res)))
                #self.resultQueue.put(res)
            except queue.Empty:
                pass
                #break
            except Exception:
                print('worker[%2d]' % self.id, sys.exc_info()[:2])
class WorkerManager:
    def __init__(self, num_of_workers=10, timeout=1):
        self.workQueue = queue.Queue()
        self.resultQueue = queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)
    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue, self.timeout)
            self.workers.append(worker)
    def wait_for_complete(self):
        # ...then, wait for each of them to terminate:
        while len(self.workers):
            worker = self.workers.pop()
            # join() needs a timeout here, otherwise the alive-check below is unreachable
            worker.join(self.timeout)
            if worker.is_alive() and not self.workQueue.empty():  # isAlive() was removed in Python 3.9
                self.workers.append(worker)
        print("All jobs are completed.")
    def add_job(self, func, *args, **kwds):
        self.workQueue.put((func, args, kwds))
    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)
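# A minimal usage sketch of the pool (an addition of mine, never called by the
# crawler itself): submit a few jobs and block until the queue drains.
def _demo_worker_manager():
    wm = WorkerManager(num_of_workers=4)
    for i in range(8):
        wm.add_job(print, 'job', i)  # any callable plus its positional arguments
    wm.wait_for_complete()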
#############################################################
class Fetcher:
    def __init__(self, manager):
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
        self.lock = threading.Lock()    # thread lock
        self.q_req = queue.Queue()      # request queue
        self.q_ans = queue.Queue()      # result queue
        self.__q_retry = queue.Queue()  # retry queue
        #self.threads_num = threads_num + 1  # the extra 1 is the retry thread
        #self.__threads = []
        # retry job (bound methods carry self, so no extra argument is needed)
        manager.add_job(self.threadretry)
        #self.__retry_thread = threading.Thread(target=self.threadretry)
        #self.__threads.append(self.__retry_thread)
        #self.__retry_thread.setDaemon(True)
        #self.__retry_thread.start()
        # fetch job (originally @threads_num worker threads)
        manager.add_job(self.threadget)
        #for i in range(threads_num):
        #    t = threading.Thread(target=self.threadget)
        #    self.__threads.append(t)
        #    t.setDaemon(True)
        #    t.start()
        self.running = 0
    #def __del__(self):  # on destruction, wait for the queues to drain
    #    print("will delete")
    #    self.q_req.join()
    #    self.q_ans.join()
    #    self.__q_retry.join()
    #    print("deleted")
    #    for i in range(self.threads_num+1):
    #        self.__threads[i].exit()
    #    print("there are still " + str(threading.active_count()) + " active threads")
    def taskleft(self):
        # note: requests sitting in the retry queue are not counted here
        return self.q_req.qsize() + self.q_ans.qsize() + self.running
    def push(self, req):
        self.q_req.put(req)
    def pop(self):
        return self.q_ans.get()
    def threadretry(self):
        while True:
            try:
                req = self.__q_retry.get(timeout=1)  # get(block=True, timeout=None)
            except queue.Empty:
                # a bare break here would kill the retry thread after its first
                # idle second; only quit once no request can still fail
                if self.q_req.empty() and self.running == 0:
                    break
                continue
            with self.lock:  # self.running must be updated atomically
                self.running += 1
            try:
                ans = self.opener.open(req).read()
            except urllib.error.URLError as e:
                ans = ''
                if hasattr(e, 'reason'):
                    print('We failed to reach a server.')
                    print('Reason: ', e.reason)
                elif hasattr(e, 'code'):
                    print('The server cannot fulfill the request.')
                    print('Reason: ', e.code)
            else:
                if ans:
                    self.q_ans.put((req, ans))
            finally:
                with self.lock:
                    self.running -= 1
                self.__q_retry.task_done()
    def threadget(self):
        while True:
            try:
                req = self.q_req.get(timeout=1)
            except queue.Empty:
                break
            with self.lock:  # keep the update atomic: this is the critical section
                self.running += 1
            try:
                ans = self.opener.open(req).read()
            except urllib.error.URLError as e:
                ans = ''
                if hasattr(e, 'reason'):
                    print('We failed to reach a server.')
                    print('Reason: ', e.reason)
                    self.__q_retry.put(req)
                elif hasattr(e, 'code'):
                    print('The server cannot fulfill the request.')
                    print('Reason: ', e.code)
                    self.__q_retry.put(req)
            else:
                if ans:
                    self.q_ans.put((req, ans))
                else:
                    self.__q_retry.put(req)
            finally:
                with self.lock:
                    self.running -= 1
                self.q_req.task_done()
def create_dir(userid, domain='qiushibaike'):
    dir_name = domain + '/' + userid
    try:
        os.makedirs(dir_name)  # makedirs also creates the parent directory on the first run
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(dir_name):
            pass
        else:
            print(str(e))
def userid_exist(userid):
    return os.path.isdir('qiushibaike' + '/' + userid)
def get_file_name(userid):
    current_time = time.strftime("%Y-%m-%d", time.localtime())
    return 'qiushibaike' + '/' + userid + '/' + current_time + '.txt'
def write_file(file, soup):
    count = 0
    for ii in soup.find_all("div", class_="content clearfix"):
        #print(ii.a["href"])
        #print(ii.a.text)
        if ii.a.text:
            count += 1
            file.write(bytes(ii.a["href"], encoding="utf-8"))
            file.write(bytes('\r\n', encoding="utf-8"))
            file.write(bytes(ii.a.text, encoding="utf-8"))
            file.write(bytes("\r\n\r\n", encoding="utf-8"))
    return count
def get_max_page(soup):
    #ii = bs4.element.Tag()
    num = 0
    for jj in soup.find_all('a', rel="next", class_=None):
        num = int(jj.text)
    return num
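# A tiny self-check for get_max_page() (an addition of mine, using a made-up
# snippet): the last rel="next" link without a class holds the page count,
# so this should print 17.
def _demo_get_max_page():
    html = '<a rel="next" href="/2">2</a><a rel="next" href="/17">17</a>'
    print(get_max_page(BeautifulSoup(html, "html.parser")))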
def store_this_user(userid, manager):
    if userid_exist(userid):
        print("This user seems to have been crawled already")
        return
    create_dir(userid)
    file_name = get_file_name(userid)
    file = open(file_name, 'wb')
    ff = Fetcher(manager)
    ff.push('http://www.qiushibaike.com/users/' + userid)
    req, ans = ff.pop()
    soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
    user_name = ""
    for ii in soup.find_all('span', class_="user_center"):
        user_name = ii.text
    if not user_name:
        del ff
        file.close()
        return
    # write the first page to the file
    count = write_file(file, soup)
    print(user_name + " " + str(count) + " posts [http://www.qiushibaike.com/users/" + userid + "/articles/page/1]")
    # request all of the remaining pages
    max_page = get_max_page(soup) + 1
    for i in range(2, max_page):
        #print("queued [http://www.qiushibaike.com/users/" + userid + "/articles/page/" + str(i) + "]")
        ff.push("http://www.qiushibaike.com/users/" + userid + "/articles/page/" + str(i))
    while ff.taskleft():
        req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
        count = write_file(file, soup)
        print(user_name + " " + str(count) + " posts [" + req + "]")
    print(user_name + "'s fetcher resources released")
    del ff
    file.close()
    return
def main():
    #os.mkdir('qiushibaike')
    #store_this_user("13843355")
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
    worker_manager = WorkerManager(128)
    #manager = WorkerManager(128)
    #ff = Fetcher(worker_manager)
    next_link = 'http://www.qiushibaike.com'
    #ff.push(next_link)
    page_num = 0
    while True:
        page_num += 1
        print("advancing to page " + str(page_num) + " " + next_link)
        ans = opener.open(next_link).read()
        next_link = ""
        #req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
        for ii in soup.find_all('a', class_="next", text="下一页"):  # "下一页" means "next page"
            next_link = ii["href"]
        if not next_link:
            print("no next-page link found, stopping")
            break
        next_link = "http://www.qiushibaike.com" + next_link
        #ff.push(next_link)
        for ii in soup.find_all('div', class_="author"):
            print(ii.a["href"].split('/')[2])
            store_this_user(ii.a["href"].split('/')[2], worker_manager)
'''
file.close()
ff = Fetcher(10)
ff.push('http://www.qiushibaike.com/users/14870461')
req, ans = ff.pop()
print(ans.decode('utf8'))
#os.system("pause")
testgbk = '汉字'
testunit = testgbk.encode('gbk')    # encode Chinese characters to GBK
print(testunit)
testutf8 = testgbk.encode('utf-8')  # convert to UTF-8 encoding
print(testutf8)
testunit = testutf8.decode('utf-8') # decode from UTF-8
print(testunit)
testgbk = testunit.encode('gbk')    # convert back to GBK encoding
print(testgbk)
'''
'''
links = ['http://item.jd.com/%d.html' % i for i in range(1746854, 1746860)]
ff = Fetcher(10)
for url in links:
    ff.push(url)
while ff.taskleft():
    (url, content) = ff.pop()
    print(url, len(content))
'''
'''
url = 'http://www.sina.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'Michael Foord',
          'location': 'pythontab',
          'language': 'Python'}
headers = {'User-Agent': user_agent}
data = urllib.parse.urlencode(values)
#req = urllib.request.Request(url, data, headers)
req = urllib.request.Request('http://www.baidu.com')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
'''
'''
##################################
def cbk(a, b, c):
    # progress callback
    # @a: blocks transferred so far
    # @b: size of each block
    # @c: total size of the remote file
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    num = int(per)
    print('[', end='')
    for i in range(num):
        print('#', end='')
    print('%.2f]' % per, end='')
url = 'http://www.sina.com.cn'
local = 'e:\\sina.html'
urllib.request.urlretrieve(url, local, cbk)
input()
os.system("pause")
##################################
'''
'''
try:
    response = urllib.request.urlopen(req)
except urllib.error.URLError as e:
    if hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    elif hasattr(e, 'code'):
        print('The server cannot fulfill the request.')
        print('Reason: ', e.code)
else:
    #print(response.info)
    #print(response.getcode())
    response_context = response.read()
    print(response_context.decode("utf8"))
'''
if __name__ == "__main__":
    main()