#目前问题:爬一会就报“RuntimeError: can't start new thread”错误
#Python的语法有些不太适应,这两天从网上搬了些代码组了个小爬虫,把糗事百科的段子按用户ID分类写入到文件
import urllib.request
import urllib.parse
import time
import os
import threading
import queue
import bs4
from bs4 import BeautifulSoup
import shutil
import errno
# Default request headers for crawling. NOTE(review): this dict is never
# passed to the Fetcher below, which installs only its own User-Agent on
# the opener — presumably kept for manual urllib requests; confirm before
# removing.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept':'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding':'gzip',
'Connection':'close',
'Referer':None # if fetching still fails, this can be set to the target site's host
}
class Fetcher:
    """Small thread-pooled HTTP fetcher with a one-shot retry pass.

    Usage: push() request URLs, pop() (url, body) result tuples, and call
    close() when done (``__del__`` also calls it as a best-effort).

    Bug fixes vs. the original:
    - The worker/retry threads looped forever with no way to stop, so every
      Fetcher instance leaked its threads.  Because store_this_user() creates
      a fresh Fetcher per user, threads accumulated until the OS refused to
      create more ("RuntimeError: can't start new thread").  close() now
      shuts the pool down via sentinel values on the queues.
    - __del__ iterated ``range(self.threads_num + 1)`` over a list holding
      only ``threads_num`` threads, raising IndexError.
    - Deprecated ``setDaemon(True)`` replaced with the ``daemon`` kwarg.
    """

    _STOP = None  # sentinel placed on a queue to tell one worker to exit

    def __init__(self, threads_num):
        """Start ``threads_num`` fetch workers plus one retry worker."""
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
        self.lock = threading.Lock()        # guards self.running
        self.q_req = queue.Queue()          # pending request URLs
        self.q_ans = queue.Queue()          # finished (url, body) pairs
        self.__q_retry = queue.Queue()      # URLs that failed once
        self.threads_num = threads_num + 1  # +1 accounts for the retry thread
        self.__threads = []
        self.__closed = False
        # Retry thread: gives each failed URL exactly one more attempt.
        retry_thread = threading.Thread(target=self.threadretry, daemon=True)
        self.__threads.append(retry_thread)
        retry_thread.start()
        # Fetch workers.
        for _ in range(threads_num):
            worker = threading.Thread(target=self.threadget, daemon=True)
            self.__threads.append(worker)
            worker.start()
        self.running = 0  # number of requests currently being fetched

    def close(self):
        """Stop all worker threads.  Idempotent."""
        if self.__closed:
            return
        self.__closed = True
        # One sentinel per fetch worker, one for the retry thread.
        for _ in range(self.threads_num - 1):
            self.q_req.put(self._STOP)
        self.__q_retry.put(self._STOP)
        for t in self.__threads:
            t.join(timeout=1)

    def __del__(self):
        # Best-effort shutdown if the caller forgot to close().
        try:
            self.close()
        except Exception:
            pass  # interpreter may be tearing down; nothing useful to do

    def taskleft(self):
        """Number of requests not yet delivered via pop()."""
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        """Queue a URL for fetching."""
        self.q_req.put(req)

    def pop(self):
        """Block until a finished (url, body) pair is available."""
        return self.q_ans.get()

    def _fetch(self, req):
        """Fetch one URL; return its body, or '' on URLError (logged)."""
        try:
            return self.opener.open(req).read()
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server cannot fulfill the request.')
                print('Reason: ', e.code)
            return ''

    def threadretry(self):
        """Retry-thread main loop: one extra attempt per failed URL."""
        while True:
            req = self.__q_retry.get()
            if req is self._STOP:
                self.__q_retry.task_done()
                break
            with self.lock:  # keep running-counter updates atomic
                self.running += 1
            ans = self._fetch(req)
            if ans:
                self.q_ans.put((req, ans))
            # A second failure is dropped silently (same as original).
            with self.lock:
                self.running -= 1
            self.__q_retry.task_done()

    def threadget(self):
        """Fetch-worker main loop: fetch, or hand off to the retry queue."""
        while True:
            req = self.q_req.get()
            if req is self._STOP:
                self.q_req.task_done()
                break
            with self.lock:
                self.running += 1
            ans = self._fetch(req)
            if ans:
                self.q_ans.put((req, ans))
            else:
                self.__q_retry.put(req)
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
def create_dir(userid, domain='qiushibaike'):
    """Create the per-user storage directory ``<domain>/<userid>``.

    Idempotent: an existing directory is fine.  Uses makedirs() so the
    <domain> parent is created too — the original os.mkdir() failed with
    ENOENT because the creation of 'qiushibaike' in main() is commented out.
    Other OS errors (e.g. permissions) are printed, matching the original's
    best-effort behavior.
    """
    dir_name = domain + '/' + userid
    try:
        os.makedirs(dir_name, exist_ok=True)
    except OSError as e:
        print(str(e))
def userid_exist(userid):
    """Return True if a storage directory already exists for *userid*."""
    return os.path.isdir('qiushibaike/' + userid)
def get_file_name(userid):
    """Path of today's dump file: qiushibaike/<userid>/<YYYY-MM-DD>.txt."""
    today = time.strftime("%Y-%m-%d", time.localtime())
    return "/".join(("qiushibaike", userid, today)) + ".txt"
def write_file(file, soup):
    """Write every non-empty story from *soup* into *file* (binary mode).

    Each entry is the story link, CRLF, the story text, then a blank line.
    Returns the number of stories written.
    """
    written = 0
    for item in soup.find_all("div", class_="content clearfix"):
        story_text = item.a.text
        if not story_text:
            continue
        written += 1
        for piece in (item.a["href"], '\r\n', story_text, "\r\n\r\n"):
            file.write(bytes(piece, encoding="utf-8"))
    return written
def get_max_page(soup):
    """Return the highest page number among the rel="next" pagination
    links in *soup*, or 0 when there are none."""
    last_page = 0
    for link in soup.find_all('a', rel="next", class_=None):
        last_page = int(link.text)
    return last_page
def store_this_user(userid):
    """Crawl all story pages of *userid* and dump them into today's file.

    Skips users whose storage directory already exists.  Fix vs. the
    original: the output file is now closed via ``with`` and the Fetcher
    is released in ``finally`` — previously both leaked whenever decoding,
    parsing, or a network error raised between open() and close().
    """
    if userid_exist(userid):
        print("该用户貌似已经检索")
        return
    create_dir(userid)
    file_name = get_file_name(userid)
    ff = Fetcher(3)
    try:
        with open(file_name, 'wb') as file:
            # Fetch the user's first page to learn the name and page count.
            ff.push('http://www.qiushibaike.com/users/'+userid)
            req, ans = ff.pop()
            soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
            user_name = ""
            for ii in soup.find_all('span', class_="user_center"):
                user_name = ii.text
            if not user_name:
                # Page did not look like a user profile; nothing to store.
                return
            # Write page 1, then queue the remaining pages.
            count = write_file(file, soup)
            print(user_name+" "+str(count)+"条糗事 [http://www.qiushibaike.com/users/"+userid+"/articles/page/1]")
            max_page = get_max_page(soup)+1
            for i in range(2, max_page):
                ff.push("http://www.qiushibaike.com/users/"+userid+"/articles/page/"+str(i))
            while ff.taskleft():
                req, ans = ff.pop()
                soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
                count = write_file(file, soup)
                print(user_name+" "+str(count)+"条糗事 ["+req+"]")
    finally:
        del ff
def main():
    """Walk qiushibaike's text-post listing page by page, archiving every
    author encountered.

    Fix vs. the original: when no "下一页" (next page) link was found,
    ``next_link`` stayed empty and the bare host URL was pushed, so the
    crawler re-fetched the front page forever.  Pagination now stops
    cleanly when the last page is reached.
    """
    ff = Fetcher(3)
    ff.push('http://www.qiushibaike.com/textnew')
    while True:
        next_link = ""
        req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
        for ii in soup.find_all('a', class_="next", text="下一页"):
            next_link = ii["href"]
        if next_link:
            # Queue the next listing page before processing this one.
            ff.push("http://www.qiushibaike.com"+next_link)
        for ii in soup.find_all('div', class_="author"):
            # Href looks like /users/<id>/...; element 2 is the user id.
            store_this_user(ii.a["href"].split('/')[2])
        if not next_link:
            break
'''
file.close()
ff = Fetcher(10)
ff.push('http://www.qiushibaike.com/users/14870461')
req,ans = ff.pop()
print(ans.decode('utf8'))
#os.system("pause")
testgbk='汉字'
testunit=testgbk.encode('gbk') #--汉字解码
print(testunit)
testutf8=testgbk.encode('utf-8') #--转utf-8编码
print(testutf8)
testunit=testutf8.decode('utf-8') #--utf-8解码
print(testunit)
testgbk=testunit.encode('gbk') #--转gbk编码
print(testgbk)
'''
'''
links = ['http://item.jd.com/%d.html'%i for i in range(1746854,1746860)]
ff = Fetcher(10)
for url in links:
ff.push(url)
while ff.taskleft():
(url,content) = ff.pop()
print(url,len(content))
'''
'''
url = 'http://www.sina.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name' : 'Michael Foord',
'location' : 'pythontab',
'language' : 'Python' }
headers = { 'User-Agent' : user_agent }
data = urllib.parse.urlencode(values)
#req = urllib.request.Request(url, data, headers)
req = urllib.request.Request('http://www.baidu.com')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
'''
'''
##################################
def cbk(a, b, c):
#回调函数
#@a: 已经下载的数据块
#@b: 数据块的大小
#@c: 远程文件的大小
per = 100.0 * a * b / c
if per > 100:
per = 100
num = int(per)
print('[',end='')
for i in range(num):
print('#',end='')
print('%.2f]' %(per), end='')
url = 'http://www.sina.com.cn'
local = 'e:\\sina.html'
urllib.request.urlretrieve(url, local, cbk)
input()
os.system("pause")
##################################
'''
'''
try:
response = urllib.request.urlopen(req)
print('ffdfsdfsf')
except urllib.error.URLError as e:
if hasattr(e, 'reason'):
print('We failed to reach a server.')
print('Reason: ', e.reason)
elif hasattr(e, 'code'):
print('The server cannot fulfill the request.')
print('Reason: ', e.code)
else:
#print(response.info)
#print(response.getcode())
response_context = response.read()
print(response_context.decode("utf8"))
'''
# Script entry point: start crawling when run directly.
if __name__=="__main__":
    main()