#目前问题:爬一会就报“RuntimeError: can't start new thread”错误
#Python的语法有些不太适应,这两天从网上搬了些代码组了个小爬虫,把糗事百科的段子按用户ID分类写入到文件
import urllib.request
import urllib.parse
import time
import os
import threading
import queue
import bs4
from bs4 import BeautifulSoup
import shutil
import errno
# Default request headers for crawling. NOTE(review): this dict is never
# passed to the Fetcher below, which installs only its own User-Agent on
# the opener — presumably kept for manual urllib requests; confirm before
# removing.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept':'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding':'gzip',
'Connection':'close',
'Referer':None # if fetching still fails, this can be set to the target site's host
}
class Fetcher:
    """Small thread-pooled HTTP fetcher with a one-shot retry pass.

    Usage: push() request URLs, pop() (url, body) result tuples, and call
    close() when done (``__del__`` also calls it as a best-effort).

    Bug fixes vs. the original:
    - The worker/retry threads looped forever with no way to stop, so every
      Fetcher instance leaked its threads.  Because store_this_user() creates
      a fresh Fetcher per user, threads accumulated until the OS refused to
      create more ("RuntimeError: can't start new thread").  close() now
      shuts the pool down via sentinel values on the queues.
    - __del__ iterated ``range(self.threads_num + 1)`` over a list holding
      only ``threads_num`` threads, raising IndexError.
    - Deprecated ``setDaemon(True)`` replaced with the ``daemon`` kwarg.
    """

    _STOP = None  # sentinel placed on a queue to tell one worker to exit

    def __init__(self, threads_num):
        """Start ``threads_num`` fetch workers plus one retry worker."""
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
        self.lock = threading.Lock()        # guards self.running
        self.q_req = queue.Queue()          # pending request URLs
        self.q_ans = queue.Queue()          # finished (url, body) pairs
        self.__q_retry = queue.Queue()      # URLs that failed once
        self.threads_num = threads_num + 1  # +1 accounts for the retry thread
        self.__threads = []
        self.__closed = False
        # Retry thread: gives each failed URL exactly one more attempt.
        retry_thread = threading.Thread(target=self.threadretry, daemon=True)
        self.__threads.append(retry_thread)
        retry_thread.start()
        # Fetch workers.
        for _ in range(threads_num):
            worker = threading.Thread(target=self.threadget, daemon=True)
            self.__threads.append(worker)
            worker.start()
        self.running = 0  # number of requests currently being fetched

    def close(self):
        """Stop all worker threads.  Idempotent."""
        if self.__closed:
            return
        self.__closed = True
        # One sentinel per fetch worker, one for the retry thread.
        for _ in range(self.threads_num - 1):
            self.q_req.put(self._STOP)
        self.__q_retry.put(self._STOP)
        for t in self.__threads:
            t.join(timeout=1)

    def __del__(self):
        # Best-effort shutdown if the caller forgot to close().
        try:
            self.close()
        except Exception:
            pass  # interpreter may be tearing down; nothing useful to do

    def taskleft(self):
        """Number of requests not yet delivered via pop()."""
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        """Queue a URL for fetching."""
        self.q_req.put(req)

    def pop(self):
        """Block until a finished (url, body) pair is available."""
        return self.q_ans.get()

    def _fetch(self, req):
        """Fetch one URL; return its body, or '' on URLError (logged)."""
        try:
            return self.opener.open(req).read()
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server cannot fulfill the request.')
                print('Reason: ', e.code)
            return ''

    def threadretry(self):
        """Retry-thread main loop: one extra attempt per failed URL."""
        while True:
            req = self.__q_retry.get()
            if req is self._STOP:
                self.__q_retry.task_done()
                break
            with self.lock:  # keep running-counter updates atomic
                self.running += 1
            ans = self._fetch(req)
            if ans:
                self.q_ans.put((req, ans))
            # A second failure is dropped silently (same as original).
            with self.lock:
                self.running -= 1
            self.__q_retry.task_done()

    def threadget(self):
        """Fetch-worker main loop: fetch, or hand off to the retry queue."""
        while True:
            req = self.q_req.get()
            if req is self._STOP:
                self.q_req.task_done()
                break
            with self.lock:
                self.running += 1
            ans = self._fetch(req)
            if ans:
                self.q_ans.put((req, ans))
            else:
                self.__q_retry.put(req)
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
def create_dir(userid, domain='qiushibaike'):
    """Create the per-user storage directory ``<domain>/<userid>``.

    Idempotent: an existing directory is fine.  Uses makedirs() so the
    <domain> parent is created too — the original os.mkdir() failed with
    ENOENT because the creation of 'qiushibaike' in main() is commented out.
    Other OS errors (e.g. permissions) are printed, matching the original's
    best-effort behavior.
    """
    dir_name = domain + '/' + userid
    try:
        os.makedirs(dir_name, exist_ok=True)
    except OSError as e:
        print(str(e))
def userid_exist(userid):
    """Return True if a storage directory already exists for *userid*."""
    return os.path.isdir('qiushibaike/' + userid)
def get_file_name(userid):
    """Path of today's dump file: qiushibaike/<userid>/<YYYY-MM-DD>.txt."""
    today = time.strftime("%Y-%m-%d", time.localtime())
    return "/".join(("qiushibaike", userid, today)) + ".txt"
def write_file(file, soup):
    """Write every non-empty story from *soup* into *file* (binary mode).

    Each entry is the story link, CRLF, the story text, then a blank line.
    Returns the number of stories written.
    """
    written = 0
    for item in soup.find_all("div", class_="content clearfix"):
        story_text = item.a.text
        if not story_text:
            continue
        written += 1
        for piece in (item.a["href"], '\r\n', story_text, "\r\n\r\n"):
            file.write(bytes(piece, encoding="utf-8"))
    return written
def get_max_page(soup):
    """Return the highest page number among the rel="next" pagination
    links in *soup*, or 0 when there are none."""
    last_page = 0
    for link in soup.find_all('a', rel="next", class_=None):
        last_page = int(link.text)
    return last_page
def store_this_user(userid):
    """Crawl all story pages of *userid* and dump them into today's file.

    Skips users whose storage directory already exists.  Fix vs. the
    original: the output file is now closed via ``with`` and the Fetcher
    is released in ``finally`` — previously both leaked whenever decoding,
    parsing, or a network error raised between open() and close().
    """
    if userid_exist(userid):
        print("该用户貌似已经检索")
        return
    create_dir(userid)
    file_name = get_file_name(userid)
    ff = Fetcher(3)
    try:
        with open(file_name, 'wb') as file:
            # Fetch the user's first page to learn the name and page count.
            ff.push('http://www.qiushibaike.com/users/'+userid)
            req, ans = ff.pop()
            soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
            user_name = ""
            for ii in soup.find_all('span', class_="user_center"):
                user_name = ii.text
            if not user_name:
                # Page did not look like a user profile; nothing to store.
                return
            # Write page 1, then queue the remaining pages.
            count = write_file(file, soup)
            print(user_name+" "+str(count)+"条糗事 [http://www.qiushibaike.com/users/"+userid+"/articles/page/1]")
            max_page = get_max_page(soup)+1
            for i in range(2, max_page):
                ff.push("http://www.qiushibaike.com/users/"+userid+"/articles/page/"+str(i))
            while ff.taskleft():
                req, ans = ff.pop()
                soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
                count = write_file(file, soup)
                print(user_name+" "+str(count)+"条糗事 ["+req+"]")
    finally:
        del ff
def main():
    """Walk qiushibaike's text-post listing page by page, archiving every
    author encountered.

    Fix vs. the original: when no "下一页" (next page) link was found,
    ``next_link`` stayed empty and the bare host URL was pushed, so the
    crawler re-fetched the front page forever.  Pagination now stops
    cleanly when the last page is reached.
    """
    ff = Fetcher(3)
    ff.push('http://www.qiushibaike.com/textnew')
    while True:
        next_link = ""
        req, ans = ff.pop()
        soup = BeautifulSoup(ans.decode('utf-8'), "html.parser")
        for ii in soup.find_all('a', class_="next", text="下一页"):
            next_link = ii["href"]
        if next_link:
            # Queue the next listing page before processing this one.
            ff.push("http://www.qiushibaike.com"+next_link)
        for ii in soup.find_all('div', class_="author"):
            # Href looks like /users/<id>/...; element 2 is the user id.
            store_this_user(ii.a["href"].split('/')[2])
        if not next_link:
            break
'''
file.close()
ff = Fetcher(10)
ff.push('http://www.qiushibaike.com/users/14870461')
req,ans = ff.pop()
print(ans.decode('utf8'))
#os.system("pause")
testgbk='汉字'
testunit=testgbk.encode('gbk') #--汉字解码
print(testunit)
testutf8=testgbk.encode('utf-8') #--转utf-8编码
print(testutf8)
testunit=testutf8.decode('utf-8') #--utf-8解码
print(testunit)
testgbk=testunit.encode('gbk') #--转gbk编码
print(testgbk)
'''
'''
links = ['http://item.jd.com/%d.html'%i for i in range(1746854,1746860)]
ff = Fetcher(10)
for url in links:
ff.push(url)
while ff.taskleft():
(url,content) = ff.pop()
print(url,len(content))
'''
'''
url = 'http://www.sina.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name' : 'Michael Foord',
'location' : 'pythontab',
'language' : 'Python' }
headers = { 'User-Agent' : user_agent }
data = urllib.parse.urlencode(values)
#req = urllib.request.Request(url, data, headers)
req = urllib.request.Request('http://www.baidu.com')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
'''
'''
##################################
def cbk(a, b, c):
#回调函数
#@a: 已经下载的数据块
#@b: 数据块的大小
#@c: 远程文件的大小
per = 100.0 * a * b / c
if per > 100:
per = 100
num = int(per)
print('[',end='')
for i in range(num):
print('#',end='')
print('%.2f]' %(per), end='')
url = 'http://www.sina.com.cn'
local = 'e:\\sina.html'
urllib.request.urlretrieve(url, local, cbk)
input()
os.system("pause")
##################################
'''
'''
try:
response = urllib.request.urlopen(req)
print('ffdfsdfsf')
except urllib.error.URLError as e:
if hasattr(e, 'reason'):
print('We failed to reach a server.')
print('Reason: ', e.reason)
elif hasattr(e, 'code'):
print('The server cannot fulfill the request.')
print('Reason: ', e.code)
else:
#print(response.info)
#print(response.getcode())
response_context = response.read()
print(response_context.decode("utf8"))
'''
# Script entry point: start crawling when run directly.
if __name__=="__main__":
    main()