Python crawler: asynchronous scraping of Zhihu with a thread pool (an example)

Using a thread pool in a crawler is a good choice; here is a comparison example of a plain sequential version versus a thread-pool version.

This assumes you already know the basics of web scraping (easy to look up online); for the finer details, work them out yourself or send me a private message.

1. The conventional approach

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import os
import re
article_numbers = input('Enter the article number: ')
url = 'https://zhuanlan.zhihu.com/p/' + article_numbers
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
        # a User-Agent header is required, otherwise Zhihu rejects the request
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text  # return the page's HTML text
        return None
    except RequestException:
        return None

def get_image_url(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('figure')  # select the <figure> nodes that wrap the images
    return items

def save_image(item, i):
    try:
        os.chdir('D:\\Desktop\\math and Program\\发明tools')
        # change to the directory where images are saved
        pattern = re.compile('<noscript>.*?data-original="(.*?)".*?</noscript>', re.S)
        # regex that extracts the original image URL from the <noscript> block
        image_url = re.search(pattern, str(item))
        # item is a bs4 Tag, so it has to be converted to a string first
        if image_url is None:
            return
        r = requests.get(image_url.group(1))
        if r.status_code == 200:
            file_path = str(i) + '.jpg'
            with open(file_path, 'wb') as f:
                f.write(r.content)
    except requests.ConnectionError:
        print('Failed to save image')

def main():
    html = get_one_page(url)
    items = get_image_url(html)
    for i, item in enumerate(items, 1):
        save_image(item, i)
        print('Saved wallpaper', i)

if __name__ == "__main__":
    main()
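
For the timing comparison with the thread-pool version below, you can wrap this script's entry point with time.time(), just as the second script does. A minimal sketch, reusing the main and url defined above (only the timing lines are new):

import time

if __name__ == "__main__":
    t1 = time.time()
    main()  # images are downloaded one after another, so the times add up
    print('Sequential download finished in', time.time() - t1, 'seconds')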

   

2. Using a thread pool

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import os
import re
from multiprocessing.dummy import Pool
import time

url1 = 'https://zhuanlan.zhihu.com/p/133296596'

def init():
    path = r'D:\Desktop\project\python\content'  # raw string, so the backslashes are kept literally
    # os.mkdir(path)  # create the folder (only needed on the first run)
    os.chdir(path)  # switch to the download directory

def get_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
        response = requests.get(url, headers=headers)  # send the GET request to the url argument, not the global url1
        if response.status_code == 200:
            return response.text  # return the page's HTML text
        return None
    except RequestException:
        return None  # return None, not a string, so the caller's `if html:` check works

def split_list(lst):
    # split lst into 5 roughly equal chunks, one per worker thread
    n = len(lst)
    base_size = n // 5
    remainder = n % 5

    result = []
    start = 0
    for i in range(5):
        size = base_size
        if remainder > 0:
            size += 1
            remainder -= 1
        end = start + size
        result.append(lst[start:end])
        start = end

    return result

def page_analysis(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('figure')
    pattern = re.compile('<noscript>.*?data-original="(.*?)".*?</noscript>', re.S)
    # regex that extracts the original image URL from each <noscript> block
    urls = []
    for item in items:
        match = re.search(pattern, str(item))  # search this item, not the whole list
        if match:
            urls.append(match.group(1))  # keep the URL string itself
    return split_list(urls)
    
def download(urls, pool_name):
    for i, url in enumerate(urls, 1):
        r = requests.get(url)  # urls now holds plain URL strings
        if r.status_code == 200:
            file_path = pool_name + str(i) + '.jpg'
            with open(file_path, 'wb') as f:
                f.write(r.content)
            print('Pool', pool_name, 'saved image', i)

def main(url):
    init()
    html = get_page(url)
    if html:
        image_urls = page_analysis(html)
        if image_urls:
            pool_names = ['a', 'b', 'c', 'd', 'e']
            tasks = zip(image_urls, pool_names)
            pool = Pool(5)  # create a pool of 5 threads
            pool.starmap(download, tasks)  # run the five download tasks in parallel
            pool.close()  # close the pool
            pool.join()  # wait for all worker threads to finish
        else:
            print('Failed to extract image links')
    else:
        print('Failed to fetch the page')

if __name__ == "__main__":
    # url = input('Enter a URL: ')
    t1 = time.time()
    main(url1)
    cost_time = time.time() - t1
    print('Download complete, elapsed time:', cost_time)
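
The split_list chunking above assigns each thread a fixed fifth of the URLs, so one slow image can stall its whole chunk. The standard library's concurrent.futures can schedule each URL individually instead. Here is a minimal alternative sketch; download_one and main_executor are illustrative names that are not part of the original script, and it reuses init, get_page, and page_analysis as defined above:

from concurrent.futures import ThreadPoolExecutor

def download_one(args):
    # args is an (index, url) pair; each call downloads and saves one image
    i, url = args
    r = requests.get(url)
    if r.status_code == 200:
        with open(str(i) + '.jpg', 'wb') as f:
            f.write(r.content)
        print('Saved image', i)

def main_executor(url):
    init()
    html = get_page(url)
    if html:
        # flatten the five chunks back into one list of URL strings
        urls = [u for chunk in page_analysis(html) for u in chunk]
        with ThreadPoolExecutor(max_workers=5) as executor:
            # the executor hands each (index, url) pair to whichever thread is idle
            executor.map(download_one, enumerate(urls, 1))

Because the executor pulls the next URL as soon as a thread is free, the load balances itself and no manual split_list is needed.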
