python中threading和concurrent实现多线程

最新推荐文章于 2024-06-29 17:27:03 发布

G_scsd

最新推荐文章于 2024-06-29 17:27:03 发布

阅读量724

点赞数

分类专栏： python、python学习、爬虫　文章标签： python、thread、多线程

本文链接：https://blog.csdn.net/Gscsd_T/article/details/111083825

版权

python 同时被 3 个专栏收录

66 篇文章 6 订阅

订阅专栏

python学习

30 篇文章 0 订阅

订阅专栏

爬虫

18 篇文章 4 订阅

订阅专栏

一、threading

import requests
from lxml import etree
import threading


THREAD_NUM = 10  # number of threads started per batch (ten)


def request(url, timeout=10):
    """
    Fetch one result page and print the title and image URL of every row.

    :param url: URL of the page to request.
    :param timeout: seconds to wait for the server before giving up.
    :return: None. Each parsed row is printed as a dict with the keys
        'title' and 'url'; request failures and non-200 status codes are
        reported on stdout instead.
    """
    try:
        # Without a timeout a stalled server would block this worker forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure rather than letting the exception kill the thread.
        print('请求失败：' + str(exc))
        return
    if response.status_code != 200:
        print('错误响应码为：' + str(response.status_code))
        return

    html_xpath = etree.HTML(response.text)
    for row in html_xpath.xpath('//div[@class="row results-row"]/div'):
        titles = row.xpath('.//h4/a/text()')
        images = row.xpath('.//img/@src')
        if not titles or not images:
            # A row without a title or an image would raise IndexError on
            # [0]; skip it so the remaining rows are still printed.
            continue
        print({
            'title': titles[0],
            'url': images[0],
        })


def start_thread(works):
    """
    Fetch every URL in ``works`` with threads, THREAD_NUM URLs at a time.

    The URLs are processed in consecutive batches of THREAD_NUM. One thread
    is started per URL in the batch, and the batch is joined before the next
    one begins.

    :param works: list of URLs to fetch.
    :return: None
    """
    # Ceiling division: a trailing partial batch still counts as one round.
    batches = (len(works) + THREAD_NUM - 1) // THREAD_NUM
    for i in range(batches):
        print('循环第  {}   次， 共有   {}   次'.format(i, batches))
        # Slicing past the end of the list is safe, so the final short batch
        # needs no special case.
        work = works[i * THREAD_NUM:(i + 1) * THREAD_NUM]
        # Pass the callable itself plus its arguments. Writing
        # target=request(job) would run the request right here in the
        # calling thread and hand Thread a target of None.
        threads = [threading.Thread(target=request, args=(job,)) for job in work]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()


def main():
    """
    Entry point of the threading example.

    Builds the result-page URLs for start values 1 through 99 and hands the
    list to start_thread.

    :return: None
    """
    template = 'https://digital.ucd.ie/index.php?q=&start={}&rows=10'
    works = []
    for start in range(1, 100):
        works.append(template.format(start))
    start_thread(works)


# Run the threading example only when this file is executed as a script.
if __name__ == '__main__':
    main()

二、concurrent

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from lxml import etree


def request(url, timeout=10):
    """
    Fetch one result page and return its first complete result row.

    :param url: URL of the page to request.
    :param timeout: seconds to wait for the server before giving up.
    :return: dict with the keys 'title' and 'url' for the first row that has
        both a title and an image; None when no such row exists, when the
        request fails, or when the status code is not 200 (the latter two
        are also reported on stdout).
    """
    try:
        # Without a timeout a stalled server would block this worker forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure and return None, like the non-200 path, so one
        # bad URL does not raise out of future.result() in the caller.
        print('请求失败：' + str(exc))
        return None
    if response.status_code != 200:
        print('错误响应码为：' + str(response.status_code))
        return None

    html_xpath = etree.HTML(response.text)
    for row in html_xpath.xpath('//div[@class="row results-row"]/div'):
        titles = row.xpath('.//h4/a/text()')
        images = row.xpath('.//img/@src')
        if not titles or not images:
            # A row without a title or an image would raise IndexError on
            # [0]; move on to the next row instead.
            continue
        # NOTE(review): only the first row of the page is returned and the
        # remaining rows are ignored — confirm this is intended.
        return {
            'title': titles[0],
            'url': images[0],
        }
    return None


def main():
    """
    Entry point of the concurrent.futures example.

    Builds the result-page URLs for start values 1 through 99, submits one
    request task per URL to a pool of at most 10 threads, and prints each
    task's result as it completes.

    :return: None
    """
    url = 'https://digital.ucd.ie/index.php?q=&start={}&rows=10'
    works = [url.format(start) for start in range(1, 100)]

    # The with-block shuts the pool down when the body finishes, so the
    # worker threads are always released. At most 10 threads run at once.
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Approach 1: keep the futures in a list and iterate over them with
        # as_completed, which blocks until the next future has finished.
        jobs = [pool.submit(request, work) for work in works]  # asynchronous submit
        for future in as_completed(jobs):
            print(future.result())

        # Approach 2: attach a callback that prints the result.
        # for work in works:
        #     p = pool.submit(request, work)  # asynchronous submit
        #     p.add_done_callback(lambda x: print(x.result()))

        # Approach 3: pool.map replaces the explicit loop of submit calls.
        # data = pool.map(request, works)
        # for _ in data:
        #     print(_)
    

# Run the concurrent.futures example only when this file is executed as a script.
if __name__ == '__main__':
    main()

G_scsd

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
python中threading和concurrent实现多线程

一、threading import requests from lxml import etree import threading THREAD_NUM = 10 # 启动十个线程 def request(url): """ 发起请求 :param url: 需要请求的url :return: """ response = requests.get(url) if response.status_code == 200: ……
复制链接

扫一扫

专栏目录