day26-多进程多线程

day26

总结

  • 多线程.py

    """
    !./env python
    -*- coding: utf-8 -*-
    @Time:  2021/6/1 17:09
    @Author:    三玖天下第一
    @File: 多线程.py
    @Software: PyCharm
    """
    
    # 一个进程默认有一个线程,该线程叫主线程。其他线程都叫子线程(需要手动创建)
    # 如果一个Python程序需要子线程,就必须手动创建线程类Thread的对象
    
    import time
    import threading
    from threading import Thread
    from datetime import datetime
    from random import randint
    from mine_thread import MyThread
    
    # Lock held by my_print around every console operation it performs.
    print_lock = threading.Lock()


    def my_print(*args, out=True, **kwargs):
        """Print, or prompt for input, while holding ``print_lock``.

        With ``out`` true (the default) all positional and keyword arguments are
        forwarded to ``print``.  With ``out`` false they are ignored and the user
        is prompted instead; the lock stays held until the prompt is answered and
        the entered text is discarded.
        """
        print_lock.acquire()
        try:
            if not out:
                input('请输入数据:')
                return
            print(*args, **kwargs)
        finally:
            print_lock.release()
    
    def download(name):
        """Simulate downloading ``name``: log the start, sleep 3-7 s, log the end."""
        start_message = f'"{name}"开始下载:{datetime.now()}'
        my_print(start_message)
        seconds = randint(3, 7)
        time.sleep(seconds)
        end_message = f'"{name}"下载结束:{datetime.now()}'
        my_print(end_message)
    
    
    if __name__ == '__main__':
        # Alternative using the imported MyThread class (kept for reference):
        # new_thread = MyThread(download, '小薇', thread_name='子线程1')
        # new_thread.start()
        # new_thread.join()
        # Three worker threads, one simulated download each; none is started yet.
        t1 = Thread(target=download, args=('小薇',))
        t2 = Thread(target=download, args=('猪猪侠',))
        t3 = Thread(target=download, args=('你好,世界',))
        # ============ Variant A: main thread continues only after ALL downloads finish ===================
        # t1.start()
        # t2.start()
        # t3.start()
        # t1.join()
        # t2.join()
        # t3.join()
        # ============ Variant B: main thread continues once t1's download has finished ===================
        # t1.start()
        # t1.join()
        # t2.start()
        # t3.start()
        # t2.join()
        # t3.join()
        # ========== Variant C (active): a fourth thread waits for all downloads and reports =====================
        def wati_download():
            """Start the three download threads, join them all, then report completion."""
            t1.start()
            t2.start()
            t3.start()
            t1.join()
            t2.join()
            t3.join()
            my_print('下载完成...')
        # Waiting happens on t4, so the main thread is free to run the loop below.
        t4 = Thread(target=wati_download)
        t4.start()
        # Prompt for input forever (no exit condition).  my_print(out=False) holds
        # print_lock while the prompt is pending, so download messages are only
        # printed between prompts.
        while True:
            my_print(out=False)
            time.sleep(0.1)
        # Sequential version, for comparison:
        # download('小薇')
        # download('猪猪侠')
        # download('你好,世界')
    
    
    
  • 多进程.py

    """
    !./env python
    -*- coding: utf-8 -*-
    @Time:  2021/6/2 11:48
    @Author:    三玖天下第一
    @File: 多进程.py
    @Software: PyCharm
    """
    import time
    from datetime import datetime
    from multiprocessing import Process
    from random import randint
    from threading import Thread
    
    
    def download(name):
        """Simulate one download: announce the start, sleep 3-7 s, announce the end."""
        start_message = f'"{name}"开始下载:{datetime.now()}'
        print(start_message)
        seconds = randint(3, 7)
        time.sleep(seconds)
        end_message = f'"{name}"下载结束:{datetime.now()}'
        print(end_message)
    
    
    def wait(*args):
        """Start every given worker, wait for all of them, then print a marker line.

        Each argument only needs ``start()`` and ``join()`` methods.  All workers
        are started before the first ``join`` call, so they run concurrently.
        """
        # Two passes over the workers: first start them all, then join them all.
        for phase in ('start', 'join'):
            for worker in args:
                getattr(worker, phase)()
        print('哈哈哈')
    
    
    if __name__ == '__main__':
        # One process per title.  A helper thread starts and joins them so the
        # main thread stays free for the input loop below.
        p1, p2, p3 = (
            Process(target=download, args=(title,))
            for title in ('小薇', '触不可及', '很爱很爱你')
        )
        t1 = Thread(target=wait, args=(p1, p2, p3))
        t1.start()
        # Prompt forever; there is no exit condition.
        while True:
            time.sleep(0.1)
            input('请输入数据:')
    
  • 多进程中创建多线程.py

    """
    !./env python
    -*- coding: utf-8 -*-
    @Time:  2021/6/2 14:09
    @Author:    三玖天下第一
    @File: 多进程中创建多线程.py
    @Software: PyCharm
    """
    
    import random
    import time
    from multiprocessing import Process, current_process
    from threading import Thread, current_thread
    
    
    def download(name):
        """Simulate a 3-6 s download, tagging each message with process and thread."""

        def show_context():
            # Printed without a newline so the message that follows shares the line.
            print(f'当前进程{current_process()},当前线程{current_thread()}', end='')

        show_context()
        print(f'{name}:开始下载...')
        seconds = random.randint(3, 6)
        time.sleep(seconds)
        show_context()
        print(f"{name}:下载结束...")
    
    
    def load(*names):
        """Download every name concurrently, one thread per name, and wait for all.

        The original collected the threads in ``all_thread`` but never used the
        list; the threads are now joined, so ``load`` returns only after every
        download has finished.

        Args:
            *names: titles passed one by one to ``download``.
        """
        all_thread = [Thread(target=download, args=(name,)) for name in names]
        for t in all_thread:
            t.start()
        # Join only after all threads are running so the downloads overlap.
        for t in all_thread:
            t.join()
    
    
    if __name__ == '__main__':
        # 1. Run directly in the main process
        # download('小薇')

        # 2. Two threads inside the main process
        # t1 = Thread(target=download, args=('雄纠', ))
        # t2 = Thread(target=download, args=('阿甘正传', ))
        # t1.start()
        # t2.start()

        # 3. One process per download
        # p1 = Process(target=download, args=('阿甘正传',))
        # p2 = Process(target=download, args=('天堂',))
        # p3 = Process(target=download, args=('Python',))
        # p1.start()
        # p2.start()
        # p3.start()

        # 4. Three processes, each handing its three titles to load()
        batches = (
            ('阿甘正传', '肖申克的救赎', '喜羊羊与灰太狼'),
            ('天堂', '我的世界', '天下第一'),
            ('Python', 'Java', 'JavaScript'),
        )
        p1, p2, p3 = (Process(target=load, args=batch) for batch in batches)
        for proc in (p1, p2, p3):
            proc.start()
    
    
  • 进程通信

    """
    !./env python
    -*- coding: utf-8 -*-
    @Time:  2021/6/2 15:29
    @Author:    三玖天下第一
    @File: 进程通信.py
    @Software: PyCharm
    """
    
    import random
    import time
    from multiprocessing import Process, current_process, Queue
    from threading import current_thread
    
    
    def download(name, q: Queue):
        """Simulate a 3-6 s download of ``name``, then put ``name`` on ``q``."""

        def show_context():
            # Printed without a newline so the message that follows shares the line.
            print(f'当前进程{current_process()},当前线程{current_thread()}', end='')

        show_context()
        print(f'{name}:开始下载...')
        seconds = random.randint(3, 6)
        time.sleep(seconds)
        show_context()
        print(f"{name}:下载结束...")
        q.put(name)
    
    
    def get_data(q: Queue):
        """Print every item taken from ``q`` until the sentinel ``'end'`` arrives.

        The sentinel itself is consumed but not printed; items queued after it are
        left on the queue.
        """
        # iter(callable, sentinel) keeps calling q.get() until it returns 'end'.
        for item in iter(q.get, 'end'):
            print(item)
    
    
    if __name__ == '__main__':
        # Create the shared queue first (the original author notes that it must be
        # a global); every child process receives it as an argument.
        q = Queue(maxsize=20)
        p1, p2, p3 = (
            Process(target=download, args=(title, q))
            for title in ('小薇', '触不可及', '很爱很爱你')
        )
        p4 = Process(target=get_data, args=(q,))
        for proc in (p1, p2, p3, p4):
            proc.start()

        # Wait for the three producers only, then tell the consumer to stop.
        for producer in (p1, p2, p3):
            producer.join()

        q.put('end')
    
    
  • 线程通信

    """
    !./env python
    -*- coding: utf-8 -*-
    @Time:  2021/6/2 14:47
    @Author:    三玖天下第一
    @File: 线程间通信.py
    @Software: PyCharm
    """
    
    import random
    import time
    from multiprocessing import current_process
    from threading import Thread, current_thread
    
    all_datas = []
    
    
    def download(name):
        """Simulate a 3-6 s download, then record the current thread in ``all_datas``."""

        def show_context():
            # Printed without a newline so the message that follows shares the line.
            print(f'当前进程{current_process()},当前线程{current_thread()}', end='')

        show_context()
        print(f'{name}:开始下载...')
        seconds = random.randint(3, 6)
        time.sleep(seconds)
        show_context()
        print(f"{name}:下载结束...")
        # The module-level list is visible to every thread of this process.
        worker = current_thread()
        all_datas.append(worker)
    
    
    if __name__ == '__main__':
        # Create both download threads first, then start them.
        t1, t2 = (
            Thread(target=download, args=(title,))
            for title in ('雄纠', '阿甘正传')
        )
        for worker in (t1, t2):
            worker.start()
    
    # 多线程数据共享:同一个进程中多线程数据可以直接共享
    # (同一个进程中的全局变量在作用域范围内可以接收或者存储其他线程中的任何数据)。
    # 如果需要在一个线程中去获取其他多个线程中的数据,就定义一个全局的可变容器,比如列表,最好是线程安全的队列(queue.Queue)。
    
    
    

作业

  • 使用多进程和多线程爬取豆瓣图书
"""
@Time:  2021/6/1 9:34
@Author:    三玖天下第一
"""

import json
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue

import openpyxl
import requests

# Lock held by my_print around each print call.
print_lock = threading.Lock()


def my_print(*args, **kwargs):
    """Forward all arguments to ``print`` while holding ``print_lock``."""
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        print_lock.release()


def get_proxy_ips(ip_queue: Queue):
    """Poll the proxy provider every 3 seconds and queue each address it returns.

    The loop never ends, so this function does not return; ``add_get_page`` runs
    it in a daemon thread.

    Changes from the original:
      * the request has a timeout, and network errors are reported instead of
        killing the polling loop;
      * the reply is split with ``splitlines`` and blank entries are skipped, so
        the last address is no longer dropped when the reply has no trailing
        newline, and ``\\r`` from CRLF line endings is removed.

    Args:
        ip_queue: queue that receives one proxy address string per entry.
    """
    # NOTE(review): the fetch key is hard-coded in the URL; consider moving it to config.
    api = 'http://api.kuainiaoip.com/index.php?fetch_type=2021060217064947339&pool_id=&qty=5&time=101&province=%E5%9B%9B%E5%B7%9D%E7%9C%81&city=%E6%88%90%E9%83%BD%E5%B8%82&protocol=1&format=txt-normal&dt=1'
    while True:
        try:
            response = requests.get(api, timeout=10)
        except requests.RequestException as e:
            # A transient network failure must not stop the supply of proxies.
            print('获取代理失败!', e)
        else:
            text = response.text.strip()
            if response.status_code != 200:
                print('获取代理失败!')
            elif text == '10404:没有找到相关记录':
                print('提取频繁请按照规定频率提取')
            else:
                for line in text.splitlines():
                    ip = line.strip()
                    if ip:
                        ip_queue.put(ip)
        time.sleep(3)


def get_content2(q: Queue, ip_obj, url, header, data):
    """POST ``data`` to ``url`` through the current proxy and queue the JSON reply.

    On HTTP 200 the decoded JSON body is put on ``q``.  On any other status the
    shared proxy is rotated (if nobody rotated it already) and, for 403 only, the
    request is retried via ``get_content``.  If the request raises, the error is
    printed, the proxy is rotated and the request is retried via ``get_content``.

    Changes from the original:
      * the proxy mapping is built with ``proxies(ip)``; the former keys
        ``'http://'``/``'https://'`` are not recognised by Requests, so the proxy
        was never applied;
      * a 5-second timeout (same as ``get_content``) stops a dead proxy from
        blocking a worker thread indefinitely.

    Args:
        q: queue receiving the decoded JSON reply.
        ip_obj: object exposing ``ip``, ``is_update(old)`` and ``update(old)``.
        url: endpoint to POST to.
        header: request headers.
        data: JSON-serialisable request payload.
    """
    ip = ip_obj.ip
    try:
        res = requests.post(url, data=json.dumps(data), headers=header,
                            proxies=proxies(ip), timeout=5)
        if res.status_code == 200:
            q.put(res.json())
        else:
            if ip_obj.is_update(ip):
                ip_obj.update(ip)
            if res.status_code == 403:
                get_content(q, ip_obj, url, header, data)
    except Exception as e:
        # Deliberately broad: this runs inside a thread-pool task where an
        # unhandled exception would be silently lost, so report and retry.
        print(e)
        if ip_obj.is_update(ip):
            ip_obj.update(ip)
        get_content(q, ip_obj, url, header, data)


def proxies(ip):
    """Return a mapping that assigns ``ip`` to both the 'http' and 'https' keys."""
    return dict.fromkeys(('http', 'https'), ip)


def get_content(q: Queue, ip_obj, url, header, data):
    """POST ``data`` to ``url``, rotating proxies until a 200 reply is received.

    The decoded JSON body of the first 200 reply is put on ``q``.  After every
    other status the shared proxy is rotated and the request is repeated 0.1 s
    later.  Exceptions raised by the request (including timeouts) are not caught
    here and propagate to the caller, as in the original.

    Changes from the original:
      * retries run in a loop instead of by recursion, so a long run of failures
        can no longer hit the interpreter's recursion limit;
      * the proxy mapping is built with ``proxies(ip)``; the former keys
        ``'http://'``/``'https://'`` are not recognised by Requests, so the proxy
        was never applied.

    Args:
        q: queue receiving the decoded JSON reply.
        ip_obj: object exposing ``ip`` and ``update(old)``.
        url: endpoint to POST to.
        header: request headers.
        data: JSON-serialisable request payload.
    """
    payload = json.dumps(data)
    while True:
        # Re-read the shared proxy on every attempt; another thread may have rotated it.
        ip = ip_obj.ip
        res = requests.post(url, data=payload, headers=header,
                            proxies=proxies(ip), timeout=5)
        if res.status_code == 200:
            q.put(res.json())
            return
        ip_obj.update(ip)
        time.sleep(0.1)


def add_get_page(q: Queue):
    """Request listing pages 60000-64852 concurrently and queue each JSON reply on ``q``.

    Starts a daemon thread running ``get_proxy_ips`` to keep a local proxy queue
    filled, then submits one ``get_content2`` task per page to a thread pool of up
    to 256 workers and returns once every task has finished.

    Args:
        q: queue that the ``get_content2`` tasks put decoded replies on.
    """
    print('第一个子进程执行...')
    # Proxy addresses produced by the background thread started on the next line.
    proxy_ip = Queue()
    threading.Thread(target=get_proxy_ips, args=(proxy_ip,), daemon=True).start()
    thread_pool = ThreadPoolExecutor(max_workers=256)

    class IpObject:
        """Holder for the proxy currently in use, shared through class attributes."""

        # Re-entrant lock guarding replacement of ``ip``.
        lock = threading.RLock()
        # Evaluated once, when the class body runs; blocks until a first proxy is queued.
        ip = proxy_ip.get()

        @classmethod
        def update(cls, old):
            """Take the next queued proxy, but only if ``old`` is still the current one."""
            with cls.lock:
                # Skip the replacement when another caller already rotated the proxy.
                if old == cls.ip:
                    cls.ip = proxy_ip.get()
                    print('update', cls.ip)

        @classmethod
        def is_update(cls, old):
            """Return True when ``old`` is still the current proxy (checked without the lock)."""
            if old == cls.ip:
                return True
            return False

    # Request URL -- note: this is a POST endpoint; the query goes in the request body
    url = "https://read.douban.com/j/kind/"
    # Browser-like request headers
    # NOTE(review): the Cookie value is a hard-coded session captured from a browser.
    header = {"accept": "application/json",
              "Accept-Encoding": "gzip, deflate, br",
              "Accept-Language": "zh-CN,zh;q=0.8",
              "Connection": "keep-alive",
              "content-type": "application/json",
              "Host": "read.douban.com",
              "Cookie": "bid=jXNUTLsP_28; gr_user_id=e52067be-9219-484a-9f84-a1129fa1acbf; __utmz=30149280.1622524612.1.1.utmcsr=sogou.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __utma=30149280.2030887735.1622524612.1622524612.1622541364.2; _ga=GA1.3.2030887735.1622524612; _gid=GA1.3.231733992.1622705350; _pk_ses.100001.a7dd=*; _gat=1; _pk_id.100001.a7dd=f10116d5e1b94476.1622705350.1.1622705395.1622705350.",
              "Origin": "https://read.douban.com",
              "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
              "x-csrf-token": "null",
              "x-requested-with": "XMLHttpRequest"}
    # 64853
    for page in range(60000, 64853):
        # Only "page" changes between requests; the query text is identical each time.
        data = {"sort": "new", "page": page, "kind": 0,
                "query": "    query getFilterWorksList($works_ids: [ID!]) {      worksList(worksIds: $works_ids) {                title    cover    url    isBundle          url    title          author {      name      url    }    origAuthor {      name      url    }    translator {      name      url    }          abstract    editorHighlight          isOrigin    kinds {          name @skip(if: true)    shortName @include(if: true)    id      }    ... on WorksBase @include(if: true) {      wordCount      wordCountUnit    }    ... on WorksBase @include(if: false) {          isEssay        ... on EssayWorks {      favorCount    }          isNew        averageRating    ratingCount    url          }    ... on WorksBase @include(if: true) {      isColumn      isEssay      onSaleTime      ... on ColumnWorks {        updateTime      }    }    ... on WorksBase @include(if: true) {      isColumn      ... on ColumnWorks {        isFinished      }    }    ... on EssayWorks {      essayActivityData {            title    uri    tag {      name      color      background      icon2x      icon3x      iconSize {        height      }      iconPosition {        x y      }    }        }    }    highlightTags {      name    }      ... on WorksBase @include(if: false) {          fixedPrice    salesPrice    isRebate      }    ... on EbookWorks {          fixedPrice    salesPrice    isRebate      }    ... on WorksBase @include(if: true) {      ... on EbookWorks {        id        isPurchased        isInWishlist      }    }          id        isOrigin      }    }  ",
                "variables": {},
                "tags": []}
        thread_pool.submit(get_content2, q, IpObject, url, header, data)
        # get_content(q, IpObject, url, header, data)
    # Block until every submitted page task has completed.
    thread_pool.shutdown(wait=True)


def analysis_data(pending_data: Queue, data: Queue):
    """Convert raw pages taken from ``pending_data`` into row lists put on ``data``.

    Items are read until the string ``'end'`` arrives.  Every other item must be a
    mapping whose ``'list'`` entry holds book records; each page becomes one list
    of rows ``[title, img, url, author_name, author_url, abstract, kinds,
    wordCount, isFinished]`` and is put on ``data`` as a single item.
    """
    print('第二个子进程执行...')
    root = 'https://read.douban.com'

    def to_row(book):
        # Only the first author is kept; both author fields are '' when there is none.
        title, img = book['title'], book['cover']
        url = root + book['url']
        authors = book['author']
        if authors:
            first = authors[0]
            author_name, author_url = first['name'], root + first['url']
        else:
            author_name = author_url = ''
        abstract = book['abstract']
        kinds = '|'.join(kind['shortName'] for kind in book['kinds'])
        word_count = book['wordCount']
        # A record without 'isFinished' is stored as the string 'TRUE'.
        is_finished = book.get('isFinished', 'TRUE')
        return [title, img, url, author_name, author_url, abstract, kinds,
                word_count, is_finished]

    for content in iter(pending_data.get, 'end'):
        data.put([to_row(book) for book in content['list']])
    print('子进程二结束')


def save_data(data: Queue):
    """Append parsed rows from ``data`` to an Excel workbook until ``'end'`` arrives.

    Each queue item is a list of rows and every row is appended to the sheet.  The
    workbook is written to ``./files/scrawp3.xlsx`` roughly every 3 seconds while
    there are unsaved changes, and once more after the ``'end'`` sentinel.

    Changes from the original:
      * saving happens on this thread instead of a daemon thread, so the workbook
        is never serialised while rows are being appended to it;
      * the final save is explicit instead of relying on a 3-second sleep, so the
        last rows cannot be lost and the file cannot be cut off when the process
        exits;
      * the output directory is created if it does not exist.

    Args:
        data: queue of row lists, terminated by the string ``'end'``.
    """
    import os
    from queue import Empty

    print('第三个子进程执行...')
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = '豆瓣图书免费'
    sheet.append(['title', 'img', 'url', 'author_name', 'author_url', 'abstract', 'kinds', 'wordCount', 'isFinished'])
    file = r'./files/scrawp3.xlsx'
    os.makedirs(os.path.dirname(file), exist_ok=True)

    save_interval = 3  # seconds between periodic saves
    unsaved = True  # the header row has not been written to disk yet
    last_saved = time.monotonic()
    while True:
        try:
            # The timeout lets pending rows be flushed even when the queue goes quiet.
            content = data.get(timeout=save_interval)
        except Empty:
            content = None
        if content == 'end':
            break
        for ls in content or ():
            sheet.append(ls)
            unsaved = True
        if unsaved and time.monotonic() - last_saved >= save_interval:
            wb.save(file)
            unsaved = False
            last_saved = time.monotonic()
    # Final save so that everything received before 'end' is on disk.
    wb.save(file)
    print('子进程三结束')


if __name__ == '__main__':
    # Pipeline: p1 fetches pages -> pending_data -> p2 parses -> data -> p3 writes the workbook.
    pending_data, data = Queue(maxsize=4096), Queue(maxsize=4096)
    p1 = Process(target=add_get_page, args=(pending_data,))
    p2 = Process(target=analysis_data, args=(pending_data, data))
    p3 = Process(target=save_data, args=(data,))
    for stage in (p1, p2, p3):
        stage.start()

    # When fetching is done, send the 'end' sentinel (twice, as in the original) to the parser.
    p1.join()
    for _ in range(2):
        pending_data.put('end')
    print('网页爬取完成...')
    # When parsing is done, do the same for the writer.
    p2.join()
    for _ in range(2):
        data.put('end')
    print('数据解析完成...')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值