day26-多进程多线程

最新推荐文章于 2024-07-25 18:55:58 发布

??fengyu

最新推荐文章于 2024-07-25 18:55:58 发布

阅读量86

点赞数

文章标签： python

本文链接：https://blog.csdn.net/qq_46137199/article/details/117532027

版权

day26

总结

多线程.py

"""
!./env python
-*- coding: utf-8 -*-
@Time:  2021/6/1 17:09
@Author:    三玖天下第一
@File: 多线程.py
@Software: PyCharm
"""

# 一个进程默认有一个线程，该线程叫主线程；其他线程都叫子线程（需要手动创建）。
# 如果一个Python程序需要子线程，就需要手动创建线程类Thread的对象。

import time
import threading
from threading import Thread
from datetime import datetime
from random import randint
from mine_thread import MyThread

print_lock = threading.Lock()

def my_print(*args, out=True, **kwargs):
    """Give one thread at a time access to the console.

    While ``print_lock`` is held this either forwards ``args``/``kwargs`` to
    ``print`` (``out`` truthy) or prompts the user with ``input`` and
    discards whatever was typed (``out`` falsy).
    """
    print_lock.acquire()
    try:
        if not out:
            # Prompt mode: the entered text is intentionally thrown away.
            input('请输入数据:')
            return
        print(*args, **kwargs)
    finally:
        print_lock.release()

def download(name):
    """Simulate downloading ``name``.

    Logs a start line, sleeps for a random whole number of seconds in
    ``[3, 7]``, then logs an end line. Both lines go through ``my_print``.
    """
    start_message = f'"{name}"开始下载:{datetime.now()}'
    my_print(start_message)

    seconds = randint(3, 7)
    time.sleep(seconds)

    end_message = f'"{name}"下载结束:{datetime.now()}'
    my_print(end_message)


if __name__ == '__main__':
    # Alternative using the project-local MyThread wrapper:
    # new_thread = MyThread(download, '小薇', thread_name='子线程1')
    # new_thread.start()
    # new_thread.join()
    t1 = Thread(target=download, args=('小薇',))
    t2 = Thread(target=download, args=('猪猪侠',))
    t3 = Thread(target=download, args=('你好，世界',))
    downloads = (t1, t2, t3)

    # Variant A - the main thread continues only after ALL downloads finish:
    #     t1.start(); t2.start(); t3.start()
    #     t1.join();  t2.join();  t3.join()
    # Variant B - the main thread continues as soon as t1 alone has finished:
    #     t1.start(); t1.join()
    #     t2.start(); t3.start()
    #     t2.join();  t3.join()
    # Variant C (active) - a fourth thread waits for every download and
    # reports completion, leaving the main thread free.
    def wait_download():
        for worker in downloads:
            worker.start()
        for worker in downloads:
            worker.join()
        my_print('下载完成...')

    t4 = Thread(target=wait_download)
    t4.start()

    # The main thread keeps prompting for input forever.
    while True:
        my_print(out=False)
        time.sleep(0.1)
    # Sequential (single-threaded) equivalent, for comparison:
    # download('小薇')
    # download('猪猪侠')
    # download('你好，世界')

多进程.py

"""
!./env python
-*- coding: utf-8 -*-
@Time:  2021/6/2 11:48
@Author:    三玖天下第一
@File: 多进程.py
@Software: PyCharm
"""
import time
from datetime import datetime
from multiprocessing import Process
from random import randint
from threading import Thread


def download(name):
    """Simulate downloading ``name``.

    Prints a timestamped start line, sleeps for a random whole number of
    seconds in ``[3, 7]``, then prints a timestamped end line.
    """
    start_message = f'"{name}"开始下载:{datetime.now()}'
    print(start_message)

    seconds = randint(3, 7)
    time.sleep(seconds)

    end_message = f'"{name}"下载结束:{datetime.now()}'
    print(end_message)


def wait(*args):
    """Start every object in ``args``, then wait for each of them in turn.

    Each argument must expose ``start()`` and ``join()``. All objects are
    started before the first ``join`` so they run concurrently. A fixed
    message is printed once everything has been joined.
    """
    # Two passes over the same objects: first 'start' on all, then 'join'.
    for phase in ('start', 'join'):
        for worker in args:
            getattr(worker, phase)()
    print('哈哈哈')


if __name__ == '__main__':
    # One worker process per title; they are started and joined by `wait`,
    # which runs in a helper thread so the main thread stays responsive.
    titles = ('小薇', '触不可及', '很爱很爱你')
    p1, p2, p3 = (Process(target=download, args=(title,)) for title in titles)
    t1 = Thread(target=wait, args=(p1, p2, p3))
    t1.start()

    # The main thread keeps prompting for input forever.
    while True:
        time.sleep(0.1)
        input('请输入数据:')

多进程中创建多线程.py

"""
!./env python
-*- coding: utf-8 -*-
@Time:  2021/6/2 14:09
@Author:    三玖天下第一
@File: 多进程中创建多线程.py
@Software: PyCharm
"""

import random
import time
from multiprocessing import Process, current_process
from threading import Thread, current_thread


def download(name):
    """Simulate downloading ``name`` and report where the work is running.

    Before and after a random 3-6 second sleep, prints the current process
    and current thread followed by a start / end message for ``name``.
    """
    def report(message):
        # Prefix and message are two separate print calls; the prefix has
        # no trailing newline so both end up on one line.
        print(f'当前进程{current_process()}，当前线程{current_thread()}', end='')
        print(message)

    report(f'{name}:开始下载...')
    time.sleep(random.randint(3, 6))
    report(f"{name}:下载结束...")


def load(*names):
    """Download every title in ``names`` concurrently, one thread per title.

    All threads are started first and then joined, so this function returns
    only after every download has finished. The original collected the
    threads but never joined them. When ``load`` is the target of a
    ``multiprocessing.Process`` on Python < 3.7, the child process could
    then exit without waiting for its threads. Joining explicitly makes the
    behaviour independent of the interpreter version.

    :param names: titles to hand to ``download``; may be empty.
    """
    all_thread = [Thread(target=download, args=(name,)) for name in names]
    for t in all_thread:
        t.start()
    for t in all_thread:
        t.join()


if __name__ == '__main__':
    # 1. Run directly in the main process / main thread:
    # download('小薇')

    # 2. Two threads inside the main process:
    # t1 = Thread(target=download, args=('雄纠', ))
    # t2 = Thread(target=download, args=('阿甘正传', ))
    # t1.start()
    # t2.start()

    # 3. Three processes, one download each:
    # p1 = Process(target=download, args=('阿甘正传',))
    # p2 = Process(target=download, args=('天堂',))
    # p3 = Process(target=download, args=('Python',))
    # p1.start()
    # p2.start()
    # p3.start()

    # 4. (active) Three processes, each spawning one thread per title.
    batches = (
        ('阿甘正传', '肖申克的救赎', '喜羊羊与灰太狼'),
        ('天堂', '我的世界', '天下第一'),
        ('Python', 'Java', 'JavaScript'),
    )
    p1, p2, p3 = (Process(target=load, args=batch) for batch in batches)
    for proc in (p1, p2, p3):
        proc.start()

进程通信

"""
!./env python
-*- coding: utf-8 -*-
@Time:  2021/6/2 15:29
@Author:    三玖天下第一
@File: 进程通信.py
@Software: PyCharm
"""

import random
import time
from multiprocessing import Process, current_process, Queue
from threading import current_thread


def download(name, q: Queue):
    """Simulate downloading ``name`` and publish the result on ``q``.

    Prints the current process/thread with a start message, sleeps for a
    random 3-6 seconds, prints the same prefix with an end message, and
    finally puts ``name`` on the queue.
    """
    def report(message):
        # The prefix is printed without a newline so both parts share a line.
        print(f'当前进程{current_process()}，当前线程{current_thread()}', end='')
        print(message)

    report(f'{name}:开始下载...')
    time.sleep(random.randint(3, 6))
    report(f"{name}:下载结束...")
    q.put(name)


def get_data(q: Queue):
    """Print every item taken from ``q`` until the ``'end'`` sentinel arrives.

    The sentinel itself is consumed but not printed. Items queued after it
    are left untouched.
    """
    # iter(callable, sentinel) keeps calling q.get() and stops on 'end'.
    for item in iter(q.get, 'end'):
        print(item)


if __name__ == '__main__':
    # Create the (initially empty) queue here; the original author notes it
    # must be global. It is handed to every child process as an argument.
    q = Queue(maxsize=20)
    titles = ('小薇', '触不可及', '很爱很爱你')
    p1, p2, p3 = (Process(target=download, args=(title, q)) for title in titles)
    p4 = Process(target=get_data, args=(q,))

    for proc in (p1, p2, p3, p4):
        proc.start()

    # Wait for the producers only, then tell the consumer to stop.
    for producer in (p1, p2, p3):
        producer.join()

    q.put('end')

线程通信

"""
!./env python
-*- coding: utf-8 -*-
@Time:  2021/6/2 14:47
@Author:    三玖天下第一
@File: 线程间通信.py
@Software: PyCharm
"""

import random
import time
from multiprocessing import current_process
from threading import Thread, current_thread

all_datas = []


def download(name):
    """Simulate downloading ``name`` and record the worker thread.

    Prints the current process/thread with a start message, sleeps for a
    random 3-6 seconds, prints the same prefix with an end message, and then
    appends the current thread object to the module-level ``all_datas``.
    """
    def report(message):
        # The prefix is printed without a newline so both parts share a line.
        print(f'当前进程{current_process()}，当前线程{current_thread()}', end='')
        print(message)

    report(f'{name}:开始下载...')
    time.sleep(random.randint(3, 6))
    report(f"{name}:下载结束...")
    all_datas.append(current_thread())


if __name__ == '__main__':
    # Two worker threads, created in order and then started in order.
    t1, t2 = (Thread(target=download, args=(title,)) for title in ('雄纠', '阿甘正传'))
    for worker in (t1, t2):
        worker.start()

# 多线程数据共享：同一个进程中的多个线程可以直接共享数据
# （同一个进程中的全局变量在作用域范围内可以接收或者存储其他线程中的任何数据）。
# 如果需要在一个线程中获取其他多个线程中的数据，就定义一个全局的可变容器，比如列表，最好是线程队列（queue.Queue）。

作业

使用多进程和多线程爬取豆瓣图书

"""
@Time:  2021/6/1 9:34
@Author:    三玖天下第一
"""

import json
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue

import openpyxl
import requests

print_lock = threading.Lock()


def my_print(*args, **kwargs):
    """Forward all arguments to ``print`` while holding ``print_lock``."""
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        print_lock.release()


def get_proxy_ips(ip_queue: Queue):
    """Poll the proxy provider forever and feed each address into ``ip_queue``.

    Every 3 seconds the provider's plain-text API is requested. On success
    each non-empty line of the response body is put on ``ip_queue``. This
    function never returns and is meant to run in a background thread.

    Changes from the original:

    * The request has a timeout, and network errors are caught and reported.
      Previously one failed request raised out of the loop and ended the
      thread, leaving consumers blocked on an empty queue.
    * The body is split with ``splitlines()`` and blank lines are skipped.
      ``split('\\n')[:-1]`` dropped the last address when the body had no
      trailing newline and left a ``'\\r'`` on addresses for CRLF bodies.

    :param ip_queue: queue receiving proxy addresses as strings.
    """
    # The endpoint never changes, so build it once outside the loop.
    api = 'http://api.kuainiaoip.com/index.php?fetch_type=2021060217064947339&pool_id=&qty=5&time=101&province=%E5%9B%9B%E5%B7%9D%E7%9C%81&city=%E6%88%90%E9%83%BD%E5%B8%82&protocol=1&format=txt-normal&dt=1'
    while True:
        try:
            response = requests.get(api, timeout=10)
        except requests.RequestException as exc:
            # Keep polling: a transient network error must not kill the feeder.
            print('获取代理失败！', exc)
        else:
            if response.status_code != 200:
                print('获取代理失败！')
            elif response.text.strip() == '10404:没有找到相关记录':
                # The provider answers with this body when polled too often.
                print('提取频繁请按照规定频率提取')
            else:
                for line in response.text.splitlines():
                    ip = line.strip()
                    if ip:
                        ip_queue.put(ip)
        time.sleep(3)


def get_content2(q: Queue, ip_obj, url, header, data, max_retries=10):
    """POST ``data`` to ``url`` through the current proxy and queue the JSON reply.

    On HTTP 200 the decoded JSON body is put on ``q``. On a 403 response or
    a request/decoding error the proxy is rotated and the request is
    retried, at most ``max_retries`` more times. Any other status code
    rotates the proxy and gives up on this page, as the original did.

    Changes from the original:

    * The proxy mapping uses the keys ``'http'`` / ``'https'`` (via
      ``proxies``). requests never matches the previous keys ``'http://'`` /
      ``'https://'``, so the proxy was silently not used.
    * A request timeout is set, consistent with ``get_content``.
    * Retries are a bounded loop instead of unbounded recursion through
      ``get_content``, and only request/JSON errors are caught rather than
      every ``Exception``.

    :param q: queue receiving the decoded JSON response.
    :param ip_obj: proxy holder exposing ``ip``, ``is_update(old)`` and
        ``update(old)``.
    :param url: endpoint to POST to.
    :param header: request headers.
    :param data: JSON-serialisable request payload.
    :param max_retries: number of additional attempts after the first one.
    """
    payload = json.dumps(data)
    for _ in range(max_retries + 1):
        ip = ip_obj.ip
        try:
            res = requests.post(url, data=payload, headers=header, proxies=proxies(ip), timeout=5)
            if res.status_code == 200:
                q.put(res.json())
                return
            status = res.status_code
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(e)
            status = None
        # Rotate only if nobody else has already replaced this proxy.
        if ip_obj.is_update(ip):
            ip_obj.update(ip)
        if status is not None and status != 403:
            # Non-403 HTTP errors are not retried.
            return
    print('请求失败，已放弃:', url)


def proxies(ip):
    """Return a proxy mapping that routes both HTTP and HTTPS through ``ip``."""
    return dict.fromkeys(('http', 'https'), ip)


def get_content(q: Queue, ip_obj, url, header, data):
    """POST ``data`` to ``url`` through the current proxy until it succeeds.

    Each attempt uses the proxy currently held by ``ip_obj``. On HTTP 200
    the decoded JSON body is put on ``q`` and the function returns. On any
    other status the proxy is rotated, the function sleeps 0.1 s and tries
    again. Exceptions raised by the request propagate to the caller, as in
    the original.

    Changes from the original:

    * The proxy mapping uses the keys ``'http'`` / ``'https'`` (via
      ``proxies``). requests never matches the previous keys ``'http://'`` /
      ``'https://'``, so the proxy was silently not used.
    * Retrying is a loop instead of self-recursion, so a long run of
      failures can no longer hit the recursion limit.

    :param q: queue receiving the decoded JSON response.
    :param ip_obj: proxy holder exposing ``ip`` and ``update(old)``.
    :param url: endpoint to POST to.
    :param header: request headers.
    :param data: JSON-serialisable request payload.
    """
    payload = json.dumps(data)
    while True:
        ip = ip_obj.ip
        res = requests.post(url, data=payload, headers=header, proxies=proxies(ip), timeout=5)
        # res = requests.post(url, data=payload, headers=header)  # direct, no proxy
        if res.status_code == 200:
            q.put(res.json())
            return
        ip_obj.update(ip)
        time.sleep(0.1)


def add_get_page(q: Queue):
    """Fetch Douban Read listing pages concurrently and queue the responses.

    Starts a daemon thread running ``get_proxy_ips`` to keep a local proxy
    queue filled. It then submits one ``get_content2`` task per page number
    in ``range(60000, 64853)`` to a thread pool and blocks until all of
    them have completed.

    :param q: queue handed to every ``get_content2`` task as its output queue.
    """
    print('第一个子进程执行...')
    proxy_ip = Queue()
    threading.Thread(target=get_proxy_ips, args=(proxy_ip,), daemon=True).start()
    thread_pool = ThreadPoolExecutor(max_workers=256)

    class IpObject:
        """Class-level holder of the proxy currently shared by all workers."""
        lock = threading.RLock()
        # Evaluated once, while the class body executes: blocks here until
        # the feeder thread has delivered the first proxy address.
        ip = proxy_ip.get()

        @classmethod
        def update(cls, old):
            """Replace the shared proxy, but only if ``old`` is still the current one."""
            with cls.lock:
                # Re-check under the lock so a stale caller does not consume
                # another address after someone else already rotated.
                if old == cls.ip:
                    cls.ip = proxy_ip.get()
                    print('update', cls.ip)

        @classmethod
        def is_update(cls, old):
            """Return True when ``old`` is still the current proxy (checked without the lock)."""
            if old == cls.ip:
                return True
            return False

    # Target URL; the listing is fetched with a POST request whose body carries the query.
    url = "https://read.douban.com/j/kind/"
    # Browser-like request headers.
    # NOTE(review): the Cookie value below is a hard-coded session captured from a browser.
    header = {"accept": "application/json",
              "Accept-Encoding": "gzip, deflate, br",
              "Accept-Language": "zh-CN,zh;q=0.8",
              "Connection": "keep-alive",
              "content-type": "application/json",
              "Host": "read.douban.com",
              "Cookie": "bid=jXNUTLsP_28; gr_user_id=e52067be-9219-484a-9f84-a1129fa1acbf; __utmz=30149280.1622524612.1.1.utmcsr=sogou.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __utma=30149280.2030887735.1622524612.1622524612.1622541364.2; _ga=GA1.3.2030887735.1622524612; _gid=GA1.3.231733992.1622705350; _pk_ses.100001.a7dd=*; _gat=1; _pk_id.100001.a7dd=f10116d5e1b94476.1622705350.1.1622705395.1622705350.",
              "Origin": "https://read.douban.com",
              "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
              "x-csrf-token": "null",
              "x-requested-with": "XMLHttpRequest"}
    # 64853 -- presumably the total page count at the time of writing; TODO confirm
    for page in range(60000, 64853):
        # A fresh payload per page; only "page" varies between requests.
        data = {"sort": "new", "page": page, "kind": 0,
                "query": "    query getFilterWorksList($works_ids: [ID!]) {      worksList(worksIds: $works_ids) {                title    cover    url    isBundle          url    title          author {      name      url    }    origAuthor {      name      url    }    translator {      name      url    }          abstract    editorHighlight          isOrigin    kinds {          name @skip(if: true)    shortName @include(if: true)    id      }    ... on WorksBase @include(if: true) {      wordCount      wordCountUnit    }    ... on WorksBase @include(if: false) {          isEssay        ... on EssayWorks {      favorCount    }          isNew        averageRating    ratingCount    url          }    ... on WorksBase @include(if: true) {      isColumn      isEssay      onSaleTime      ... on ColumnWorks {        updateTime      }    }    ... on WorksBase @include(if: true) {      isColumn      ... on ColumnWorks {        isFinished      }    }    ... on EssayWorks {      essayActivityData {            title    uri    tag {      name      color      background      icon2x      icon3x      iconSize {        height      }      iconPosition {        x y      }    }        }    }    highlightTags {      name    }      ... on WorksBase @include(if: false) {          fixedPrice    salesPrice    isRebate      }    ... on EbookWorks {          fixedPrice    salesPrice    isRebate      }    ... on WorksBase @include(if: true) {      ... on EbookWorks {        id        isPurchased        isInWishlist      }    }          id        isOrigin      }    }  ",
                "variables": {},
                "tags": []}
        thread_pool.submit(get_content2, q, IpObject, url, header, data)
        # Synchronous alternative:
        # get_content(q, IpObject, url, header, data)
    # Block until every submitted page task has finished.
    thread_pool.shutdown(wait=True)


def analysis_data(pending_data: Queue, data: Queue):
    """Turn raw listing responses into spreadsheet rows.

    Reads response dicts from ``pending_data`` until the ``'end'`` sentinel
    arrives. For each response, every entry of its ``'list'`` becomes one
    row ``[title, img, url, author_name, author_url, abstract, kinds,
    wordCount, isFinished]``. The rows for one response are put on ``data``
    as a single list.
    """
    print('第二个子进程执行...')
    root = 'https://read.douban.com'

    def to_row(book):
        # Keys are read in a fixed order: title, cover, url, author,
        # abstract, kinds, wordCount, then the optional isFinished.
        title, cover = book['title'], book['cover']
        link = root + book['url']
        authors = book['author']
        if authors:
            # Only the first listed author is kept.
            first = authors[0]
            author_name, author_url = first['name'], root + first['url']
        else:
            author_name = author_url = ''
        abstract = book['abstract']
        kinds = '|'.join(kind['shortName'] for kind in book['kinds'])
        word_count = book['wordCount']
        finished = book.get('isFinished', 'TRUE')
        return [title, cover, link, author_name, author_url, abstract, kinds, word_count, finished]

    # iter(callable, sentinel) keeps calling get() and stops on 'end'.
    for content in iter(pending_data.get, 'end'):
        data.put([to_row(book) for book in content['list']])
    print('子进程二结束')


def save_data(data: Queue):
    """Collect parsed rows from ``data`` and persist them to an XLSX workbook.

    Rows are appended to a single sheet as they arrive. A background thread
    saves the workbook every 3 seconds so that progress survives a crash.
    Once the ``'end'`` sentinel is received the background saver is stopped
    and the workbook is saved one final time.

    Changes from the original:

    * The final state is written explicitly. The original slept 3 seconds
      and hoped the daemon saver had run, which could lose the last rows or
      kill the saver in the middle of writing the file.
    * Appending rows and saving are serialised with a lock, so the workbook
      is never saved while another thread is modifying the sheet.
    * The output directory is created if it does not exist.

    :param data: queue yielding lists of rows, terminated by ``'end'``.
    """
    import os  # local import: only needed to create the output directory

    print('第三个子进程执行...')
    # An earlier revision wrote the same columns to ./files/scrawp.csv.
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = '豆瓣图书免费'
    sheet.append(['title', 'img', 'url', 'author_name', 'author_url', 'abstract', 'kinds', 'wordCount', 'isFinished'])
    file = r'./files/scrawp3.xlsx'
    os.makedirs(os.path.dirname(file), exist_ok=True)

    wb_lock = threading.Lock()   # guards every access to the workbook
    stop = threading.Event()     # tells the periodic saver to finish

    def save(wb, file):
        # Event.wait returns False on timeout (do a periodic save) and
        # True once `stop` is set (leave the loop).
        while not stop.wait(3):
            with wb_lock:
                wb.save(file)

    t = threading.Thread(target=save, args=(wb, file), daemon=True)
    t.start()
    while True:
        content = data.get()
        if content == 'end':
            break
        with wb_lock:
            for ls in content:
                sheet.append(ls)

    # Stop the periodic saver before the final write so the two cannot overlap.
    stop.set()
    t.join()
    wb.save(file)
    print('子进程三结束')


if __name__ == '__main__':
    # Three-stage pipeline connected by two bounded queues:
    #   p1 fetches pages -> pending_data -> p2 parses -> data -> p3 saves.
    pending_data = Queue(maxsize=4096)
    data = Queue(maxsize=4096)
    p1 = Process(target=add_get_page, args=(pending_data,))
    p2 = Process(target=analysis_data, args=(pending_data, data))
    p3 = Process(target=save_data, args=(data,))
    for stage in (p1, p2, p3):
        stage.start()

    # Once fetching is done, signal the parser (the sentinel is sent twice).
    p1.join()
    for _ in range(2):
        pending_data.put('end')
    print('网页爬取完成...')

    # Once parsing is done, signal the saver (the sentinel is sent twice).
    p2.join()
    for _ in range(2):
        data.put('end')
    print('数据解析完成...')

??fengyu

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
day26-多进程多线程

day26总结多线程.py"""!./env python-*- coding: utf-8 -*-@Time: 2021/6/1 17:09@Author: 三玖天下第一@File: 多线程.py@Software: PyCharm"""# 一个进程默认有一个线程，该线程叫主线程。其他线程都叫子线程（需要手动创建）# 如果一个Python程序需要子线程需要手动创建子线程类Thread对象import timeimport threadingfrom threa
复制链接

扫一扫