day26
总结
-
多线程.py
""" !./env python -*- coding: utf-8 -*- @Time: 2021/6/1 17:09 @Author: 三玖天下第一 @File: 多线程.py @Software: PyCharm """ # 一个进程默认有一个线程,该线程叫主线程。其他线程都叫子线程(需要手动创建) # 如果一个Python程序需要子线程需要手动创建子线程类Thread对象 import time import threading from threading import Thread from datetime import datetime from random import randint from mine_thread import MyThread print_lock = threading.Lock() def my_print(*args, out=True, **kwargs): with print_lock: if out: print(*args, **kwargs) else: input('请输入数据:') def download(name): my_print(f'"{name}"开始下载:{datetime.now()}') time.sleep(randint(3, 7)) my_print(f'"{name}"下载结束:{datetime.now()}') if __name__ == '__main__': # new_thread = MyThread(download, '小薇', thread_name='子线程1') # new_thread.start() # new_thread.join() t1 = Thread(target=download, args=('小薇',)) t2 = Thread(target=download, args=('猪猪侠',)) t3 = Thread(target=download, args=('你好,世界',)) # ============电影下载完了再执行主线程=================== # t1.start() # t2.start() # t3.start() # t1.join() # t2.join() # t3.join() # ============t1电影下载完了再执行主线程=================== # t1.start() # t1.join() # t2.start() # t3.start() # t2.join() # t3.join() # ==========t4等待电影全部下载完了提示===================== def wati_download(): t1.start() t2.start() t3.start() t1.join() t2.join() t3.join() my_print('下载完成...') t4 = Thread(target=wati_download) t4.start() while True: my_print(out=False) time.sleep(0.1) # download('小薇') # download('猪猪侠') # download('你好,世界')
-
多进程.py
""" !./env python -*- coding: utf-8 -*- @Time: 2021/6/2 11:48 @Author: 三玖天下第一 @File: 多进程.py @Software: PyCharm """ import time from datetime import datetime from multiprocessing import Process from random import randint from threading import Thread def download(name): print(f'"{name}"开始下载:{datetime.now()}') time.sleep(randint(3, 7)) print(f'"{name}"下载结束:{datetime.now()}') def wait(*args): for p in args: p.start() for p in args: p.join() print('哈哈哈') if __name__ == '__main__': p1 = Process(target=download, args=('小薇',)) p2 = Process(target=download, args=('触不可及',)) p3 = Process(target=download, args=('很爱很爱你',)) t1 = Thread(target=wait, args=(p1, p2, p3)) t1.start() while True: time.sleep(0.1) input('请输入数据:')
-
多进程中创建多线程.py
""" !./env python -*- coding: utf-8 -*- @Time: 2021/6/2 14:09 @Author: 三玖天下第一 @File: 多进程中创建多线程.py @Software: PyCharm """ import random import time from multiprocessing import Process, current_process from threading import Thread, current_thread def download(name): print(f'当前进程{current_process()},当前线程{current_thread()}', end='') print(f'{name}:开始下载...') time.sleep(random.randint(3, 6)) print(f'当前进程{current_process()},当前线程{current_thread()}', end='') print(f"{name}:下载结束...") def load(*names): all_thread = [] for name in names: t = Thread(target=download, args=(name,)) t.start() all_thread.append(t) if __name__ == '__main__': # 1.在主进程中执行 # download('小薇') # 2. # t1 = Thread(target=download, args=('雄纠', )) # t2 = Thread(target=download, args=('阿甘正传', )) # t1.start() # t2.start() # 3. # p1 = Process(target=download, args=('阿甘正传',)) # p2 = Process(target=download, args=('天堂',)) # p3 = Process(target=download, args=('Python',)) # p1.start() # p2.start() # p3.start() # 4 p1 = Process(target=load, args=('阿甘正传', '肖申克的救赎', '喜羊羊与灰太狼')) p2 = Process(target=load, args=('天堂', '我的世界', '天下第一')) p3 = Process(target=load, args=('Python', 'Java', 'JavaScript')) p1.start() p2.start() p3.start()
-
进程通信.py
""" !./env python -*- coding: utf-8 -*- @Time: 2021/6/2 15:29 @Author: 三玖天下第一 @File: 进程通信.py @Software: PyCharm """ import random import time from multiprocessing import Process, current_process, Queue from threading import current_thread def download(name, q: Queue): print(f'当前进程{current_process()},当前线程{current_thread()}', end='') print(f'{name}:开始下载...') time.sleep(random.randint(3, 6)) print(f'当前进程{current_process()},当前线程{current_thread()}', end='') print(f"{name}:下载结束...") q.put(name) def get_data(q: Queue): while True: result = q.get() if result == 'end': break print(result) if __name__ == '__main__': # 创建空的队列(必须是全局的) q = Queue(maxsize=20) p1 = Process(target=download, args=('小薇', q)) p2 = Process(target=download, args=('触不可及', q)) p3 = Process(target=download, args=('很爱很爱你', q)) p4 = Process(target=get_data, args=(q,)) p1.start() p2.start() p3.start() p4.start() p1.join() p2.join() p3.join() q.put('end')
-
线程间通信.py
""" !./env python -*- coding: utf-8 -*- @Time: 2021/6/2 14:47 @Author: 三玖天下第一 @File: 线程间通信.py @Software: PyCharm """ import random import time from multiprocessing import current_process from threading import Thread, current_thread all_datas = [] def download(name): print(f'当前进程{current_process()},当前线程{current_thread()}', end='') print(f'{name}:开始下载...') time.sleep(random.randint(3, 6)) print(f'当前进程{current_process()},当前线程{current_thread()}', end='') print(f"{name}:下载结束...") all_datas.append(current_thread()) if __name__ == '__main__': t1 = Thread(target=download, args=('雄纠',)) t2 = Thread(target=download, args=('阿甘正传',)) t1.start() t2.start() # 多线程数据共享:同一个进程中多线程数据可以直接共享 # (同一个进程中的全局变量在作用域范围内可以接受或者存储其他线程中的任何数据 # 如果需要在一个线程中去获取其他多个线程中的数据,就定义一个全局的可变容器,比如列表,最好是线程的队列
作业
- 使用多进程和多线程爬取豆瓣图书
"""
@Time: 2021/6/1 9:34
@Author: 三玖天下第一
"""
import json
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
import openpyxl
import requests
print_lock = threading.Lock()


def my_print(*args, **kwargs):
    """Forward all arguments to ``print`` while holding ``print_lock``.

    Only one thread at a time can be inside the ``print`` call.
    """
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        print_lock.release()
def get_proxy_ips(ip_queue: Queue):
    """Keep feeding ``ip_queue`` with proxy addresses fetched from the provider API.

    Loops forever and polls the API every 3 seconds, so it is meant to run in
    a daemon thread. Every non-empty line of a successful response body is
    queued as one proxy address.

    :param ip_queue: queue that receives the proxy address strings.
    """
    api = 'http://api.kuainiaoip.com/index.php?fetch_type=2021060217064947339&pool_id=&qty=5&time=101&province=%E5%9B%9B%E5%B7%9D%E7%9C%81&city=%E6%88%90%E9%83%BD%E5%B8%82&protocol=1&format=txt-normal&dt=1'
    while True:
        try:
            response = requests.get(api, timeout=10)
        except requests.RequestException as err:
            # A transient network error must not kill this thread: consumers
            # block on the queue and would otherwise wait forever.
            print('获取代理失败!', err)
        else:
            body = response.text.strip()
            if response.status_code != 200:
                print('获取代理失败!')
            elif body == '10404:没有找到相关记录':
                print('提取频繁请按照规定频率提取')
            else:
                # splitlines() + strip() copes with "\r\n" line endings and
                # keeps the last address when there is no trailing newline.
                for line in body.splitlines():
                    ip = line.strip()
                    if ip:
                        ip_queue.put(ip)
        time.sleep(3)
def get_content2(q: Queue, ip_obj, url, header, data):
    """POST ``data`` to ``url`` through the current proxy and queue the JSON reply.

    A 200 reply is decoded and put on ``q``. Any other status rotates the
    shared proxy; a 403 is additionally retried through ``get_content``.
    A request failure or an undecodable body is printed, rotates the proxy
    and hands the page over to ``get_content`` as well.

    :param q: queue receiving the decoded JSON payload.
    :param ip_obj: proxy holder exposing ``ip``, ``is_update(old)`` and ``update(old)``.
    :param url: endpoint to POST to.
    :param header: HTTP headers for the request.
    :param data: request payload; it is sent JSON-encoded.
    """
    ip = ip_obj.ip
    try:
        # requests selects a proxy by scheme name ('http' / 'https'); the
        # proxies() helper builds exactly that mapping. A timeout keeps a dead
        # proxy from blocking the worker thread indefinitely.
        res = requests.post(url, data=json.dumps(data), headers=header,
                            proxies=proxies(ip), timeout=5)
        if res.status_code == 200:
            q.put(res.json())
            return
        retry = res.status_code == 403
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a 200 reply whose body is not valid JSON.
        print(e)
        retry = True

    # Only rotate when nobody else has already replaced the proxy we used.
    if ip_obj.is_update(ip):
        ip_obj.update(ip)
    if retry:
        get_content(q, ip_obj, url, header, data)
def proxies(ip):
    """Return a mapping that sends both HTTP and HTTPS traffic through ``ip``."""
    return dict.fromkeys(('http', 'https'), ip)
def get_content(q: Queue, ip_obj, url, header, data):
    """POST ``data`` to ``url``, rotating proxies until a 200 reply is queued.

    Each attempt uses the proxy currently held by ``ip_obj`` and a 5 second
    timeout. A non-200 status or a request failure rotates the proxy, waits
    0.1 s and tries again; there is no upper bound on the number of attempts.

    :param q: queue receiving the decoded JSON payload.
    :param ip_obj: proxy holder exposing ``ip`` and ``update(old)``.
    :param url: endpoint to POST to.
    :param header: HTTP headers for the request.
    :param data: request payload; it is sent JSON-encoded.
    :raises ValueError: if a 200 reply does not contain valid JSON.
    """
    payload = json.dumps(data)
    # Retry in a loop rather than by recursion so a long run of bad proxies
    # cannot exhaust the interpreter's recursion limit.
    while True:
        ip = ip_obj.ip
        try:
            # requests selects a proxy by scheme name ('http' / 'https');
            # the proxies() helper builds exactly that mapping.
            res = requests.post(url, data=payload, headers=header,
                                proxies=proxies(ip), timeout=5)
        except requests.RequestException as err:
            # Timeouts and connection errors are treated like a bad proxy.
            print(err)
        else:
            if res.status_code == 200:
                q.put(res.json())
                return
        ip_obj.update(ip)
        time.sleep(0.1)
def add_get_page(q: Queue):
    """Request the Douban Read listing pages and push each JSON reply onto ``q``.

    Starts a daemon thread that refills a private proxy queue, then submits
    one ``get_content2`` task per page in ``range(60000, 64853)`` to a pool of
    256 threads and returns once every task has finished.

    :param q: queue handed to the workers for the fetched page payloads.
    """
    print('第一个子进程执行...')
    proxy_ip = Queue()
    refill_thread = threading.Thread(target=get_proxy_ips, args=(proxy_ip,), daemon=True)
    refill_thread.start()

    class IpObject:
        """Holder of the proxy address shared by all worker threads."""

        lock = threading.RLock()
        # Blocks here until the refill thread has delivered the first proxy.
        ip = proxy_ip.get()

        @classmethod
        def update(cls, old):
            """Swap in a fresh proxy unless ``old`` has already been replaced."""
            with cls.lock:
                if old != cls.ip:
                    return
                cls.ip = proxy_ip.get()
                print('update', cls.ip)

        @classmethod
        def is_update(cls, old):
            """Tell whether ``old`` is still the proxy currently in use."""
            return old == cls.ip

    # Target URL. This is a POST endpoint; the request body selects the data.
    url = "https://read.douban.com/j/kind/"
    # Browser-like request headers.
    header = {
        "accept": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Host": "read.douban.com",
        "Cookie": "bid=jXNUTLsP_28; gr_user_id=e52067be-9219-484a-9f84-a1129fa1acbf; __utmz=30149280.1622524612.1.1.utmcsr=sogou.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __utma=30149280.2030887735.1622524612.1622524612.1622541364.2; _ga=GA1.3.2030887735.1622524612; _gid=GA1.3.231733992.1622705350; _pk_ses.100001.a7dd=*; _gat=1; _pk_id.100001.a7dd=f10116d5e1b94476.1622705350.1.1622705395.1622705350.",
        "Origin": "https://read.douban.com",
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        "x-csrf-token": "null",
        "x-requested-with": "XMLHttpRequest",
    }
    # The GraphQL query text is identical for every page.
    query = " query getFilterWorksList($works_ids: [ID!]) { worksList(worksIds: $works_ids) { title cover url isBundle url title author { name url } origAuthor { name url } translator { name url } abstract editorHighlight isOrigin kinds { name @skip(if: true) shortName @include(if: true) id } ... on WorksBase @include(if: true) { wordCount wordCountUnit } ... on WorksBase @include(if: false) { isEssay ... on EssayWorks { favorCount } isNew averageRating ratingCount url } ... on WorksBase @include(if: true) { isColumn isEssay onSaleTime ... on ColumnWorks { updateTime } } ... on WorksBase @include(if: true) { isColumn ... on ColumnWorks { isFinished } } ... on EssayWorks { essayActivityData { title uri tag { name color background icon2x icon3x iconSize { height } iconPosition { x y } } } } highlightTags { name } ... on WorksBase @include(if: false) { fixedPrice salesPrice isRebate } ... on EbookWorks { fixedPrice salesPrice isRebate } ... on WorksBase @include(if: true) { ... on EbookWorks { id isPurchased isInWishlist } } id isOrigin } } "

    # Leaving the ``with`` block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=256) as thread_pool:
        # 64853 is the upper bound noted by the author (presumably the total
        # page count at the time; not verifiable from this file).
        for page in range(60000, 64853):
            data = {
                "sort": "new",
                "page": page,
                "kind": 0,
                "query": query,
                "variables": {},
                "tags": [],
            }
            thread_pool.submit(get_content2, q, IpObject, url, header, data)
            # get_content(q, IpObject, url, header, data)
def _book_row(book, root):
    """Flatten one book record of a listing payload into a spreadsheet row."""
    author = book['author']
    first_author = author[0] if author else None
    return [
        book['title'],
        book['cover'],
        root + book['url'],
        first_author['name'] if first_author else '',
        root + first_author['url'] if first_author else '',
        book['abstract'],
        '|'.join(kind['shortName'] for kind in book['kinds']),
        book['wordCount'],
        # Records without an 'isFinished' field are reported as 'TRUE'.
        book.get('isFinished', 'TRUE'),
    ]


def analysis_data(pending_data: Queue, data: Queue):
    """Turn raw page payloads from ``pending_data`` into rows put on ``data``.

    Runs until the string 'end' is read from ``pending_data``. For every
    payload, one list of rows (one row per book in ``payload['list']``) is put
    on ``data``. A payload without a ``list`` entry, or a book record that
    lacks an expected field, is reported and skipped so that one bad reply
    cannot stop the parser.

    :param pending_data: queue of decoded page payloads, ended by 'end'.
    :param data: queue receiving one list of rows per valid payload.
    """
    print('第二个子进程执行...')
    root = 'https://read.douban.com'
    while True:
        content = pending_data.get()
        if content == 'end':
            print('子进程二结束')
            return

        content_list = content.get('list') if isinstance(content, dict) else None
        if not isinstance(content_list, list):
            print('页面数据缺少 list 字段,已跳过')
            continue

        all_list = []
        for dict1 in content_list:
            try:
                all_list.append(_book_row(dict1, root))
            except (KeyError, TypeError, IndexError) as err:
                print('图书数据解析失败,已跳过:', repr(err))
        data.put(all_list)
def save_data(data: Queue):
    """Append rows read from ``data`` to a workbook saved as ./files/scrawp3.xlsx.

    Each item taken from ``data`` is an iterable of rows; every row is appended
    to the sheet. A background thread saves the workbook every 3 seconds so
    progress survives an interruption. When the string 'end' is read, the
    periodic saver is stopped and the workbook is saved one final time.

    :param data: queue of row lists, ended by 'end'.
    """
    import os

    print('第三个子进程执行...')
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = '豆瓣图书免费'
    sheet.append(['title', 'img', 'url', 'author_name', 'author_url', 'abstract', 'kinds', 'wordCount', 'isFinished'])
    file = r'./files/scrawp3.xlsx'
    # Saving fails if the target directory is missing.
    os.makedirs(os.path.dirname(file), exist_ok=True)

    # The workbook is touched by two threads; the lock keeps a periodic save
    # from running while rows are being appended.
    wb_lock = threading.Lock()
    finished = threading.Event()

    def save_periodically():
        # Event.wait returns False on timeout, i.e. once every 3 seconds
        # until ``finished`` is set.
        while not finished.wait(3):
            with wb_lock:
                wb.save(file)

    saver = threading.Thread(target=save_periodically, daemon=True)
    saver.start()
    try:
        while True:
            content = data.get()
            if content == 'end':
                break
            with wb_lock:
                for row in content:
                    sheet.append(row)
    finally:
        # Stop the saver first, then write the final state explicitly instead
        # of relying on the timing of the background thread.
        finished.set()
        saver.join()
        wb.save(file)
    print('子进程三结束')
if __name__ == '__main__':
    # Pipeline: p1 fetches pages -> pending_data -> p2 parses them -> data
    # -> p3 writes the workbook.
    pending_data = Queue(maxsize=4096)
    data = Queue(maxsize=4096)
    p1 = Process(target=add_get_page, args=(pending_data,))
    p2 = Process(target=analysis_data, args=(pending_data, data))
    p3 = Process(target=save_data, args=(data,))
    p1.start()
    p2.start()
    p3.start()

    p1.join()
    # Each queue has exactly one consumer, so one 'end' sentinel is enough.
    pending_data.put('end')
    print('网页爬取完成...')

    p2.join()
    data.put('end')
    print('数据解析完成...')

    # Wait for the writer so the workbook is complete before the program exits.
    p3.join()