#################################### 多线程
from threading import Thread # 线程类
####多线程第一套写法---推荐
# def func():
#     for i in range(1000):
#         print("func", i)
#
#
# if __name__ == '__main__':
#     t = Thread(target=func)  # 创建线程并给线程安排任务
#     t.start()  # 多线程状态为可以开始工作状态, 具体的执行时间由CPU决定
#
#     for i in range(1000):
#         print("main", i)
###多线程第二套写法
class MyThread(Thread):
    """Thread subclass whose work is defined by overriding run().

    Calling start() runs run() in a new thread. Calling run() directly is an
    ordinary method call that executes in the caller's thread.
    """

    def run(self):
        """Print "子线程" followed by each integer from 0 to 999."""
        for i in range(1000):
            print("子线程", i)
if __name__ == '__main__':
    t = MyThread()
    # Do not call t.run() here: that is a plain method call and would execute
    # the loop in the main thread, making the program single-threaded.
    t.start()  # start the child thread
    for i in range(1000):
        print("主线程", i)
    # Wait for the child thread so its output does not spill into the code
    # that follows this demo.
    t.join()
##########################多进程+带参数多线程
from multiprocessing import Process
from threading import Thread
# def func():
#     for i in range(1000):
#         print("子进程", i)
#
#
# if __name__ == '__main__':
#     p = Process(target=func)
#     p.start()
#     for i in range(1000):
#         print("主进程", i)
def func(name):
    """Print ``name`` followed by each integer from 0 to 999, one pair per line.

    Args:
        name: Label printed in front of every counter value.
    """
    for i in range(1000):
        print(name, i)
if __name__ == '__main__':
    # Positional arguments for the target are passed through ``args``; the
    # trailing comma makes ("周杰伦",) a one-element tuple rather than a
    # parenthesised string.
    t1 = Thread(target=func, args=("周杰伦",))
    t1.start()
    t2 = Thread(target=func, args=("王力宏",))
    t2.start()
    # Wait for both threads so their output does not spill into the code that
    # follows this demo.
    t1.join()
    t2.join()
#####################线程池
# 线程池: 一次性开辟一些线程. 我们用户直接给线程池子提交任务. 线程任务的调度交给线程池来完成
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
def fn(name):
    """Print ``name`` followed by each integer from 0 to 999, one pair per line.

    Args:
        name: Label printed in front of every counter value.
    """
    for i in range(1000):
        print(name, i)
if __name__ == '__main__':
    # Create a pool of 50 worker threads and submit 100 tasks to it; the pool
    # decides which worker runs which task.
    with ThreadPoolExecutor(50) as t:
        for i in range(100):
            t.submit(fn, name=f"线程{i}")
    # Leaving the with-block waits for every submitted task to finish, so the
    # line below only runs once the pool is done.
    print("123")
###########################线程池+xpath应用
# 1. 如何提取单个页面的数据
# 2. 上线程池,多个页面同时抓取
import csv
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

import requests
from lxml import etree
# Output file and CSV writer used by download_one_page; the file is closed in
# the __main__ block. newline="" is required by the csv module: without it the
# writer's own "\r\n" line endings are translated again on Windows, producing
# a blank line after every row.
f = open("data2.csv", mode="w", encoding="utf-8", newline="")
csvwriter = csv.writer(f)
# Serialises writes to the module-level csvwriter. download_one_page is
# submitted to a thread pool, and one csv writer object must not be driven by
# several threads at the same time or rows can end up interleaved.
_csv_write_lock = Lock()


def download_one_page(url, timeout=10):
    """Download one listing page and append its table rows to the shared CSV.

    The page is fetched with requests and parsed with lxml. Every ``tr`` of
    the table at a fixed XPath except the first one (presumably the header
    row) is turned into one CSV row written through the module-level
    ``csvwriter``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block its pool
            worker indefinitely.

    Returns:
        None. If the expected table is missing, a message is printed and
        nothing is written for this page.
    """
    # Fetch the page source.
    resp = requests.get(url, timeout=timeout)
    html = etree.HTML(resp.text)

    # Guard against an unparsable page or a changed layout instead of failing
    # with an IndexError that the thread pool would silently discard.
    tables = [] if html is None else html.xpath("/html/body/div[2]/div[4]/div[1]/table")
    if not tables:
        print(url, "未找到表格, 已跳过!")
        return
    table = tables[0]

    # Skip the first tr; for every remaining row take the td text nodes and
    # strip backslashes and forward slashes from them. The rows are built as
    # real lists before the lock is taken so that no Python-level iteration
    # happens while the shared writer is in use.
    rows = [
        [cell.replace("\\", "").replace("/", "") for cell in tr.xpath("./td/text()")]
        for tr in table.xpath("./tr[position()>1]")
    ]

    # Append all rows of this page in one locked step.
    with _csv_write_lock:
        csvwriter.writerows(rows)
    print(url, "提取完毕!")
if __name__ == '__main__':
    try:
        # Create a pool of 50 worker threads. Leaving the with-block waits for
        # every submitted download to finish.
        with ThreadPoolExecutor(50) as t:
            # range(1, 200) yields page numbers 1 through 199.
            # NOTE(review): the original comment read "199 * 20 = 3980",
            # presumably 20 rows per page; confirm against the site.
            for i in range(1, 200):
                # Hand the download task for this page to the pool. The
                # returned future is discarded, so an exception raised inside
                # a worker is not reported here.
                url = f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml"
                t.submit(download_one_page, url)
    finally:
        # Close the CSV file even if submitting or waiting is interrupted, so
        # buffered rows are flushed to disk.
        f.close()
    print("全部下载完毕!")
# 8_多线程+多进程及应用
# 于 2022-03-09 16:48:38 首次发布