文章目录
进程
一、程序、进程和线程
程序:一个应用可以当做一个程序,比如qq软件
进程:操作系统进行资源分配的最小单位。一个程序可以有多个进程
线程:cpu调度的最小单位,必须依赖进程而存在。线程没有独立的资源,所有线程共享它所在进程的资源
import time
import random
import threading
#单线程爬虫
def download(fileName):
    """Simulate downloading a file by sleeping for a random 0-10 seconds.

    Args:
        fileName: Label of the file; only used in the progress messages.
    """
    print(f"{fileName}文件开始下载")
    # The random delay stands in for network time so that, when several
    # threads run this function, their start/finish messages interleave.
    time.sleep(random.random()*10)
    print(f"{fileName}文件完成下载")
#单线程 默认主线程
if __name__ == '__main__':
    # Start one thread per file so the five downloads overlap. Calling
    # download(i) directly here instead would run them one after another
    # in the main thread.
    for i in range(5):
        t = threading.Thread(target=download, args=(i,))
        t.start()
关系:一个程序至少有一个进程,一个进程至少有一个线程
二、多线程
多线程是指一个程序包含多个并行的线程来完成不同的任务
优点:可以提高cpu的利用率
三、创建多线程
第一种方法
# 导包
import threading
# 创建一个线程
t = threading.Thread(
target = 方法名,
args = (参数,) # 方法的参数,必须以元组形式传入
)
# 启动线程
t.start()
import random,time,threading
def sing():
    """Print three "singing" progress lines, pausing up to 1s after each."""
    for i in range(3):
        print(f'{i}正在唱歌--------')
        # Random pause so the output interleaves with the other thread.
        time.sleep(random.random())
def dance():
    """Print three "dancing" progress lines, pausing up to 1s after each."""
    for i in range(3):
        print(f'{i}正在跳舞------')
        # Random pause so the output interleaves with the other thread.
        time.sleep(random.random())
if __name__ == '__main__':
    # Run both tasks, each in its own thread.
    t1 = threading.Thread(target=sing)
    t2 = threading.Thread(target=dance)
    t1.start()
    t2.start()
    # Report the number of live threads until only the main thread is left.
    while True:
        length = len(threading.enumerate())
        print(f'当前运行的线程数量:{length}')
        time.sleep(random.random())
        if length <= 1:
            break
四、线程生存期
线程启动到任务方法执行完毕称为一个线程的生存期
五、查看线程数量
len(threading.enumerate()) # threading.enumerate()返回当前进程中所有存活线程的列表,用len()即可得到线程的数量
import random,time,threading
def sing():
    """Print three "singing" progress lines, pausing up to 1s after each."""
    for i in range(3):
        print(f'{i}正在唱歌--------')
        # Random pause so the output interleaves with the other thread.
        time.sleep(random.random())
def dance():
    """Print three "dancing" progress lines, pausing up to 1s after each."""
    for i in range(3):
        print(f'{i}正在跳舞------')
        # Random pause so the output interleaves with the other thread.
        time.sleep(random.random())
if __name__ == '__main__':
    # Run both tasks, each in its own thread.
    t1 = threading.Thread(target=sing)
    t2 = threading.Thread(target=dance)
    t1.start()
    t2.start()
    # threading.enumerate() lists the live threads (main thread included),
    # so keep reporting until only the main thread is left.
    while True:
        length = len(threading.enumerate())
        print(f'当前运行的线程数量:{length}')
        time.sleep(random.random())
        if length <= 1:
            break
六、创建线程
第二种方法:通过线程类来创建
-
继承threading.Thread
-
重写run方法
-
实例化这个类就可以创建线程,之后再调用start()方法启动
# Parent class
class A:
    """Base class whose constructor and ``run`` announce themselves."""

    def __init__(self):
        print('父类init被触发')

    def run(self):
        print('父类的run被调用')


# Subclass
class B(A):
    """Subclass that overrides ``run`` and inherits ``__init__`` from ``A``."""

    def run(self):
        print('子类的run被调用')


# Instantiation: B defines no __init__, so A.__init__ runs here, while
# b.run() dispatches to the override in B.
b = B()
b.run()
七、线程类传参
必须在线程类的`__init__`方法中调用父类的`__init__`方法,否则调用start()时会抛出RuntimeError
# 调用父类init方法有两种:
super().__init__() # 第一种
threading.Thread.__init__(self) # 第二种
import threading,time
class MyThread(threading.Thread):
    """Thread that reports the file it was asked to download.

    Args:
        filename: Name of the file, shown in the message printed by ``run``.
    """

    def __init__(self, filename):
        # Thread.__init__ must run before start(); if it is skipped,
        # start() raises "RuntimeError: thread.__init__() not called".
        super().__init__()
        self.filename = filename
        print('线程开始启动=====')

    def run(self):
        print(f'线程开始下载:{self.filename}---')
if __name__ == '__main__':
    # The constructor argument is stored on the thread and read by run().
    t = MyThread('log.png')
    t.start()
python解释器:
- CPython:用C语言编写的官方实现
- Jython(旧称JPython)
- PyPy
八、线程类
实例化线程类的时候,可以通过指定name参数来给线程起名字
t = MyThread(name = 'downloadThread')
t.start()
# 在线程类中调用self.name来使用线程的名称
# 如果不指定名称,默认情况下就是Thread-1,Thread-2,...
import threading
class MyThread(threading.Thread):
    """Thread whose ``run`` prints the thread's own name."""

    def run(self):
        # self.name is the name= given to the constructor, or the
        # automatically assigned default name when none was given.
        print("%s正在下载...." % self.name)
if __name__ == '__main__':
    # Give the thread an explicit name.
    t = MyThread(name="downloadThread")
    t.start()
    # Without name=, threads get the default names Thread-1, Thread-2, ...
    # for i in range(5):
    #     t = MyThread()
    #     t.start()
格式化字符串的三种方法
1.'....%s.'%i
2.'..{2}..{1}...{0}'.format(a,b,c) (按位置取值,下标从0开始)
'..{}...{}..{}'.format(a,b,c)
3.f'...{filename}..'
from threading import Thread
import time
class Mythread(Thread):
    """Thread that prints three download progress lines, one per second.

    Args:
        filename: Identifier of the file, shown in each progress line.
    """

    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def run(self):
        for _ in range(3):
            time.sleep(1)
            print(f'当前线程是:{self.name},正在下载:{self.filename}')
if __name__ == '__main__':
    # Three threads, each "downloading" the file identified by its index.
    for i in range(3):
        t = Mythread(i)
        t.start()
九、线程的执行顺序
线程的执行顺序是不固定的,主要由线程的状态和cpu的调度决定
from threading import Thread
import time
import random
g_num = 100
def work1():
    """Increment the global ``g_num`` three times, printing it each time."""
    global g_num
    for _ in range(3):
        g_num += 1
        # The pause sits between the update and the print, so another
        # thread may change g_num before this thread reports it.
        time.sleep(random.random())
        print('in work1,gum=%d' % g_num)
def work2():
    """Increment the global ``g_num`` three times, printing it each time."""
    global g_num
    for _ in range(3):
        g_num += 1
        # The pause sits between the update and the print, so another
        # thread may change g_num before this thread reports it.
        time.sleep(random.random())
        print('in work2,gum=%d' % g_num)
if __name__ == '__main__':
    t1 = Thread(target=work1)
    t2 = Thread(target=work2)
    t1.start()
    t2.start()
    # One possible output (the interleaving differs from run to run):
    #   in work1,gum=102
    #   in work2,gum=103
    #   in work2,gum=104
    #   in work1,gum=105
    #   in work1,gum=106
    #   in work2,gum=106
线程的五种状态:
- 新建:线程创建
t = Mythread(i)
或
t = threading.Thread(target=方法名)
- 就绪:当启动线程后,线程就进入就绪状态,就绪状态的线程会被放到一个cpu的调度队列里面,cpu会负责让其中的线程运行,变为运行状态
- 运行状态:cpu调度一个就绪状态的线程,该线程就变为运行状态
- 阻塞状态:当运行状态的线程被阻塞time.sleep()就变为阻塞状态,阻塞状态的线程会重新变为就绪状态才能继续运行
- 死亡状态:线程执行完毕
十、线程不安全问题
多个线程对公有变量处理时,容易造成数据混乱
十一、多线程和多进程*
(一)多线程
优点
- 程序逻辑和控制方式简单
- 所有线程可以直接共享内存和变量
- 线程方式消耗的总资源比进程方式好
缺点
- 每个线程与主程序共用地址空间,受限于2GB地址空间
- 线程之间的同步和加锁控制比较麻烦
- 一个线程的崩溃可能影响到整个程序的稳定性
(二)多进程
优点
- 每个进程相互独立,不影响主程序的稳定性,子进程崩溃没关系
- 通过增加CPU,就容易扩充性能
- 每个子进程都有2GB地址空间和相关资源,总体能达到的性能上限非常大
缺点
- 逻辑控制复杂,需要和主程序交互
- 需要跨进程边界,如果有大数据量传送,就不太好,适合小数据量传送、密集运算,多进程调度开销比较大
在实际开发中,选择多线程还是多进程应该根据具体的开发需求来决定。最好是多进程和多线程结合,即根据实际的需要,每个CPU开启一个子进程,这个子进程再开启多个线程,对若干同类型的数据进行处理
#列表(容器)当做实参传递到线程中
import threading
import time
import copy
def work1(download_list, finish_list, delay=1):
    """Process every pending download, moving each file to ``finish_list``.

    Both lists are mutated in place so the thread that passed them in can
    watch the progress.

    Args:
        download_list: Pending files (integers); emptied as files complete.
        finish_list: Receives each file once it has been "downloaded".
        delay: Seconds to sleep per file to simulate the download.
    """
    # Iterate over a shallow copy: removing from download_list while
    # iterating download_list itself makes the iterator skip the element
    # after every removal, so some files would never be processed.
    copy_list = copy.copy(download_list)
    for file in copy_list:
        print("----in work1---download:%d" % file)
        time.sleep(delay)
        # Download finished: move the file from pending to finished.
        download_list.remove(file)
        finish_list.append(file)
if __name__ == '__main__':
    # Download task list
    download_list = [11, 22, 33]
    total = len(download_list)  # total number of tasks
    finish_list = []
    # The lists are passed by reference, so changes made by the worker
    # thread are visible here.
    t1 = threading.Thread(target=work1, args=(download_list, finish_list))
    t1.start()
    while True:
        print(download_list, finish_list)
        finished = len(finish_list)
        print("当前下载进度:%.2f%%" % (finished / total * 100))
        time.sleep(1)
        # Compare integer counts rather than testing a float ratio against 1.
        if finished == total:
            print("全部任务下载完成!")
            break
十二、互斥锁
通过互斥锁来确保线程之间数据的正确
创建互斥锁的步骤:
# 1. Create the mutex.
mutex = threading.Lock()
# 2. Acquire it before touching shared state. acquire() blocks until the
#    lock is free and then returns True.
if mutex.acquire():
    # Placeholder: the code that works on the shared variable goes here.
    '''
    对公有变量处理的代码
    '''
    mutex.release()  # Release the lock
十三、死锁
产生死锁的情况有两种:
-
当一个线程获取了锁之后,还未释放的前提下,试图获取另一把锁
-
线程A获取锁1,线程B获取了锁2,线程A还未释放锁1继续想要获取锁2,线程B也未释放锁2,同时想要获取锁1
from threading import Thread
import threading
# Create a lock object
metux = threading.Lock()
# NOTE(review): this takes the lock in the main thread and nothing visible
# here releases it, so any later blocking acquire() waits forever. It may
# have been a commented-out line in the original article - confirm.
metuxflag = metux.acquire()
g_num = 0
# NOTE(review): the indentation below was reconstructed from a flattened
# listing; confirm the nesting against the original article.
def a1():
    """Deadlock demo: acquire ``metux`` again while already holding it.

    ``threading.Lock`` is not reentrant, so the inner ``acquire()`` blocks
    forever because the outer hold has not been released yet.
    """
    global g_num
    if metux.acquire():
        for i in range(1000000):
            # g_num += 1
            print('马上发生死锁!')
            # Second acquire on the lock this thread already holds.
            if metux.acquire():
                b = g_num + 1
                g_num = b
                metux.release()
        metux.release()
    print("—test1—g_num=%d"%g_num)


def a2():
    """Add one million to ``g_num`` while holding ``metux``."""
    global g_num
    if metux.acquire():
        for i in range(1000000):
            a = g_num + 1
            g_num = a
        metux.release()
    print("—test2—g_num=%d"%g_num)
if __name__ == '__main__':
    p1 = Thread(target=a1)
    p1.start()
    p2 = Thread(target=a2)
    p2.start()
    # Wait for both workers before printing the final value.
    p1.join()
    p2.join()
    print(g_num)
十四、爬取网站的流程
- 确定网站哪个url是数据的来源
- 简要分析一下网站结构,查看数据一般放在哪里
- 查看是否有分页,解决分页的问题
- 发送请求,查看response.text里面是否有我们想要的数据内容
- 如果有数据,就用相应的提取数据的方法提取数据并保存
- 如果没有,我们就可以通过以下两种方法来实现爬取:
- 分析数据来源,查看页面数据是否是通过一些接口获取到的(首推)
如果没有在页面中返回数据,我们应该首先想到,数据有可能是从ajax接口中获得的
分析接口的步骤:
- 查看接口返回的数据是否是我们想要的
- 重点查看该接口的请求参数
了解哪些请求参数是变化的以及他的变化规律
- selenium+phantomjs来获取页面内容
- 分析数据来源,查看页面数据是否是通过一些接口获取到的(首推)
tencent_ajax.py—单线程
import requests,json,time
class Tencent:
    """Single-threaded crawler for the Tencent careers job-listing JSON API.

    Constructing an instance immediately crawls pages 1-29 and appends the
    extracted records to ``infos.txt`` in the current directory.

    Args:
        url: Endpoint of the job query API.
        headers: HTTP headers sent with every request.
    """

    def __init__(self, url, headers):
        self.url = url
        self.headers = headers
        # Constructing the object starts the crawl right away.
        self.parse()

    def write_to_file(self, list_):
        """Append every record in ``list_`` to ``infos.txt``, one per line.

        Nothing is written (and the file is not created) for an empty list.
        """
        if not list_:
            return
        # Open the file once per batch instead of once per record, and end
        # each record with a newline so the records stay distinguishable.
        with open('infos.txt', 'a+', encoding='utf-8') as fp:
            fp.write(''.join(f'{item}\n' for item in list_))

    def parse_json(self, text):
        """Extract the job records from one JSON response and save them.

        Args:
            text: JSON string with the postings under ``Data`` -> ``Posts``.
        """
        # Turn the JSON string into built-in Python objects.
        json_dict = json.loads(text)
        infos = []
        # Treat a null "Posts" value as "no postings" instead of crashing.
        for data in json_dict['Data']['Posts'] or []:
            item = {
                'RecruitPostName': data['RecruitPostName'],
                'CategoryName': data['CategoryName'],
                'Responsibility': data['Responsibility'],
                'LastUpdateTime': data['LastUpdateTime'],
                'detail_url': data['PostURL'],
            }
            print(item)
            infos.append(item)
        self.write_to_file(infos)

    def parse(self):
        """Request pages 1-29 one after another and process each response."""
        for i in range(1, 30):
            params = {
                'timestamp': '1572856224161',
                'countryId': '',
                'cityId': '',
                'bgIds': '',
                'productId': '',
                'categoryId': '',
                'parentCategoryId': '',
                'attrId': '',
                'keyword': '',
                'pageIndex': str(i),
                'pageSize': '10',
                'language': 'zh-cn',
                'area': 'cn',
            }
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(
                self.url, params=params, headers=self.headers, timeout=10
            )
            self.parse_json(response.text)
if __name__ == '__main__':
    start = time.time()
    base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?'
    headers = {
        # The referer must be a well-formed URL (no spaces inside it).
        'referer': 'https://careers.tencent.com/search.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
    }
    # Constructing the crawler runs the whole crawl.
    Tencent(base_url, headers)
    # Presumably the elapsed seconds of one sample run: 11.333648204803467
    print(time.time()-start)
tencent_ajax_multi1.py–一页一个线程
import time
import requests,json,threading
class Tencent:
    """Crawler for a single page of the Tencent careers job-listing API.

    Unlike the single-threaded version, construction does not start any
    request; call ``parse`` (typically as a thread target) to fetch the page.

    Args:
        url: Endpoint of the job query API.
        headers: HTTP headers sent with the request.
        params: Query parameters selecting the page this instance fetches.
    """

    def __init__(self, url, headers, params):
        self.url = url
        self.headers = headers
        self.params = params

    def write_to_file(self, list_):
        """Append every record in ``list_`` to ``infos.txt``, one per line.

        Nothing is written (and the file is not created) for an empty list.
        """
        if not list_:
            return
        # One open and one write per batch, with a newline after each
        # record so the records stay distinguishable.
        with open('infos.txt', 'a+', encoding='utf-8') as fp:
            fp.write(''.join(f'{item}\n' for item in list_))

    def parse_json(self, text):
        """Extract the job records from one JSON response and save them.

        Args:
            text: JSON string with the postings under ``Data`` -> ``Posts``.
        """
        # Turn the JSON string into built-in Python objects.
        json_dict = json.loads(text)
        infos = []
        # Treat a null "Posts" value as "no postings" instead of crashing.
        for data in json_dict['Data']['Posts'] or []:
            infos.append({
                'RecruitPostName': data['RecruitPostName'],
                'CategoryName': data['CategoryName'],
                'Responsibility': data['Responsibility'],
                'LastUpdateTime': data['LastUpdateTime'],
                'detail_url': data['PostURL'],
            })
        self.write_to_file(infos)

    def parse(self):
        """Fetch this instance's page and process the response."""
        # Use the params stored on the instance. The module-level name
        # ``params`` is rebound by the loop that spawns the threads, so
        # reading it here could fetch the wrong page (or fail with a
        # NameError when the class is used from another module).
        response = requests.get(
            self.url, params=self.params, headers=self.headers, timeout=10
        )
        self.parse_json(response.text)
if __name__ == '__main__':
    start = time.time()
    base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?'
    headers = {
        # The referer must be a well-formed URL (no spaces inside it).
        'referer': 'https://careers.tencent.com/search.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
    }
    crawl_list = []
    for i in range(1, 30):
        params = {
            'timestamp': '1572850797210',
            'countryId': '',
            'cityId': '',
            'bgIds': '',
            'productId': '',
            'categoryId': '',
            'parentCategoryId': '',
            'attrId': '',
            'keyword': '',
            'pageIndex': str(i),
            'pageSize': '10',
            'language': 'zh-cn',
            'area': 'cn',
        }
        tencent = Tencent(base_url, headers, params)
        # Start one thread per page, using the target= way of creating threads.
        t = threading.Thread(target=tencent.parse)
        t.start()
        crawl_list.append(t)
    # Join every thread so the elapsed time is measured only after all of
    # them have finished.
    for t in crawl_list:
        t.join()
    # Presumably the elapsed seconds of one sample run: 8.0724618434906
    print(time.time()-start)
tencent_ajax_multi.py–固定五个线程
import requests,json,time,threading
from queue import Queue
class Tencent(threading.Thread):
    """Worker thread that crawls job-listing pages taken from a shared queue.

    Args:
        url: Endpoint of the job query API.
        headers: HTTP headers sent with every request.
        name: Thread name, shown in the progress output.
        q: Queue of page numbers shared by all workers.
    """

    def __init__(self, url, headers, name, q):
        super().__init__()
        self.url = url
        self.name = name
        self.q = q
        self.headers = headers

    def run(self):
        self.parse()

    def write_to_file(self, list_):
        """Append every record in ``list_`` to ``infos.txt``, one per line.

        Nothing is written (and the file is not created) for an empty list.
        """
        if not list_:
            return
        # One open and one write per batch, with a newline after each
        # record so the records stay distinguishable.
        with open('infos.txt', 'a+', encoding='utf-8') as fp:
            fp.write(''.join(f'{item}\n' for item in list_))

    def parse_json(self, text):
        """Extract the job records from one JSON response and save them.

        Args:
            text: JSON string with the postings under ``Data`` -> ``Posts``.
        """
        # Turn the JSON string into built-in Python objects.
        json_dict = json.loads(text)
        infos = []
        # Treat a null "Posts" value as "no postings" instead of crashing.
        for data in json_dict['Data']['Posts'] or []:
            infos.append({
                'RecruitPostName': data['RecruitPostName'],
                'CategoryName': data['CategoryName'],
                'Responsibility': data['Responsibility'],
                'LastUpdateTime': data['LastUpdateTime'],
                'detail_url': data['PostURL'],
            })
        self.write_to_file(infos)

    def parse(self):
        """Keep taking page numbers from the queue until it is empty."""
        from queue import Empty

        while True:
            # Take a page without blocking. Checking empty() and then
            # calling a blocking get() lets another worker grab the last
            # page in between, leaving this thread waiting forever.
            try:
                page = self.q.get_nowait()
            except Empty:
                break
            print(f'==================第{page}页==========================in{self.name}')
            params = {
                'timestamp': '1572856224161',
                'countryId': '',
                'cityId': '',
                'bgIds': '',
                'productId': '',
                'categoryId': '',
                'parentCategoryId': '',
                'attrId': '',
                'keyword': '',
                'pageIndex': str(page),
                'pageSize': '10',
                'language': 'zh-cn',
                'area': 'cn',
            }
            # A timeout keeps one stalled connection from hanging the worker.
            response = requests.get(
                self.url, params=params, headers=self.headers, timeout=10
            )
            self.parse_json(response.text)
if __name__ == '__main__':
    start = time.time()
    base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?'
    headers = {
        # The referer must be a well-formed URL (no spaces inside it).
        'referer': 'https://careers.tencent.com/search.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
    }
    # 1. Create the task queue.
    q = Queue()
    # 2. Fill it with tasks; each task is a page number.
    for page in range(1, 50):
        q.put(page)
    # 3. Start a fixed pool of five named workers sharing the queue.
    crawl_list = ['aa', 'bb', 'cc', 'dd', 'ee']
    list_ = []
    for name in crawl_list:
        t = Tencent(base_url, headers, name, q)
        t.start()
        list_.append(t)
    # Wait for every worker so the elapsed time covers the whole crawl.
    for worker in list_:
        worker.join()
    print(time.time()-start)