(一)以爬虫中的多线程为例
主要看main里面的内容,即:
if __name__ == '__main__':
# 1、创建任务队列。存放所有任务
page_queue = Queue()
for page in range(1,61):
page_queue.put(page)
# 2、生成线程
start = time.time()
crawl_name = ["c1","c2","c3"] # 创建三个线程
crawl_thread_list = []
for name in crawl_name:
craw1 = Thread_crawl(name,page_queue)
craw1.start()
crawl_thread_list.append(craw1)
#必须加到列表里之后再全部join
for thread in crawl_thread_list:
thread.join()
end = time.time()
print('耗时',end-start)
首先在队列里存放60个任务,然后开始计时,循环启动三个线程,再循环把三个线程加join,目的是让三个线程都执行完再继续主线程完成计时,防止主线程在三个子线程执行期间抢到cpu结束计时。如果在for name in crawl_name里的craw1.start()后直接加craw1.join(),那么其他两个子线程和主线程会在c1完成后再开始执行,也就是说任务都让c1干了。
总结: 多个子线程的join必须用列表完成
(二)完整爬虫代码
"""
1、程序:由源代码生成的可以执行的应用(例如:QQ)
2、进程:能完成多个任务,可以同时运行多个程序,是操作系统分配资源的独立单位
3、线程:能完成多个任务,例如一个QQ可以同时打开多个聊天窗口
"""
# 1、导入类
import json
import threading
import time
from queue import Empty, Queue

import requests
# 2、写子类,继承父类Thread类,复写run方法:
class Thread_crawl(threading.Thread):
def __init__(self, name, page_queue):
threading.Thread.__init__(self)
self.page_queue = page_queue
self.name = name
def run(self):
while not page_queue.empty():
print(self.name,"将要从我们的队列中取任务")
page = self.page_queue.get()
print(self.name, "取出的任务是:",page)
url = "https://careers.tencent.com/tencentcareer/api/post/Query?keyword=Python&pageIndex={}&pageSize=10".format(page)
self.get_content(url=url)
print(self.name, "完成的任务是:",page)
def get_content(self,url):
headers = {
'user - agent': 'Mozilla / 5.0(Windows NT 6.1;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 74.0.3729.108Safari / 537.36'
}
response = requests.get(url=url,headers=headers).content.decode('utf-8')
self.get_data(response)
#解析函数
def get_data(self,response):
data = json.loads(response) # 将json字符串转换成Python标准的数据格式
data_list = data["Data"]["Posts"]
for i in data_list:
# 岗位名称:
RecruitPostName=i["RecruitPostName"]
# 国家
CountryName=i["CountryName"]
# 种类
CategoryName=i["CategoryName"]
PostURL=i["PostURL"]
Responsibility=i["Responsibility"]
infor = "name:"+RecruitPostName+'---'+"CountryName:"+CountryName+'----'\
+"CategoryName:"+CategoryName+"-----"+"PostURL:"+PostURL+"-----"\
+"Responsibility:"+Responsibility
# 保存
with open('job1.txt','a',encoding='utf-8')as fp:
fp.write(infor+'\n')
# 提取数据
if __name__ == '__main__':
    # 1. Build the task queue holding every page number to crawl.
    # NOTE: the name `page_queue` must stay — Thread_crawl's run() may read it.
    page_queue = Queue()
    for page_no in range(1, 61):
        page_queue.put(page_no)

    # 2. Spawn three named worker threads and time the whole crawl.
    start = time.time()
    workers = [Thread_crawl(label, page_queue) for label in ["c1", "c2", "c3"]]
    for worker in workers:
        worker.start()
    # Join only after *every* worker has started, so they run concurrently;
    # joining inside the start loop would serialize the work onto c1.
    for worker in workers:
        worker.join()
    end = time.time()
    print('耗时', end - start)