# 多线程爬取
# 多线程回顾介绍
import threading
import time
def listening():
    """Print the "listening to music" line five times, pausing one second after each."""
    remaining = 5
    while remaining > 0:
        print("我在听歌")
        time.sleep(1)
        remaining -= 1
def reading():
    """Print the "I prefer reading" line five times, pausing one second after each."""
    rounds_done = 0
    while rounds_done != 5:
        print("我比较喜欢读书")
        time.sleep(1)
        rounds_done += 1
if __name__ == "__main__":
    # One thread per activity so that both loops run concurrently.
    t1, t2 = (threading.Thread(target=job) for job in (listening, reading))
    for worker in (t1, t2):
        worker.start()
    # Wait for both child threads to finish before the main thread continues.
    for worker in (t1, t2):
        worker.join()
    print("程序执行结束")
# A thread defined by subclassing threading.Thread and overriding run().
class MyThread(threading.Thread):
    """Demo thread that prints numbered progress lines with a pause after each.

    Args:
        name: Thread name; stored on the instance, so every thread object
            (t1, t2, ...) carries its own value.
        steps: How many progress lines run() prints. Defaults to 10.
        delay: Seconds to sleep after each line. Defaults to 1.
    """

    def __init__(self, name, steps=10, delay=1):
        # The parent initializer must run before any Thread attribute is set.
        super().__init__()
        self.name = name
        self.steps = steps
        self.delay = delay

    def run(self):
        """Override of Thread.run(): print lines 1..steps, sleeping after each."""
        for step in range(1, self.steps + 1):
            print("正在执行—{}\n".format(step))
            time.sleep(self.delay)
if __name__ == "__main__":
    # Three independent MyThread instances, named after their variables.
    t1, t2, t3 = (MyThread(label) for label in ("t1", "t2", "t3"))
    for worker in (t1, t2, t3):
        worker.start()
    # Hold the main thread here until every worker has finished.
    for worker in (t1, t2, t3):
        worker.join()
    print("程序执行结束")
# queue队列
# 队列:先进先出
import threading
import time
import queue
# queue is a Python standard-library module that implements queues.
# queue.Queue is a thread-safe, first-in-first-out structure meant to be
# shared between threads; plain lists and dicts provide no such locking for
# producer/consumer use.
#
# Creating a queue:
# 1. maxsize caps how many items the queue can hold (20 here). Once the cap
#    is reached, a blocking put() waits until an item has been consumed,
#    like a line with only 20 places: a newcomer can join only when a place
#    frees up.
# 2. If maxsize is less than or equal to 0 the queue size is unlimited.
q = queue.Queue(maxsize=20)
for i in range(1, 21):
    # Put the item into the queue.
    q.put(i)

# The queue is full at this point. A plain q.put() would block forever here
# because nothing is consuming yet, so the overflow is demonstrated with a
# non-blocking put, which raises queue.Full instead of hanging the script.
try:
    q.put_nowait(i + 20)
except queue.Full:
    print("队列已满,无法放入: {}".format(i + 20))

# Drain the queue: keep taking items for as long as it is not empty.
while not q.empty():
    print(q.get())
# 多线程爬取思路分析
#多线程--腾讯招聘信息爬取
import json
import os
import re
import time
from queue import Empty
from queue import Queue
from threading import Thread
from urllib.parse import quote

import requests
class TenxunSpider(object):
    """Multi-threaded crawler for job postings on careers.tencent.com.

    Listing-page URLs are built from the user's answers and put on a
    thread-safe queue. Worker threads running parse_url() take URLs off the
    queue, fetch every posting on the page and append its title,
    responsibilities and requirements to a text file under OUTPUT_DIR.
    """

    # Listing endpoint; placeholders: timestamp, city id, keyword, page index.
    LIST_URL = (
        "https://careers.tencent.com"
        "/tencentcareer/api/post/"
        "Query?timestamp={}&cityId={}"
        "&keyword={}&pageIndex={}"
        "&pageSize=10&language=zh-cn&area=cn"
    )
    # Detail endpoint; placeholders: timestamp, post id.
    DETAIL_URL = (
        "https://careers.tencent.com/tencentcareer/api/post/"
        "ByPostId?timestamp={}&postId={}&language=zh-cn"
    )
    # Directory that receives one text file per posting title.
    OUTPUT_DIR = "./腾讯招聘信息/"
    # Seconds to wait for a response, so a stalled request cannot hang a worker.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
        }
        # City name typed by the user -> cityId value sent to the API.
        self.address = {"深圳": 1, "北京": 2, "广州": 5, "上海": 3}
        self.url_list = []  # listing-page URLs built so far
        self.q = Queue()  # URLs waiting to be crawled, shared by the workers

    def run(self):
        """Single-threaded entry point: prompt, then crawl in the calling thread."""
        self.get_url_inform()
        self.parse_url()

    def parse_url(self):
        """Worker loop: crawl queued listing pages until the queue is empty.

        Uses get_nowait() rather than an empty() check followed by get():
        with several workers, another thread can take the last URL between
        the two calls and leave this one blocked forever.
        A failure on one page is reported and the worker moves on.
        """
        while True:
            try:
                url = self.q.get_nowait()
            except Empty:
                break
            try:
                self._crawl_page(url)
            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                print("页面抓取失败: {} ({})".format(url, exc))

    def _fetch_json(self, url):
        """GET url and return the decoded JSON body; raises on HTTP errors."""
        response = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return json.loads(response.text)

    def _crawl_page(self, url):
        """Fetch one listing page and save every posting it lists."""
        listing = self._fetch_json(url)
        # Guard against a null "Posts" value so an empty page is not an error.
        posts = listing["Data"]["Posts"] or []
        for post in posts:
            try:
                self._save_post(post)
            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                # One bad posting must not abort the rest of the page.
                print("岗位抓取失败: {}".format(exc))
                continue
            print("写入完毕......")
        print("写入完毕")

    def _save_post(self, post):
        """Fetch the detail record for one listing entry and write it to disk."""
        detail_url = self.DETAIL_URL.format(str(int(time.time())), post["PostId"])
        detail = self._fetch_json(detail_url)["Data"]
        self.write_data(
            detail["RecruitPostName"],  # title
            detail["Responsibility"],  # responsibilities
            detail["Requirement"],  # requirements
        )

    def write_data(self, name_data2, res_bili_data3, requirement_data3):
        """Append one posting to OUTPUT_DIR/<title>.txt (best effort).

        Args:
            name_data2: Posting title; also used as the file name.
            res_bili_data3: Responsibilities text.
            requirement_data3: Requirements text.

        Write failures are printed rather than raised, so a single bad
        posting does not stop the crawl.
        """
        try:
            # Drop characters that cannot appear in file names ("/" everywhere,
            # the others on Windows).
            safe_name = re.sub(r'[\\/:*?"<>|]', "", name_data2)
            os.makedirs(self.OUTPUT_DIR, exist_ok=True)
            path = os.path.join(self.OUTPUT_DIR, safe_name + ".txt")
            # Explicit UTF-8: the platform default encoding may not be able to
            # represent every character in the posting.
            with open(path, "a", encoding="utf-8") as f:
                f.write(safe_name)
                f.write("\n工作职责\n")
                f.write(res_bili_data3)
                f.write("\n工作要求\n")
                f.write(requirement_data3)
        except (OSError, TypeError) as e:
            print(e)

    def _build_urls(self, address, job_info, page):
        """Return the listing URLs for pages 1..page of the given search.

        Raises:
            ValueError: if address is not a key of self.address, or page is
                not an integer.
        """
        city_id = self.address.get(address)
        if city_id is None:
            raise ValueError(
                "不支持的地点: {} (可选: {})".format(address, "、".join(self.address))
            )
        # Escape the keyword so characters such as "&" cannot break the query.
        keyword = quote(job_info)
        return [
            self.LIST_URL.format(str(int(time.time())), str(city_id), keyword, str(i))
            for i in range(1, int(page) + 1)
        ]

    def get_url_inform(self):
        """Prompt for city, keyword and page count, then queue the listing URLs.

        Returns:
            self.url_list, which now includes the newly built URLs.
        """
        address = input("请输入您选择的地点:")
        job_info = input("请输入您选择的工作岗位方向:")
        page = input("您要查询多少页:")
        self.url_list.extend(self._build_urls(address, job_info, page))
        self.url_in()  # hand the URLs to the queue
        return self.url_list

    def url_in(self):
        """Put every URL in self.url_list on the queue.

        Deliberately does not start crawling: consuming the queue here would
        do all the work in the calling thread and leave the workers idle.
        """
        for url in self.url_list:
            self.q.put(url)

    def main(self, thread_count=3):
        """Multi-threaded entry point.

        Args:
            thread_count: Number of worker threads to start. Defaults to 3.
        """
        # 1. Collect the listing URLs that match the city and keyword.
        self.get_url_inform()
        # 2. Start the workers; each one drains the shared queue.
        t_list = [Thread(target=self.parse_url) for _ in range(thread_count)]
        for t in t_list:
            t.start()
        # 3. Wait for every worker before returning to the caller.
        for t in t_list:
            t.join()
if __name__ == "__main__":
    # Time the whole crawl, from building the spider to the last worker joining.
    start = time.time()
    TenxunSpider().main()
    print(time.time() - start)