- Parallelism means two or more events happen at the same instant.
- Concurrency means two or more events happen within the same period of time.
- Parallelism requires multiple CPUs; concurrency concerns a single CPU.
- The goal of concurrency is to make full use of every processor core and achieve the highest throughput.
The threading code is as follows:
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-12 09:31
# +--------------------------
# Multithreaded programming, two ways:
# 1. Instantiate Thread
# 2. Subclass Thread
import time
from threading import Thread


def sleep_task1():
    print("sleep 2 seconds start!")
    time.sleep(2)
    print("sleep 2 seconds end!")


def sleep_task2():
    print("sleep 3 seconds start!")
    time.sleep(3)
    print("sleep 3 seconds end!")


if __name__ == "__main__":
    t1 = Thread(target=sleep_task1)
    t1.start()
    t2 = Thread(target=sleep_task2)
    t2.start()
Improved code:
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-12 09:31
# +--------------------------
# Multithreaded programming, two ways:
# 1. Instantiate Thread
# 2. Subclass Thread
import time
from threading import Thread


def sleep_task(sleep_time):
    print("sleep {} seconds start!".format(sleep_time))
    time.sleep(sleep_time)
    print("sleep {} seconds end!".format(sleep_time))


if __name__ == "__main__":
    t1 = Thread(target=sleep_task, args=(2,))
    t1.start()
    t2 = Thread(target=sleep_task, args=(3,))
    t2.start()
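As a side note, Thread can also receive the parameter by keyword through kwargs; a minimal equivalent sketch:

# Equivalent to args=(2,): pass sleep_time by name instead.
t1 = Thread(target=sleep_task, kwargs={"sleep_time": 2})
t1.start()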
Continue improving the code by adding timing:
join() makes the main thread wait for the child threads above to finish before the code after join() runs.
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-12 09:31
# +--------------------------
# Multithreaded programming, two ways:
# 1. Instantiate Thread
# 2. Subclass Thread
import time
from threading import Thread


def sleep_task(sleep_time):
    print("sleep {} seconds start!".format(sleep_time))
    time.sleep(sleep_time)
    print("sleep {} seconds end!".format(sleep_time))


if __name__ == "__main__":
    start_time = time.time()
    t1 = Thread(target=sleep_task, args=(2,))
    t1.start()
    t2 = Thread(target=sleep_task, args=(3,))
    t2.start()
    # With join(), the main thread waits for t1 and t2 to finish before moving on.
    t1.join()
    t2.join()
    end_time = time.time()
    print("last_time:{}".format(end_time - start_time))
    # 1. When a program starts, a main thread is started by default.
    # 2. To make the main thread continue only after other threads finish, use join().
New requirement: the main thread gives t1 and t2 one second to finish; otherwise they are shut down. Use setDaemon(True):
t1.setDaemon(True)  # daemon thread: when the main thread exits, the child thread exits too
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-12 09:31
# +--------------------------
# Multithreaded programming, two ways:
# 1. Instantiate Thread
# 2. Subclass Thread
import time
from threading import Thread


def sleep_task(sleep_time):
    print("sleep {} seconds start!".format(sleep_time))
    time.sleep(sleep_time)
    print("sleep {} seconds end!".format(sleep_time))


if __name__ == "__main__":
    start_time = time.time()
    t1 = Thread(target=sleep_task, args=(2,))
    t1.setDaemon(True)  # daemon thread: exits when the main thread exits
    t1.start()
    t2 = Thread(target=sleep_task, args=(3,))
    t2.setDaemon(True)  # daemon thread: exits when the main thread exits
    t2.start()
    # With join(), the main thread would wait for t1 and t2 to finish first.
    # t1.join()
    # t2.join()
    time.sleep(1)
    # New requirement: the main thread gives t1 and t2 one second, then shuts them down.
    end_time = time.time()
    print("last_time:{}".format(end_time - start_time))
    # 1. When a program starts, a main thread is started by default.
    # 2. To make the main thread continue only after other threads finish, use join().
With the daemon flag set, both child threads stop after one second, as soon as the main thread exits.
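As a side note, setDaemon() is the legacy spelling; current Python prefers the daemon attribute or the daemon= constructor argument. A minimal equivalent sketch:

# Equivalent to t1.setDaemon(True), in the modern style:
t1 = Thread(target=sleep_task, args=(2,), daemon=True)
# or set the attribute before start():
t2 = Thread(target=sleep_task, args=(3,))
t2.daemon = True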
The class-based way: create threads by subclassing Thread; join() and setDaemon() work just as before (see the timing sketch after the code).
# Thread subclass
import time
from threading import Thread


class SleepThread(Thread):
    def __init__(self, sleep_time):
        self.sleep_time = sleep_time
        super().__init__()

    def run(self):
        print("sleep {} seconds start!".format(self.sleep_time))
        time.sleep(self.sleep_time)
        print("sleep {} seconds end!".format(self.sleep_time))


if __name__ == "__main__":
    t1 = SleepThread(2)
    t2 = SleepThread(3)
    t1.start()
    t2.start()
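Since join() works the same on a Thread subclass, the timing experiment can be repeated with SleepThread; a minimal sketch:

if __name__ == "__main__":
    start_time = time.time()
    t1 = SleepThread(2)
    t2 = SleepThread(3)
    t1.start()
    t2.start()
    t1.join()  # wait for both threads before measuring
    t2.join()
    print("last_time:{}".format(time.time() - start_time))  # roughly 3 seconds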
What is the GIL?
- GIL stands for Global Interpreter Lock. It comes from a design decision made early in Python's history for the sake of data safety: only one thread per interpreter process can execute bytecode at any moment, so multithreading on a single-core CPU is only concurrency, not parallelism.
- For a thread to run, it must first acquire the GIL. Think of the GIL as a "pass": there is exactly one per Python process, and a thread that does not hold the pass is not allowed onto the CPU. The GIL does get released, though (see the sketch below):
- released on a time-slice basis, after a fixed interval
- released when the thread performs I/O
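A minimal sketch (assuming CPython; timings are machine-dependent) that makes the GIL visible: a CPU-bound countdown takes roughly as long on two threads as on one, because only one thread can hold the GIL at a time:

import time
from threading import Thread


def count_down(n):
    while n > 0:
        n -= 1


if __name__ == "__main__":
    # Single-threaded baseline.
    start = time.time()
    count_down(20000000)
    print("single thread: {:.2f}s".format(time.time() - start))

    # Two threads splitting the same work: roughly the same total time,
    # because the GIL lets only one thread execute bytecode at a time.
    start = time.time()
    t1 = Thread(target=count_down, args=(10000000,))
    t2 = Thread(target=count_down, args=(10000000,))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    print("two threads: {:.2f}s".format(time.time() - start))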
8-4 Thread locks
# One lock: after a thread acquires it, the next thread can only run once it is released; until then it cannot get the lock.
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-12 13:51
# +--------------------------
from threading import Thread
# Thread synchronization
from threading import Lock

total = 0
total_lock = Lock()  # thread lock
# One lock: after a thread acquires it, the next thread can only run once it is released.


def add():
    total_lock.acquire()  # acquire the lock
    global total
    for i in range(1000000):
        total += 1
    total_lock.release()  # release the lock


def desc():
    total_lock.acquire()  # acquire the lock
    global total
    for i in range(1000000):
        total -= 1
    total_lock.release()  # release the lock


if __name__ == '__main__':
    add_thread = Thread(target=add)
    desc_thread = Thread(target=desc)
    add_thread.start()
    desc_thread.start()
    add_thread.join()
    desc_thread.join()
    print(total)  # always 0 with the lock in place
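Lock also works as a context manager, which guarantees the release even if the body raises. A minimal sketch of the same add()/desc() pair rewritten with the with statement (note that locking the whole loop, as above, makes the two threads run strictly one after the other; locking only the total += 1 line would let them interleave, at the cost of far more acquire/release calls):

def add():
    global total
    with total_lock:  # acquired here, released automatically on leaving the block
        for i in range(1000000):
            total += 1


def desc():
    global total
    with total_lock:
        for i in range(1000000):
            total -= 1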
8-6 Refactoring the CSDN spider with multithreading (not ideal; improved below with a thread message queue)
Threads share global variables, so we can use global lists to pass work between the threads. Code:
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-12 14:07
# +--------------------------
from threading import Thread
import re
import ast  # to turn a str into a list
import requests
from scrapy import Selector
from datetime import datetime  # to turn strings into datetimes
from urllib import parse
import time
from csdn_spider.models import *

topic_list_urls = []
topic_list = []
author_list = []
# the CSDN domain to crawl
domain = 'https://bbs.csdn.net/'


# Fetch the js file, then pull the list data out with a regex
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace('null', 'None')  # replace the JS null with Python's None
        nodes_list = ast.literal_eval(nodes_str)  # turn the str into a list
        return nodes_list
    return []


# list of urls
url_list = []


# Process the nodes and collect every url into url_list
def process_nodes_list(nodes_list):
    # pull the urls out of the json-like structure into the list
    for item in nodes_list:
        if "url" in item:
            if item['url']:
                url_list.append(item['url'])
            if 'children' in item:
                process_nodes_list(item['children'])


# Collect the top-level urls; they are removed later, because a top-level url
# is just an aggregation of the urls below it.
def get_level1_list(nodes_list):
    level1_url = []
    for item in nodes_list:
        if 'url' in item and item['url']:
            level1_url.append(item['url'])
    return level1_url


# Build the final list of urls to crawl
def get_last_urls():
    # get the nodes list
    nodes_list = get_nodes_json()
    # process it into a list of urls
    process_nodes_list(nodes_list)
    # get the level-1 urls
    level1_url = get_level1_list(nodes_list)
    # urls that survive the filtering
    last_urls = []
    # keep a url if it is in url_list but not among the level-1 urls
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    # The default CSDN listing shows unsolved topics; /recommend is featured and
    # /closed is solved. We want all three, so build each variant of the url.
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + '/recommend'))
        all_urls.append(parse.urljoin(domain, url + '/closed'))
    return all_urls


class ParseTopicAuthorThread(Thread):
    pass


class ParseTopicDetailThread(Thread):
    def run(self):
        while True:
            try:
                url = topic_list.pop()
            except IndexError:
                time.sleep(1)
                continue
            print('Fetching topic: {}'.format(url))
            # fetch the topic detail and its replies
            topic_id = url.split('/')[-1]
            res_text = requests.get(url).text
            sel = Selector(text=res_text)
            all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")  # xpath starts-with: ids beginning with 'post-'
            topic_item = all_divs[0]
            content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # content
            praised_nums = topic_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # praise count
            jtl = 0
            if topic_item.xpath(".//div[@class='close_topic']/text()").extract():
                jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]  # topic-close rate
                jtl_match = re.search(r"(\d+\.?\d+)%", jtl_str)
                if jtl_match:
                    jtl = jtl_match.group(1)
            existed_topics = Topic.select().where(Topic.id == topic_id)
            # update the topic with the fields the list page is missing
            if existed_topics:
                topic = existed_topics[0]
                topic.content = content
                topic.jtl = jtl
                topic.praised_nums = praised_nums
                topic.save()
            for answer_item in all_divs[1:]:
                answer = Answer()
                answer.topic_id = topic_id  # id of this topic
                author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
                answer.author = author_info.split('/')[-1]  # reply author id
                create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
                create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M:%S')
                answer.create_time = create_time  # reply time
                content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # content
                answer.content = content
                praised_nums = answer_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # praise count
                answer.praised_nums = int(praised_nums)
                answer.save()
            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next page we just found
                topic_list.append(next_url)


class ParseTopicListThread(Thread):
    def run(self):
        while True:
            try:
                url = topic_list_urls.pop()
            except IndexError:
                time.sleep(1)
                continue
            print('Fetching topic list page: {}'.format(url))
            res_text = requests.get(url).text  # fetch the page
            sel = Selector(text=res_text)  # build a Selector
            # on inspection, the data we want sits in the tr rows of the table
            all_trs = sel.xpath("//table[@class='forums_tab_table']/tbody//tr")
            # all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]  # this also works
            for tr in all_trs:
                topic = Topic()
                if tr.xpath(".//td[1]/span/text()").extract():
                    status = tr.xpath(".//td[1]/span/text()").extract()[0]  # status: "未结" (open), "已结" (closed), "满意" (satisfied)
                    topic.status = status
                if tr.xpath(".//td[2]/em/text()").extract():
                    score = tr.xpath(".//td[2]/em/text()").extract()[0]  # bounty points
                    topic.score = int(score)
                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract():
                    topic_url = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract()[0]  # title link, relative url
                    topic.id = int(topic_url.split('/')[-1])  # topic id
                    topic_url = parse.urljoin(domain, topic_url)  # make the url absolute for later crawling
                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract():
                    topic_title = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract()[0]  # title text
                    topic.title = topic_title
                if tr.xpath(".//td[4]/a/text()").extract():
                    author_url = tr.xpath(".//td[4]/a/@href").extract()[0]  # author link, relative url
                    author_id = author_url.split('/')[-1]  # author id
                    author_url = parse.urljoin(domain, author_url)  # author link, made absolute
                    topic.author = author_id
                    # parse the author detail page
                    # parse_author(author_url)
                if tr.xpath(".//td[4]/em/text()").extract():
                    create_time = tr.xpath(".//td[4]/em/text()").extract()[0]  # creation time, as a string
                    create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M')  # parse it into a datetime
                    topic.create_time = create_time
                if tr.xpath(".//td[5]/span/text()").extract():
                    answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]  # replies/views counts
                    answer_nums = answer_info.split('/')[0]  # reply count
                    click_nums = answer_info.split('/')[1]  # view count
                    topic.click_nums = int(click_nums)
                    topic.answer_nums = int(answer_nums)
                if tr.xpath(".//td[6]/em/text()").extract():
                    last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]  # last reply time
                    last_time = datetime.strptime(last_time_str, '%Y-%m-%d %H:%M')  # parse it into a datetime
                    topic.last_answer_time = last_time
                try:
                    existed_topics = Topic.select().where(Topic.id == topic.id)
                    if existed_topics:
                        topic.save()
                    else:
                        topic.save(force_insert=True)
                except Exception:
                    pass
                # parse the topic detail page
                # parse_topic(topic_url)
                topic_list.append(topic_url)
            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            # if there is a next page, take its link as next_url
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next list page we just found
                topic_list_urls.append(next_url)


if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        topic_list_urls.append(url)
    topic_list_thread = ParseTopicListThread()
    topic_detail_thread = ParseTopicDetailThread()
    topic_list_thread.start()
    topic_detail_thread.start()
8-7 Refactoring the CSDN spider with multithreading and Queue (a thread message queue)
from queue import Queue
With the message queue, raw global lists are no longer needed.
1. Adding data to the queue: put
- put() adds an item to the queue.
- With Queue(maxsize=2) the queue holds at most 2 items; a third put() blocks, and the code below it never runs.
- If that third put() is blocked, adding timeout=3 makes it stop blocking after 3 seconds and raise queue.Full.
- Catching queue.Full with try/except is the cleaner way to handle this.
from queue import Queue
import queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=2)  # holds at most 2 items
    # Any class or object can be put on the queue.
    message_queue.put("bobby")
    message_queue.put("bobby2")
    print("start put bobby3")
    try:
        message_queue.put("bobby3", timeout=3)  # put() blocks; timeout makes it raise queue.Full after 3s
    except queue.Full:
        pass
    print("end")
Without timeout, put() blocks forever. There is also put_nowait(): it returns immediately, raising queue.Full instead of waiting if the queue is full.
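A minimal sketch of put_nowait() against a full queue:

from queue import Queue
import queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=2)
    message_queue.put("bobby")
    message_queue.put("bobby2")
    try:
        message_queue.put_nowait("bobby3")  # full: raises immediately, no waiting
    except queue.Full:
        print("queue is full")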
2. Taking data from the queue: get() and get_nowait()
Called without arguments, get() takes one item from the head of the queue.
from queue import Queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=2)  # holds at most 2 items
    # Any class or object can be put on the queue.
    message_queue.put("bobby")
    message_queue.put("bobby2")
    message = message_queue.get()
    print(message)  # prints bobby
    message = message_queue.get()
    print(message)  # prints bobby2
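get() behaves symmetrically: on an empty queue it blocks until an item arrives, get(timeout=...) raises queue.Empty once the timeout expires, and get_nowait() raises queue.Empty immediately. A minimal sketch:

from queue import Queue
import queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=2)
    try:
        message = message_queue.get(timeout=1)  # empty: blocks up to 1s, then raises
    except queue.Empty:
        print("queue is empty")
    try:
        message = message_queue.get_nowait()  # empty: raises immediately
    except queue.Empty:
        print("still empty")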
The CSDN spider improved with a queue:
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-12 14:07
# +--------------------------
from threading import Thread
import re
import ast  # to turn a str into a list
import requests
from scrapy import Selector
from datetime import datetime  # to turn strings into datetimes
from urllib import parse
import time
from queue import Queue
from csdn_spider.models import *

topic_list_queue = Queue()
topic_queue = Queue()
author_queue = Queue()
# the CSDN domain to crawl
domain = 'https://bbs.csdn.net/'


# Fetch the js file, then pull the list data out with a regex
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace('null', 'None')  # replace the JS null with Python's None
        nodes_list = ast.literal_eval(nodes_str)  # turn the str into a list
        return nodes_list
    return []


# list of urls
url_list = []


# Process the nodes and collect every url into url_list
def process_nodes_list(nodes_list):
    # pull the urls out of the json-like structure into the list
    for item in nodes_list:
        if "url" in item:
            if item['url']:
                url_list.append(item['url'])
            if 'children' in item:
                process_nodes_list(item['children'])


# Collect the top-level urls; they are removed later, because a top-level url
# is just an aggregation of the urls below it.
def get_level1_list(nodes_list):
    level1_url = []
    for item in nodes_list:
        if 'url' in item and item['url']:
            level1_url.append(item['url'])
    return level1_url


# Build the final list of urls to crawl
def get_last_urls():
    # get the nodes list
    nodes_list = get_nodes_json()
    # process it into a list of urls
    process_nodes_list(nodes_list)
    # get the level-1 urls
    level1_url = get_level1_list(nodes_list)
    # urls that survive the filtering
    last_urls = []
    # keep a url if it is in url_list but not among the level-1 urls
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    # The default CSDN listing shows unsolved topics; /recommend is featured and
    # /closed is solved. We want all three, so build each variant of the url.
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + '/recommend'))
        all_urls.append(parse.urljoin(domain, url + '/closed'))
    return all_urls


class ParseTopicAuthorThread(Thread):
    pass


class ParseTopicDetailThread(Thread):
    def run(self):
        while True:
            url = topic_queue.get()  # blocks when empty, which is exactly what we want
            print('Fetching topic: {}'.format(url))
            # fetch the topic detail and its replies
            topic_id = url.split('/')[-1]
            res_text = requests.get(url).text
            sel = Selector(text=res_text)
            all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")  # xpath starts-with: ids beginning with 'post-'
            topic_item = all_divs[0]
            content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # content
            praised_nums = topic_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # praise count
            jtl = 0
            if topic_item.xpath(".//div[@class='close_topic']/text()").extract():
                jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]  # topic-close rate
                jtl_match = re.search(r"(\d+\.?\d+)%", jtl_str)
                if jtl_match:
                    jtl = jtl_match.group(1)
            existed_topics = Topic.select().where(Topic.id == topic_id)
            # update the topic with the fields the list page is missing
            if existed_topics:
                topic = existed_topics[0]
                topic.content = content
                topic.jtl = jtl
                topic.praised_nums = praised_nums
                topic.save()
            for answer_item in all_divs[1:]:
                answer = Answer()
                answer.topic_id = topic_id  # id of this topic
                author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
                answer.author = author_info.split('/')[-1]  # reply author id
                create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
                create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M:%S')
                answer.create_time = create_time  # reply time
                content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # content
                answer.content = content
                praised_nums = answer_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # praise count
                answer.praised_nums = int(praised_nums)
                answer.save()
            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next page we just found
                topic_queue.put(next_url)


class ParseTopicListThread(Thread):
    def run(self):
        while True:
            url = topic_list_queue.get()  # blocks when empty
            print('Fetching topic list page: {}'.format(url))
            res_text = requests.get(url).text  # fetch the page
            sel = Selector(text=res_text)  # build a Selector
            # on inspection, the data we want sits in the tr rows of the table
            all_trs = sel.xpath("//table[@class='forums_tab_table']/tbody//tr")
            # all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]  # this also works
            for tr in all_trs:
                topic = Topic()
                if tr.xpath(".//td[1]/span/text()").extract():
                    status = tr.xpath(".//td[1]/span/text()").extract()[0]  # status: "未结" (open), "已结" (closed), "满意" (satisfied)
                    topic.status = status
                if tr.xpath(".//td[2]/em/text()").extract():
                    score = tr.xpath(".//td[2]/em/text()").extract()[0]  # bounty points
                    topic.score = int(score)
                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract():
                    topic_url = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract()[0]  # title link, relative url
                    topic.id = int(topic_url.split('/')[-1])  # topic id
                    topic_url = parse.urljoin(domain, topic_url)  # make the url absolute for later crawling
                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract():
                    topic_title = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract()[0]  # title text
                    topic.title = topic_title
                if tr.xpath(".//td[4]/a/text()").extract():
                    author_url = tr.xpath(".//td[4]/a/@href").extract()[0]  # author link, relative url
                    author_id = author_url.split('/')[-1]  # author id
                    author_url = parse.urljoin(domain, author_url)  # author link, made absolute
                    topic.author = author_id
                    # parse the author detail page
                    # parse_author(author_url)
                if tr.xpath(".//td[4]/em/text()").extract():
                    create_time = tr.xpath(".//td[4]/em/text()").extract()[0]  # creation time, as a string
                    create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M')  # parse it into a datetime
                    topic.create_time = create_time
                if tr.xpath(".//td[5]/span/text()").extract():
                    answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]  # replies/views counts
                    answer_nums = answer_info.split('/')[0]  # reply count
                    click_nums = answer_info.split('/')[1]  # view count
                    topic.click_nums = int(click_nums)
                    topic.answer_nums = int(answer_nums)
                if tr.xpath(".//td[6]/em/text()").extract():
                    last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]  # last reply time
                    last_time = datetime.strptime(last_time_str, '%Y-%m-%d %H:%M')  # parse it into a datetime
                    topic.last_answer_time = last_time
                try:
                    existed_topics = Topic.select().where(Topic.id == topic.id)
                    if existed_topics:
                        topic.save()
                    else:
                        topic.save(force_insert=True)
                except Exception:
                    pass
                # parse the topic detail page
                # parse_topic(topic_url)
                topic_queue.put(topic_url)
            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            # if there is a next page, take its link as next_url
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next list page we just found
                topic_list_queue.put(next_url)


if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        topic_list_queue.put(url)
    topic_list_thread = ParseTopicListThread()
    topic_detail_thread = ParseTopicDetailThread()
    topic_list_thread.start()
    topic_detail_thread.start()
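One possible refinement, not in the original code: Queue also tracks outstanding work through task_done() and join(). A consumer calls task_done() after finishing each item, and the queue's join() blocks until every item that was put has been marked done. A minimal sketch with hypothetical names (work_queue, worker):

from queue import Queue
from threading import Thread

work_queue = Queue()


def worker():
    while True:
        item = work_queue.get()  # blocks until an item is available
        print("processing {}".format(item))
        work_queue.task_done()  # mark this item as finished


if __name__ == "__main__":
    t = Thread(target=worker, daemon=True)  # daemon: exits with the main thread
    t.start()
    for i in range(5):
        work_queue.put(i)
    work_queue.join()  # returns once all 5 items have been task_done()'d
    print("all work done")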
Thread pools