How YOLOv5 handles different media sources
YOLOv5 can consume data in many forms: images, videos, screenshots, webcams, network streams, and so on.
In short: every source is ultimately converted into images for inference; only the conversion details differ.
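Which extensions count as images versus videos is decided by two constants in utils/dataloaders.py (values as of recent YOLOv5 releases; the exact tuples may differ between versions):

IMG_FORMATS = "bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm"  # acceptable image suffixes
VID_FORMATS = "asf", "avi", "gif", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "ts", "wmv"  # acceptable video suffixes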
Image data processing
This is YOLO's core data path. The source is first wrapped in a dataset; at line 127 of detect.py:
else:
    dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)  # 640, 32, True, 1
Let's look at the LoadImages class:
class LoadImages:
    """YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`"""

    def __init__(self, path, img_size=640, stride=32, auto=True, transforms=None, vid_stride=1):
        """Initializes YOLOv5 loader for images/videos, supporting glob patterns, directories, and lists of paths."""
        if isinstance(path, str) and Path(path).suffix == ".txt":  # *.txt file with img/vid/dir on each line
            path = Path(path).read_text().rsplit()
        files = []
        # gather both image paths and video paths into `files`
        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
            p = str(Path(p).resolve())
            if "*" in p:
                files.extend(sorted(glob.glob(p, recursive=True)))  # glob
            elif os.path.isdir(p):
                files.extend(sorted(glob.glob(os.path.join(p, "*.*"))))  # dir
            elif os.path.isfile(p):
                files.append(p)  # files
            else:
                raise FileNotFoundError(f"{p} does not exist")

        images = [x for x in files if x.split(".")[-1].lower() in IMG_FORMATS]
        videos = [x for x in files if x.split(".")[-1].lower() in VID_FORMATS]
        ni, nv = len(images), len(videos)

        self.img_size = img_size  # 640
        self.stride = stride  # 32
        self.files = images + videos
        self.nf = ni + nv  # number of files
        self.video_flag = [False] * ni + [True] * nv
        self.mode = "image"
        self.auto = auto
        self.transforms = transforms  # None
        self.vid_stride = vid_stride  # 1
        if any(videos):
            self._new_video(videos[0])  # start the first video
        else:
            self.cap = None
        assert self.nf > 0, (
            f"No images or videos found in {p}. "
            f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
        )
    def __iter__(self):
        """Initializes iterator by resetting count and returns the iterator object itself."""
        self.count = 0
        return self

    def __next__(self):
        """Advances to the next file in the dataset, raising StopIteration if at the end."""
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        # ... video-handling code omitted here (shown in the next section)

        # Read image
        self.count += 1
        im0 = cv2.imread(path)  # BGR
        assert im0 is not None, f"Image Not Found {path}"
        s = f"image {self.count}/{self.nf} {path}: "

        if self.transforms:  # None by default
            im = self.transforms(im0)  # transforms
        else:
            im = letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]  # padded resize
            im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
            im = np.ascontiguousarray(im)  # contiguous

        return path, im, im0, self.cap, s

    def __len__(self):
        """Returns the number of files in the dataset."""
        return self.nf  # number of files
detect.py then loops over the dataset, implicitly calling __next__ each iteration to obtain the letterbox-processed numpy image data (a concrete letterbox example follows the snippet):
for path, im, im0s, vid_cap, s in dataset:
    with dt[0]:
        # ...
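To make the letterbox step concrete, here is a hedged standalone example (the image path and its size are assumptions); letterbox scales the image to fit img_size and pads the borders so both sides are multiples of stride:

import cv2
from utils.augmentations import letterbox

im0 = cv2.imread("data/images/bus.jpg")  # hypothetical input, e.g. 1080x810 BGR
im, ratio, (dw, dh) = letterbox(im0, 640, stride=32, auto=True)  # padded resize
print(im0.shape, "->", im.shape)  # e.g. (1080, 810, 3) -> (640, 480, 3) with auto=True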
After the loader come the model forward pass, NMS, and mapping the detections back onto the original image (a condensed sketch follows below).
Throughput: roughly 90 ms per image on CPU, about 3 ms on GPU.
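For context, a condensed sketch of that post-loader pipeline in detect.py (simplified; `model` and `device` are assumed to come from earlier in the script, the thresholds shown are YOLOv5's defaults, and scale_boxes is named scale_coords in older releases):

import torch
from utils.general import non_max_suppression, scale_boxes

for path, im, im0s, vid_cap, s in dataset:
    im = torch.from_numpy(im).to(device).float() / 255  # uint8 -> float32 in [0, 1]
    if im.ndim == 3:
        im = im[None]  # add batch dim: CHW -> 1CHW
    pred = model(im)  # forward pass
    pred = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45)  # NMS
    for det in pred:  # one tensor of detections per image in the batch
        if len(det):
            # map boxes from the letterboxed size back onto the original image
            det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0s.shape).round()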
Video data processing
Videos still go through the same LoadImages class, but the __next__ method takes a different branch. A video contains many frames, so while iterating within a single video each call to __next__ returns one frame as a numpy array, and self.count stays the same; it only advances when the video ends. (A standalone demo of the grab/retrieve striding appears after _new_video below.)
    def __next__(self):
        """Advances to the next file in the dataset, raising StopIteration if at the end."""
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        if self.video_flag[self.count]:  # the current path is a video
            # Read video
            self.mode = "video"
            for _ in range(self.vid_stride):  # frame-rate stride, e.g. vid_stride=2 keeps every other frame
                self.cap.grab()  # advance one frame; cap.read() = cap.grab() + cap.retrieve()
            ret_val, im0 = self.cap.retrieve()
            while not ret_val:  # retrieve failed, i.e. this video is finished
                self.count += 1
                self.cap.release()  # free the capture
                if self.count == self.nf:  # last video
                    raise StopIteration
                path = self.files[self.count]  # move on to the next video
                self._new_video(path)
                ret_val, im0 = self.cap.read()

            self.frame += 1
            # im0 = self._cv2_rotate(im0)  # for use if cv2 autorotation is False
            s = f"video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: "
        else:
            # Read image
            self.count += 1
            im0 = cv2.imread(path)  # BGR
            assert im0 is not None, f"Image Not Found {path}"
            s = f"image {self.count}/{self.nf} {path}: "

        if self.transforms:  # None by default
            im = self.transforms(im0)  # transforms
        else:
            im = letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]  # padded resize
            im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
            im = np.ascontiguousarray(im)  # contiguous

        return path, im, im0, self.cap, s
    def _new_video(self, path):
        """Initializes per-video state: frame counter, capture object, total frame count, orientation."""
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)  # e.g. 321 / 1
        self.orientation = int(self.cap.get(cv2.CAP_PROP_ORIENTATION_META))  # rotation degrees, e.g. 0
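The grab/retrieve striding above is easy to reproduce outside YOLOv5. A minimal sketch (the video path and stride value are assumptions):

import cv2

VID_STRIDE = 2  # assumed: keep every 2nd frame
cap = cv2.VideoCapture("video.mp4")  # hypothetical path

while True:
    for _ in range(VID_STRIDE):
        cap.grab()  # cheap: advances the decoder without fully decoding
    ok, frame = cap.retrieve()  # decode only the frame we keep
    if not ok:  # grabbed past the end of the video
        break
    print(frame.shape)  # one BGR frame per VID_STRIDE grabs
cap.release()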
Screenshot processing
Screenshots are captured with the mss library; everything downstream is handled the same way as images in LoadImages.
elif screenshot:
    dataset = LoadScreenshots(source, img_size=imgsz, stride=stride, auto=pt)
Launch it with:
python detect.py --source "screen 0 100 100 512 256"  # a 512x256 region whose top-left corner is at (100, 100) on screen 0
For details on using mss, see its documentation.
class LoadScreenshots:
    # YOLOv5 screenshot dataloader, i.e. `python detect.py --source "screen 0 100 100 512 256"`
    def __init__(self, source, img_size=640, stride=32, auto=True, transforms=None):
        """
        Initializes a screenshot dataloader for YOLOv5 with specified source region, image size, stride, auto, and
        transforms.

        Source = [screen_number left top width height] (pixels)
        """
        check_requirements("mss")
        import mss

        source, *params = source.split()
        self.screen, left, top, width, height = 0, None, None, None, None  # default to full screen 0
        if len(params) == 1:
            self.screen = int(params[0])
        elif len(params) == 4:
            left, top, width, height = (int(x) for x in params)
        elif len(params) == 5:
            self.screen, left, top, width, height = (int(x) for x in params)
        self.img_size = img_size
        self.stride = stride
        self.transforms = transforms
        self.auto = auto
        self.mode = "stream"
        self.frame = 0
        self.sct = mss.mss()

        # Parse monitor shape: mss.mss().monitors holds each monitor's geometry
        monitor = self.sct.monitors[self.screen]
        self.top = monitor["top"] if top is None else (monitor["top"] + top)
        self.left = monitor["left"] if left is None else (monitor["left"] + left)
        self.width = width or monitor["width"]
        self.height = height or monitor["height"]
        # the LTWH capture region handed to mss
        self.monitor = {"left": self.left, "top": self.top, "width": self.width, "height": self.height}
    def __iter__(self):
        """Iterates over itself, enabling use in loops and iterable contexts."""
        return self

    def __next__(self):
        """Captures and returns the next screen frame as a BGR numpy array, cropping to only the first three channels
        from BGRA.
        """
        im0 = np.array(self.sct.grab(self.monitor))[:, :, :3]  # [:, :, :3] BGRA to BGR
        s = f"screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: "

        if self.transforms:
            im = self.transforms(im0)  # transforms
        else:
            im = letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]  # padded resize
            im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
            im = np.ascontiguousarray(im)  # contiguous
        self.frame += 1
        return str(self.screen), im, im0, None, s  # screen, img, original img, im0s, s
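For reference, the bare mss calls that LoadScreenshots builds on (the region values are illustrative):

import mss
import numpy as np

with mss.mss() as sct:
    print(sct.monitors)  # [virtual full screen, monitor 1, monitor 2, ...]
    region = {"left": 100, "top": 100, "width": 512, "height": 256}  # LTWH, example values
    shot = sct.grab(region)  # BGRA ScreenShot object
    im0 = np.array(shot)[:, :, :3]  # drop the alpha channel -> BGR, exactly what __next__ does
    print(im0.shape)  # (256, 512, 3)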
Stream and webcam processing
This is the most interesting path, and its capacity depends on your compute speed and, for network streams, your bandwidth. In YOLOv5 a background thread keeps reading frames from the stream, but each new frame overwrites the previous one, so only the latest frame is ever kept; your device therefore has to process frames at least as fast as they arrive. Since one image takes about 3 ms on a GPU and streams typically run at about 30 FPS, one second of video costs only about 30 × 3 = 90 ms of compute, and 1000 ≫ 90, so in practice no frames are skipped. That guarantee can break down, though, if you enlarge the image size used by letterbox preprocessing. A minimal standalone sketch of this overwrite-latest pattern follows.
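This sketch is not YOLOv5's exact code; SOURCE and the sleep time are assumptions:

import threading
import time

import cv2

SOURCE = 0  # assumed: local webcam index, or any URL cv2 can open
cap = cv2.VideoCapture(SOURCE)
assert cap.isOpened(), f"failed to open {SOURCE}"
ok, latest = cap.read()  # guarantee a first frame, as LoadStreams does
assert ok, "no first frame"

def reader():
    """Keep overwriting `latest`; a slow consumer simply skips intermediate frames."""
    global latest
    while cap.isOpened():
        if cap.grab():  # grab is cheap; decode only what we keep
            ok, im = cap.retrieve()
            if ok:
                latest = im  # overwrite: only the newest frame survives

threading.Thread(target=reader, daemon=True).start()

for _ in range(100):  # the consumer loop always sees the newest frame
    frame = latest.copy()
    time.sleep(0.03)  # stand-in for ~30 ms of preprocessing + inference
cap.release()

Back in detect.py, the stream branch looks like this: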
if webcam:
    view_img = check_imshow(warn=True)  # verify that cv2.imshow works in this environment
    dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
    bs = len(dataset)  # batch size = number of streams
Taking a webcam as the example:
class LoadStreams:
    # YOLOv5 streamloader, i.e. `python detect.py --source 'rtsp://example.com/media.mp4'  # RTSP, RTMP, HTTP streams`
    def __init__(self, sources="file.streams", img_size=640, stride=32, auto=True, transforms=None, vid_stride=1):
        """Initializes a stream loader for processing video streams with YOLOv5, supporting various sources including
        YouTube.
        """
        torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
        self.mode = "stream"
        self.img_size = img_size  # (640, 640)
        self.stride = stride  # 32
        self.vid_stride = vid_stride  # video frame-rate stride, 1

        # there may be multiple sources; usually just one
        sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]  # ['0']
        n = len(sources)  # 1
        self.sources = [clean_str(x) for x in sources]  # clean source names for later
        self.imgs, self.fps, self.frames, self.threads = [None] * n, [0] * n, [0] * n, [None] * n
        for i, s in enumerate(sources):  # index, source
            # Start thread to read frames from video stream
            st = f"{i + 1}/{n}: {s}... "
            # YouTube sources get special handling
            if urlparse(s).hostname in ("www.youtube.com", "youtube.com", "youtu.be"):  # if source is YouTube video
                # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/LNwODJXcvt4'
                check_requirements(("pafy", "youtube_dl==2020.12.2"))
                import pafy

                s = pafy.new(s).getbest(preftype="mp4").url  # YouTube URL
            # local webcam
            s = eval(s) if s.isnumeric() else s  # i.e. s = '0' local webcam
            if s == 0:
                # local webcams are unavailable on Colab and Kaggle
                assert not is_colab(), "--source 0 webcam unsupported on Colab. Rerun command in a local environment."
                assert not is_kaggle(), "--source 0 webcam unsupported on Kaggle. Rerun command in a local environment."
            # open the source with OpenCV
            cap = cv2.VideoCapture(s)
            assert cap.isOpened(), f"{st}Failed to open {s}"
            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))  # e.g. 640
            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # e.g. 480
            fps = cap.get(cv2.CAP_PROP_FPS)  # warning: may return 0 or nan
            # per-source state lives in arrays indexed by [i]
            self.frames[i] = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float("inf")  # infinite stream fallback
            self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30  # 30 FPS fallback
            _, self.imgs[i] = cap.read()  # guarantee first frame
            # one daemon thread per stream keeps reading frames
            self.threads[i] = Thread(target=self.update, args=([i, cap, s]), daemon=True)
            LOGGER.info(f"{st} Success ({self.frames[i]} frames {w}x{h} at {self.fps[i]:.2f} FPS)")
            # start the reader thread
            self.threads[i].start()
        LOGGER.info("")  # newline

        # check for common shapes: if all first frames letterbox to the same shape, rect inference can be used
        s = np.stack([letterbox(x, img_size, stride=stride, auto=auto)[0].shape for x in self.imgs])
        self.rect = np.unique(s, axis=0).shape[0] == 1  # rect inference if all shapes equal
        self.auto = auto and self.rect
        self.transforms = transforms  # optional
        if not self.rect:
            LOGGER.warning("WARNING ⚠️ Stream shapes differ. For optimal performance supply similarly-shaped streams.")
    def update(self, i, cap, stream):
        """
        Reader-thread loop that fetches stream frames in (near) real time.

        :param i: index of the stream source
        :param cap: the cv2.VideoCapture object for this source
        :param stream: the stream URL, or 0 for a local webcam
        """
        n, f = 0, self.frames[i]  # frame counter, total frame count
        while cap.isOpened() and n < f:
            n += 1
            cap.grab()  # .read() = .grab() followed by .retrieve()
            if n % self.vid_stride == 0:
                success, im = cap.retrieve()
                if success:
                    self.imgs[i] = im  # overwrite: only the latest frame is kept
                else:
                    LOGGER.warning("WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.")
                    self.imgs[i] = np.zeros_like(self.imgs[i])
                    cap.open(stream)  # re-open stream if signal was lost
            time.sleep(0.0)  # wait time
    def __iter__(self):
        """Resets and returns the iterator for iterating over video frames or images in a dataset."""
        self.count = -1
        return self

    def __next__(self):
        """Iterates over video frames or images, halting on thread stop or 'q' key press, raising `StopIteration` when
        done.
        """
        self.count += 1
        # press q to quit
        if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord("q"):  # q to quit
            cv2.destroyAllWindows()
            raise StopIteration

        im0 = self.imgs.copy()
        if self.transforms:  # None by default
            im = np.stack([self.transforms(x) for x in im0])  # transforms
        else:
            # preprocess the latest frames grabbed by the reader threads
            im = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0] for x in im0])  # resize
            im = im[..., ::-1].transpose((0, 3, 1, 2))  # BGR to RGB, BHWC to BCHW
            im = np.ascontiguousarray(im)  # contiguous

        return self.sources, im, im0, None, ""
    def __len__(self):
        """Returns the number of sources in the dataset, supporting up to 32 streams at 30 FPS over 30 years."""
        return len(self.sources)  # 1E12 frames = 32 streams at 30 FPS for 30 years
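A quick consumption sketch (assuming a local webcam at index 0): each iteration yields one batch containing the latest frame from every stream, which is why bs = len(dataset) above becomes the batch size:

dataset = LoadStreams("0", img_size=640, stride=32, auto=True)
for sources, im, im0s, _, _ in dataset:
    print(im.shape)  # (n_streams, 3, H, W), ready for the model
    print(im0s[0].shape)  # original BGR frame of the first stream
    break  # sketch only; in detect.py this loop runs until 'q' is pressed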