How YOLOv5 handles different media sources
YOLOv5 can consume data in many forms: images, videos, screenshots, webcams, network streams, and so on.
In short: every source is ultimately converted into images for inference; only the conversion details differ.
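Which extensions count as images versus videos is decided by two constants in utils/dataloaders.py (values as of recent YOLOv5 releases; the exact tuples may differ between versions):

IMG_FORMATS = "bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm"  # acceptable image suffixes
VID_FORMATS = "asf", "avi", "gif", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "ts", "wmv"  # acceptable video suffixes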
Image data processing
This is YOLO's core data path. The source is first wrapped in a dataset; at line 127 of detect.py:
else:
    dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)  # 640, 32, True, 1
Let's look at the LoadImages class:
class LoadImages:
    """YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`"""

    def __init__(self, path, img_size=640, stride=32, auto=True, transforms=None, vid_stride=1):
        """Initializes YOLOv5 loader for images/videos, supporting glob patterns, directories, and lists of paths."""
        if isinstance(path, str) and Path(path).suffix == ".txt":  # *.txt file with img/vid/dir on each line
            path = Path(path).read_text().rsplit()
        files = []
        # gather both image paths and video paths into `files`
        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
            p = str(Path(p).resolve())
            if "*" in p:
                files.extend(sorted(glob.glob(p, recursive=True)))  # glob
            elif os.path.isdir(p):
                files.extend(sorted(glob.glob(os.path.join(p, "*.*"))))  # dir
            elif os.path.isfile(p):
                files.append(p)  # files
            else:
                raise FileNotFoundError(f"{p} does not exist")

        images = [x for x in files if x.split(".")[-1].lower() in IMG_FORMATS]
        videos = [x for x in files if x.split(".")[-1].lower() in VID_FORMATS]
        ni, nv = len(images), len(videos)

        self.img_size = img_size  # 640
        self.stride = stride  # 32
        self.files = images + videos
        self.nf = ni + nv  # number of files
        self.video_flag = [False] * ni + [True] * nv
        self.mode = "image"
        self.auto = auto
        self.transforms = transforms  # None
        self.vid_stride = vid_stride  # 1
        if any(videos):
            self._new_video(videos[0])  # start the first video
        else:
            self.cap = None
        assert self.nf > 0, (
            f"No images or videos found in {p}. "
            f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
        )
    def __iter__(self):
        """Initializes iterator by resetting count and returns the iterator object itself."""
        self.count = 0
        return self

    def __next__(self):
        """Advances to the next file in the dataset, raising StopIteration if at the end."""
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        # ... video-handling code omitted here (shown in the next section)

        # Read image
        self.count += 1
        im0 = cv2.imread(path)  # BGR
        assert im0 is not None, f"Image Not Found {path}"
        s = f"image {self.count}/{self.nf} {path}: "

        if self.transforms:  # None by default
            im = self.transforms(im0)  # transforms
        else:
            im = letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]  # padded resize
            im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
            im = np.ascontiguousarray(im)  # contiguous

        return path, im, im0, self.cap, s

    def __len__(self):
        """Returns the number of files in the dataset."""
        return self.nf  # number of files
detect.py then loops over the dataset, implicitly calling __next__ each iteration to obtain the letterbox-processed numpy image data (a concrete letterbox example follows the snippet):
for path, im, im0s, vid_cap, s in dataset:
    with dt[0]:
        # ...
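To make the letterbox step concrete, here is a hedged standalone example (the image path and its size are assumptions); letterbox scales the image to fit img_size and pads the borders so both sides are multiples of stride:

import cv2
from utils.augmentations import letterbox

im0 = cv2.imread("data/images/bus.jpg")  # hypothetical input, e.g. 1080x810 BGR
im, ratio, (dw, dh) = letterbox(im0, 640, stride=32, auto=True)  # padded resize
print(im0.shape, "->", im.shape)  # e.g. (1080, 810, 3) -> (640, 480, 3) with auto=True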
After the loader come the model forward pass, NMS, and mapping the detections back onto the original image (a condensed sketch follows below).
Throughput: roughly 90 ms per image on CPU, about 3 ms on GPU.
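For context, a condensed sketch of that post-loader pipeline in detect.py (simplified; `model` and `device` are assumed to come from earlier in the script, the thresholds shown are YOLOv5's defaults, and scale_boxes is named scale_coords in older releases):

import torch
from utils.general import non_max_suppression, scale_boxes

for path, im, im0s, vid_cap, s in dataset:
    im = torch.from_numpy(im).to(device).float() / 255  # uint8 -> float32 in [0, 1]
    if im.ndim == 3:
        im = im[None]  # add batch dim: CHW -> 1CHW
    pred = model(im)  # forward pass
    pred = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45)  # NMS
    for det in pred:  # one tensor of detections per image in the batch
        if len(det):
            # map boxes from the letterboxed size back onto the original image
            det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0s.shape).round()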
Video data processing
Videos still go through the same LoadImages class, but the __next__ method takes a different branch. A video contains many frames, so while iterating within a single video each call to __next__ returns one frame as a numpy array, and self.count stays the same; it only advances when the video ends. (A standalone demo of the grab/retrieve striding appears after _new_video below.)
    def __next__(self):
        """Advances to the next file in the dataset, raising StopIteration if at the end."""
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        if self.video_flag[self.count]:  # the current path is a video
            # Read video
            self.mode = "video"
            for _ in range(self.vid_stride):  # frame-rate stride, e.g. vid_stride=2 keeps every other frame
                self.cap.grab()  # advance one frame; cap.read() = cap.grab() + cap.retrieve()
            ret_val, im0 = self.cap.retrieve()
            while not ret_val:  # retrieve failed, i.e. this video is finished
                self.count += 1
                self.cap.release()  # free the capture
                if self.count == self.nf:  # last video
                    raise StopIteration
                path = self.files[self.count]  # move on to the next video
                self._new_video(path)
                ret_val, im0 = self.cap.read()

            self.frame += 1
            # im0 = self._cv2_rotate(im0)  # for use if cv2 autorotation is False
            s = f"video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: "
        else:
            # Read image
            self.count += 1
            im0 = cv2.imread(path)  # BGR
            assert im0 is not None, f"Image Not Found {path}"
            s = f"image {self.count}/{self.nf} {path}: "

        if self.transforms:  # None by default
            im = self.transforms(im0)  # transforms
        else:
            im = letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]  # padded resize
            im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
            im = np.ascontiguousarray(im)  # contiguous

        return path, im, im0, self.cap, s
    def _new_video(self, path):
        """Initializes per-video state: frame counter, capture object, total frame count, orientation."""
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)  # e.g. 321 / 1
        self.orientation = int(self.cap.get(cv2.CAP_PROP_ORIENTATION_META))  # rotation degrees, e.g. 0
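The grab/retrieve striding above is easy to reproduce outside YOLOv5. A minimal sketch (the video path and stride value are assumptions):

import cv2

VID_STRIDE = 2  # assumed: keep every 2nd frame
cap = cv2.VideoCapture("video.mp4")  # hypothetical path

while True:
    for _ in range(VID_STRIDE):
        cap.grab()  # cheap: advances the decoder without fully decoding
    ok, frame = cap.retrieve()  # decode only the frame we keep
    if not ok:  # grabbed past the end of the video
        break
    print(frame.shape)  # one BGR frame per VID_STRIDE grabs
cap.release()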
Screenshot processing
Screenshots are captured with the mss library; everything downstream is handled the same way as images in LoadImages.
elif screenshot:
    dataset = LoadScreenshots(source, img_size=imgsz, stride=stride, auto=pt)
Launch it with:
python detect.py --source "screen 0 100 100 512 256"  # a 512x256 region whose top-left corner is at (100, 100) on screen 0
For details on using mss, see its documentation.
class LoadScreenshots:
    # YOLOv5 screenshot dataloader, i.e. `python detect.py --source "screen 0 100 100 512 256"`
    def __init__(self, source, img_size=640, stride=32, auto=True, transforms=None):
        """
        Initializes a screenshot dataloader for YOLOv5 with specified source region, image size, stride, auto, and
        transforms.

        Source = [screen_number left top width height] (pixels)
        """
        check_requirements("mss")
        import mss

        source, *params = source.split()
        self.screen, left, top, width, height = 0, None, None, None, None  # default to full screen 0
        if len(params) == 1:
            self.screen = int(params[0])
        elif len(params) == 4:
            left, top, width, height = (int(x) for x in params)
        elif len(params) == 5:
            self.screen, left, top, width, height = (int(x) for x in params)
        self.img_size = img_size
        self.stride = stride
        self.transforms = transforms
        self.auto = auto
        self.mode = "stream"
        self.frame = 0
        self.sct = mss.mss()

        # Parse monitor shape: mss.mss().monitors holds each monitor's geometry
        monitor = self.sct.monitors[self.screen]
        self.top = monitor["top"] if top is None else (monitor["top"] + top)
        self.left = monitor["left"] if left is None else (monitor["left"] + left)
        self.width = width or monitor["width"]
        self.height = height or monitor["height"]
        # the LTWH capture region handed to mss
        self.monitor = {"left": self.left, "top": self.top, "width": self.width, "height": self.height}
    def __iter__(self):
        """Iterates over itself, enabling use in loops and iterable contexts."""
        return self

    def __next__(self):
        """Captures and returns the next screen frame as a BGR numpy array, cropping to only the first three channels
        from BGRA.
        """
        im0 = np.array(self.sct.grab(self.monitor))[:, :, :3]  # [:, :, :3] BGRA to BGR
        s = f"screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: "

        if self.transforms:
            im = self.transforms(im0)  # transforms
        else:
            im = letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]  # padded resize
            im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
            im = np.ascontiguousarray(im)  # contiguous
        self.frame += 1
        return str(self.screen), im, im0, None, s  # screen, img, original img, im0s, s
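For reference, the bare mss calls that LoadScreenshots builds on (the region values are illustrative):

import mss
import numpy as np

with mss.mss() as sct:
    print(sct.monitors)  # [virtual full screen, monitor 1, monitor 2, ...]
    region = {"left": 100, "top": 100, "width": 512, "height": 256}  # LTWH, example values
    shot = sct.grab(region)  # BGRA ScreenShot object
    im0 = np.array(shot)[:, :, :3]  # drop the alpha channel -> BGR, exactly what __next__ does
    print(im0.shape)  # (256, 512, 3)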
Stream and webcam processing
This is the most interesting path, and its capacity depends on your compute speed and, for network streams, your bandwidth. In YOLOv5 a background thread keeps reading frames from the stream, but each new frame overwrites the previous one, so only the latest frame is ever kept; your device therefore has to process frames at least as fast as they arrive. Since one image takes about 3 ms on a GPU and streams typically run at about 30 FPS, one second of video costs only about 30 × 3 = 90 ms of compute, and 1000 ≫ 90, so in practice no frames are skipped. That guarantee can break down, though, if you enlarge the image size used by letterbox preprocessing. A minimal standalone sketch of this overwrite-latest pattern follows.
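This sketch is not YOLOv5's exact code; SOURCE and the sleep time are assumptions:

import threading
import time

import cv2

SOURCE = 0  # assumed: local webcam index, or any URL cv2 can open
cap = cv2.VideoCapture(SOURCE)
assert cap.isOpened(), f"failed to open {SOURCE}"
ok, latest = cap.read()  # guarantee a first frame, as LoadStreams does
assert ok, "no first frame"

def reader():
    """Keep overwriting `latest`; a slow consumer simply skips intermediate frames."""
    global latest
    while cap.isOpened():
        if cap.grab():  # grab is cheap; decode only what we keep
            ok, im = cap.retrieve()
            if ok:
                latest = im  # overwrite: only the newest frame survives

threading.Thread(target=reader, daemon=True).start()

for _ in range(100):  # the consumer loop always sees the newest frame
    frame = latest.copy()
    time.sleep(0.03)  # stand-in for ~30 ms of preprocessing + inference
cap.release()

Back in detect.py, the stream branch looks like this: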
if webcam:
    view_img = check_imshow(warn=True)  # verify that cv2.imshow works in this environment
    dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
    bs = len(dataset)  # batch size = number of streams
Taking a webcam as the example:
class LoadStreams:
    # YOLOv5 streamloader, i.e. `python detect.py --source 'rtsp://example.com/media.mp4'  # RTSP, RTMP, HTTP streams`
    def __init__(self, sources="file.streams", img_size=640, stride=32, auto=True, transforms=None, vid_stride=1):
        """Initializes a stream loader for processing video streams with YOLOv5, supporting various sources including
        YouTube.
        """
        torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
        self.mode = "stream"
        self.img_size = img_size  # (640, 640)
        self.stride = stride  # 32
        self.vid_stride = vid_stride  # video frame-rate stride, 1

        # there may be multiple sources; usually just one
        sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]  # ['0']
        n = len(sources)  # 1
        self.sources = [clean_str(x) for x in sources]  # clean source names for later
        self.imgs, self.fps, self.frames, self.threads = [None] * n, [0] * n, [0] * n, [None] * n
        for i, s in enumerate(sources):  # index, source
            # Start thread to read frames from video stream
            st = f"{i + 1}/{n}: {s}... "
            # YouTube sources get special handling
            if urlparse(s).hostname in ("www.youtube.com", "youtube.com", "youtu.be"):  # if source is YouTube video
                # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/LNwODJXcvt4'
                check_requirements(("pafy", "youtube_dl==2020.12.2"))
                import pafy

                s = pafy.new(s).getbest(preftype="mp4").url  # YouTube URL
            # local webcam
            s = eval(s) if s.isnumeric() else s  # i.e. s = '0' local webcam
            if s == 0:
                # local webcams are unavailable on Colab and Kaggle
                assert not is_colab(), "--source 0 webcam unsupported on Colab. Rerun command in a local environment."
                assert not is_kaggle(), "--source 0 webcam unsupported on Kaggle. Rerun command in a local environment."
            # open the source with OpenCV
            cap = cv2.VideoCapture(s)
            assert cap.isOpened(), f"{st}Failed to open {s}"
            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))  # e.g. 640
            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # e.g. 480
            fps = cap.get(cv2.CAP_PROP_FPS)  # warning: may return 0 or nan
            # per-source state lives in arrays indexed by [i]
            self.frames[i] = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float("inf")  # infinite stream fallback
            self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30  # 30 FPS fallback
            _, self.imgs[i] = cap.read()  # guarantee first frame
            # one daemon thread per stream keeps reading frames
            self.threads[i] = Thread(target=self.update, args=([i, cap, s]), daemon=True)
            LOGGER.info(f"{st} Success ({self.frames[i]} frames {w}x{h} at {self.fps[i]:.2f} FPS)")
            # start the reader thread
            self.threads[i].start()
        LOGGER.info("")  # newline

        # check for common shapes: if all first frames letterbox to the same shape, rect inference can be used
        s = np.stack([letterbox(x, img_size, stride=stride, auto=auto)[0].shape for x in self.imgs])
        self.rect = np.unique(s, axis=0).shape[0] == 1  # rect inference if all shapes equal
        self.auto = auto and self.rect
        self.transforms = transforms  # optional
        if not self.rect:
            LOGGER.warning("WARNING ⚠️ Stream shapes differ. For optimal performance supply similarly-shaped streams.")
    def update(self, i, cap, stream):
        """
        Reader-thread loop that fetches stream frames in (near) real time.

        :param i: index of the stream source
        :param cap: the cv2.VideoCapture object for this source
        :param stream: the stream URL, or 0 for a local webcam
        """
        n, f = 0, self.frames[i]  # frame counter, total frame count
        while cap.isOpened() and n < f:
            n += 1
            cap.grab()  # .read() = .grab() followed by .retrieve()
            if n % self.vid_stride == 0:
                success, im = cap.retrieve()
                if success:
                    self.imgs[i] = im  # overwrite: only the latest frame is kept
                else:
                    LOGGER.warning("WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.")
                    self.imgs[i] = np.zeros_like(self.imgs[i])
                    cap.open(stream)  # re-open stream if signal was lost
            time.sleep(0.0)  # wait time
    def __iter__(self):
        """Resets and returns the iterator for iterating over video frames or images in a dataset."""
        self.count = -1
        return self

    def __next__(self):
        """Iterates over video frames or images, halting on thread stop or 'q' key press, raising `StopIteration` when
        done.
        """
        self.count += 1
        # press q to quit
        if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord("q"):  # q to quit
            cv2.destroyAllWindows()
            raise StopIteration

        im0 = self.imgs.copy()
        if self.transforms:  # None by default
            im = np.stack([self.transforms(x) for x in im0])  # transforms
        else:
            # preprocess the latest frames grabbed by the reader threads
            im = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0] for x in im0])  # resize
            im = im[..., ::-1].transpose((0, 3, 1, 2))  # BGR to RGB, BHWC to BCHW
            im = np.ascontiguousarray(im)  # contiguous

        return self.sources, im, im0, None, ""
    def __len__(self):
        """Returns the number of sources in the dataset, supporting up to 32 streams at 30 FPS over 30 years."""
        return len(self.sources)  # 1E12 frames = 32 streams at 30 FPS for 30 years
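A quick consumption sketch (assuming a local webcam at index 0): each iteration yields one batch containing the latest frame from every stream, which is why bs = len(dataset) above becomes the batch size:

dataset = LoadStreams("0", img_size=640, stride=32, auto=True)
for sources, im, im0s, _, _ in dataset:
    print(im.shape)  # (n_streams, 3, H, W), ready for the model
    print(im0s[0].shape)  # original BGR frame of the first stream
    break  # sketch only; in detect.py this loop runs until 'q' is pressed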