Preface

This post documents a PyQt5 GUI that displays human pose / action recognition. Environment: Windows 11 + PyCharm 2023 + torch 2.2.1 + CUDA 11.8, PyQt version 5.15.10.
OpenPose (with its Python API) must be configured first; see [https://blog.csdn.net/weixin_44003104/article/details/132685437?spm=1001.2014.3001.5506].
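Before wiring up the GUI, it is worth confirming that the interpreter used by PyCharm really sees the GPU build of torch and the expected PyQt version. A minimal, project-independent check (the expected values in the comments simply mirror the environment listed above):

```python
import torch
from PyQt5.QtCore import PYQT_VERSION_STR

print("torch:", torch.__version__)           # expect 2.2.1
print("CUDA available:", torch.cuda.is_available())
print("CUDA build:", torch.version.cuda)     # expect 11.8
print("PyQt:", PYQT_VERSION_STR)             # expect 5.15.10
```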
I. Main steps

1. Draw the UI in Qt Designer.
2. Convert the .ui file to a .py file: on the command line run `pyuic5 -o firstMainWin.py firstMainwin.ui`.
3. Connect the play and recognition buttons to their slot functions (a minimal sketch follows this list).
4. Adapt the demo_offline part of st-gcn as the recognition algorithm.
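As a reference for steps 2 and 3, a stripped-down sketch of how the pyuic5-generated class is used and how a button is bound to a slot could look like this. The module name `firstMainWin`, the class `Ui_MainWindow`, the button name `pushButton` and the slot are placeholders for whatever your own .ui file defines:

```python
import sys
from PyQt5.QtWidgets import QApplication, QMainWindow
from firstMainWin import Ui_MainWindow   # module produced by pyuic5 (placeholder name)


class MainWindow(Ui_MainWindow, QMainWindow):
    def __init__(self):
        super().__init__()
        self.setupUi(self)                              # build the widgets defined in the .ui layout
        self.pushButton.clicked.connect(self.on_play)   # step 3: bind a button to a slot

    def on_play(self):
        print("play clicked")


if __name__ == '__main__':
    app = QApplication(sys.argv)
    win = MainWindow()
    win.show()
    sys.exit(app.exec_())
```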
II. Code

1. Complete code (imports, main window class, and pose tracker)

The code is as follows (example):
```python
from PyQt5.QtWidgets import *
from PyQt5.QtMultimedia import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtMultimediaWidgets import QVideoWidget
from test01 import Ui_MainWindow     # UI class generated by pyuic5
import sys
import time
import cv2
import numpy as np
import torch
from net.st_gcn import Model         # ST-GCN model definition


class myMainWindow(Ui_MainWindow, QMainWindow):
    def __init__(self):
        super().__init__()
        self.setupUi(self)
        # self.sld_video_pressed = False        # whether the progress slider is currently being pressed
        # self.videoFullScreen = False          # whether the video widget is in full screen
        # self.videoFullScreenWidget = myVideoWidget()   # widget used for full-screen playback

        # player for the source video
        self.player = QMediaPlayer()
        self.player.setVideoOutput(self.widget)                 # widget that displays the source video
        self.pushButton_2.clicked.connect(self.openVideoFile)   # open a video file
        self.pushButton_4.clicked.connect(self.playVideo)       # play
        self.pushButton.clicked.connect(self.pauseVideo)        # pause

        # player for the skeleton visualization
        self.player2 = QMediaPlayer()
        self.player2.setVideoOutput(self.widget_2)
        # player for the recognition-result visualization
        self.player3 = QMediaPlayer()
        self.player3.setVideoOutput(self.widget_3)

        self.pushButton_3.clicked.connect(self.Bone_recognition)
        self.pushButton_5.clicked.connect(self.rgb_result_recognition)

        # build the ST-GCN model (3 input channels, 400 Kinetics classes, OpenPose graph)
        graph_args = {'layout': 'openpose', 'strategy': 'spatial'}
        self.model = Model(3, 400, graph_args, True)
        self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.dev)

        # load the Kinetics label names
        label_name_path = './resource/kinetics_skeleton/label_name.txt'
        with open(label_name_path) as f:
            label_name = f.readlines()
            label_name = [line.rstrip() for line in label_name]
            self.label_name = label_name
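        # NOTE: self.model is randomly initialized at this point. For meaningful predictions,
        # the pretrained Kinetics checkpoint from the st-gcn repo should be loaded; the path
        # below is only an assumption -- point it at your own weight file:
        # self.model.load_state_dict(torch.load('path/to/st_gcn_kinetics_weights.pt', map_location=self.dev))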
        # print(self.model)
        # Optional hooks kept from the PyQt_Video_Demo template; the matching slots are defined below.
        # self.btn_cast.clicked.connect(self.castVideo)                # video screenshot
        # self.player.positionChanged.connect(self.changeSlide)        # update the progress slider
        # self.videoFullScreenWidget.doubleClickedItem.connect(self.videoDoubleClicked)  # double-click to toggle full screen
        # self.wgt_video.doubleClickedItem.connect(self.videoDoubleClicked)              # double-click to toggle full screen
        # self.sld_video.setTracking(False)
        # self.sld_video.sliderReleased.connect(self.releaseSlider)
        # self.sld_video.sliderPressed.connect(self.pressSlider)
        # self.sld_video.sliderMoved.connect(self.moveSlider)          # drag the slider to seek
        # self.sld_video.ClickedValue.connect(self.clickedSlider)      # click the slider to seek
        # self.sld_audio.valueChanged.connect(self.volumeChange)       # volume control
        # self.btn_cast.hide()

    def Bone_recognition(self):
        """Run pose estimation and play the rendered skeleton video in widget_2."""
        video, data_numpy = self.pose_estimation()
        # self-links plus the limb pairs of the 18-joint OpenPose (COCO) layout used by ST-GCN
        edge = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9),
                (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 15), (16, 16), (17, 17),
                (4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), (10, 9), (9, 8), (11, 5),
                (8, 2), (5, 1), (2, 1), (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)]
        images = list(self.stgcn_visualize(data_numpy, edge, video))

        # write the rendered frames to a temporary video file
        height, width, _ = images[0].shape
        output_file_path = 'test.mp4'
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')            # codec; change if needed
        out = cv2.VideoWriter(output_file_path, fourcc, 5.0, (width, height))  # last argument is the frame rate
        for image in images:
            out.write(image)
        out.release()

        # play the generated file with QMediaPlayer
        media_content = QMediaContent(QUrl.fromLocalFile(output_file_path))
        self.player2.setMedia(media_content)
        self.player2.play()

    def rgb_result_recognition(self):
        """Run pose estimation, classify the sequence with ST-GCN and play the result in widget_3."""
        video, data_numpy = self.pose_estimation()
        edge = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9),
                (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 15), (16, 16), (17, 17),
                (4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), (10, 9), (9, 8), (11, 5),
                (8, 2), (5, 1), (2, 1), (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)]
        print(self.model.graph.edge)

        data = torch.from_numpy(data_numpy)        # convert the NumPy skeleton array to a torch tensor
        data = data.unsqueeze(0)                   # add the batch dimension
        data = data.float().to(self.dev).detach()  # (1, channel, frame, joint, person)

        # model predict
        voting_label_name, video_label_name, output, intensity = self.predict(data)
        images = list(self.stgcn_rgb_visualize(data_numpy, edge, intensity, video, voting_label_name))

        # write the rendered frames to a temporary video file
        height, width, _ = images[0].shape
        output_file_path = 'test1.mp4'
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')            # codec; change if needed
        out = cv2.VideoWriter(output_file_path, fourcc, 5.0, (width, height))  # last argument is the frame rate
        for image in images:
            image = image.astype(np.uint8)
            out.write(image)
        out.release()

        # play the generated file with QMediaPlayer
        media_content = QMediaContent(QUrl.fromLocalFile(output_file_path))
        self.player3.setMedia(media_content)
        self.player3.play()
        self.player3.setPlaybackRate(1.0)          # 1.0 means normal speed

    def predict(self, data):
        # forward
        output, feature = self.model.extract_feature(data)
        output = output[0]
        feature = feature[0]
        intensity = (feature * feature).sum(dim=0) ** 0.5
        intensity = intensity.cpu().detach().numpy()

        # get result
        # classification result of the full sequence
        voting_label = output.sum(dim=3).sum(dim=2).sum(dim=1).argmax(dim=0)
        voting_label_name = self.label_name[voting_label]

        # classification result for each person of the latest frame
        num_person = data.size(4)
        latest_frame_label = [output[:, :, :, m].sum(dim=2)[:, -1].argmax(dim=0)
                              for m in range(num_person)]
        latest_frame_label_name = [self.label_name[l] for l in latest_frame_label]

        # classification result for each person of each frame
        num_person = output.size(3)
        num_frame = output.size(1)
        video_label_name = list()
        for t in range(num_frame):
            frame_label_name = list()
            for m in range(num_person):
                person_label = output[:, t, :, m].sum(dim=1).argmax(dim=0)
                person_label_name = self.label_name[person_label]
                frame_label_name.append(person_label_name)
            video_label_name.append(frame_label_name)
        return voting_label_name, video_label_name, output, intensity

    def put_text(self, img, text, position, scale_factor=1):
        t_w, t_h = cv2.getTextSize(
            text, cv2.FONT_HERSHEY_TRIPLEX, scale_factor, thickness=1)[0]
        H, W, _ = img.shape
        position = (int(W * position[1] - t_w * 0.5),
                    int(H * position[0] - t_h * 0.5))
        params = (position, cv2.FONT_HERSHEY_TRIPLEX, scale_factor,
                  (255, 255, 255))
        cv2.putText(img, text, *params)

    def castVideo(self):
        screen = QGuiApplication.primaryScreen()
        cast_jpg = './' + QDateTime.currentDateTime().toString("yyyy-MM-dd hh-mm-ss-zzz") + '.jpg'
        screen.grabWindow(self.wgt_video.winId()).save(cast_jpg)

    def volumeChange(self, position):
        volume = round(position / self.sld_audio.maximum() * 100)
        print("volume %f" % volume)
        self.player.setVolume(volume)
        self.lab_audio.setText("volume:" + str(volume) + "%")

    def clickedSlider(self, position):
        if self.player.duration() > 0:  # seeking is only allowed once playback has started
            video_position = int((position / 100) * self.player.duration())
            self.player.setPosition(video_position)
            self.lab_video.setText("%.2f%%" % position)
        else:
            self.sld_video.setValue(0)

    def moveSlider(self, position):
        self.sld_video_pressed = True
        if self.player.duration() > 0:  # seeking is only allowed once playback has started
            video_position = int((position / 100) * self.player.duration())
            self.player.setPosition(video_position)
            self.lab_video.setText("%.2f%%" % position)

    def pressSlider(self):
        self.sld_video_pressed = True
        print("pressed")

    def releaseSlider(self):
        self.sld_video_pressed = False

    def changeSlide(self, position):
        if not self.sld_video_pressed:  # do not update while the slider is being dragged
            self.videoLength = self.player.duration() + 0.1
            self.sld_video.setValue(round((position / self.videoLength) * 100))
            self.lab_video.setText("%.2f%%" % ((position / self.videoLength) * 100))

    def openVideoFile(self):
        self.player.setMedia(QMediaContent(QFileDialog.getOpenFileUrl()[0]))  # choose a video file
        self.player.play()
        print(self.player.availableMetaData())

    def playVideo(self):
        self.player.play()

    def pauseVideo(self):
        self.player.pause()

    def videoDoubleClicked(self, text):
        if self.player.duration() > 0:  # full screen is only allowed once playback has started
            if self.videoFullScreen:
                self.player.setVideoOutput(self.wgt_video)
                self.videoFullScreenWidget.hide()
                self.videoFullScreen = False
            else:
                self.videoFullScreenWidget.show()
                self.player.setVideoOutput(self.videoFullScreenWidget)
                self.videoFullScreenWidget.setFullScreen(1)
                self.videoFullScreen = True

    def pose_estimation(self):
        # load the OpenPose Python API
        try:
            sys.path.append('E:/work/openpose-master/build/python/openpose/Release')
            import pyopenpose as op
        except ImportError:
            print('Can not find the OpenPose Python API.')
            return

        # initialize the OpenPose wrapper
        opWrapper = op.WrapperPython()
        params = dict(model_folder='E:/work/qt/PyQt_Video_Demo-master/PyQt_Video_Demo-master/models',
                      model_pose='COCO')
        opWrapper.configure(params)
        opWrapper.start()

        # put the ST-GCN model into evaluation mode
        self.model.eval()

        # open the input video (hard-coded path) and get its length
        video_capture = cv2.VideoCapture('E:/work/qt/rgb/play1.mp4')
        video_length = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        pose_tracker = naive_pose_tracker(data_frame=video_length)

        # pose estimation
        start_time = time.time()
        frame_index = 0
        video = list()
        while True:
            # get image
            ret, orig_image = video_capture.read()
            if orig_image is None:
                break
            source_H, source_W, _ = orig_image.shape
            orig_image = cv2.resize(
                orig_image, (256 * source_W // source_H, 256))
            H, W, _ = orig_image.shape
            video.append(orig_image)

            # pose estimation
            datum = op.Datum()
            datum.cvInputData = orig_image
            opWrapper.emplaceAndPop(op.VectorDatum([datum]))
            multi_pose = datum.poseKeypoints  # (num_person, num_joint, 3)
            if multi_pose is None or len(multi_pose.shape) != 3:
                continue

            # normalization
            multi_pose[:, :, 0] = multi_pose[:, :, 0] / W
            multi_pose[:, :, 1] = multi_pose[:, :, 1] / H
            multi_pose[:, :, 0:2] = multi_pose[:, :, 0:2] - 0.5
            multi_pose[:, :, 0][multi_pose[:, :, 2] == 0] = 0
            multi_pose[:, :, 1][multi_pose[:, :, 2] == 0] = 0

            # pose tracking
            pose_tracker.update(multi_pose, frame_index)
            frame_index += 1
            print('Pose estimation ({}/{}).'.format(frame_index, video_length))

        data_numpy = pose_tracker.get_skeleton_sequence()
        return video, data_numpy
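
    # 'data_numpy' has shape (3, T, 18, num_person): the channels are (x, y, confidence),
    # T is the number of tracked frames and 18 are the OpenPose COCO joints. The two
    # generators below render it frame by frame: stgcn_visualize draws only the white
    # skeleton, while stgcn_rgb_visualize additionally overlays the per-joint feature
    # intensity and the voted action label.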

    def stgcn_visualize(self, pose, edge, video, height=1080, fps=None):
        _, T, V, M = pose.shape
        T = len(video)
        pos_track = [None] * M
        for t in range(T):
            frame = video[t]

            # image resize
            H, W, c = frame.shape
            frame = cv2.resize(frame, (height * W // H // 2, height // 2))
            H, W, c = frame.shape
            scale_factor = 2 * height / 1080

            # draw skeleton
            skeleton = frame * 0
            text = frame * 0
            for m in range(M):
                score = pose[2, t, :, m].max()
                if score < 0.3:
                    continue
                for i, j in edge:
                    xi = pose[0, t, i, m]
                    yi = pose[1, t, i, m]
                    xj = pose[0, t, j, m]
                    yj = pose[1, t, j, m]
                    if xi + yi == 0 or xj + yj == 0:
                        continue
                    else:
                        xi = int((xi + 0.5) * W)
                        yi = int((yi + 0.5) * H)
                        xj = int((xj + 0.5) * W)
                        yj = int((yj + 0.5) * H)
                    cv2.line(skeleton, (xi, yi), (xj, yj), (255, 255, 255),
                             int(np.ceil(2 * scale_factor)))  # np.ceil rounds the line width up

                x_nose = int((pose[0, t, 0, m] + 0.5) * W)
                y_nose = int((pose[1, t, 0, m] + 0.5) * H)
                x_neck = int((pose[0, t, 1, m] + 0.5) * W)
                y_neck = int((pose[1, t, 1, m] + 0.5) * H)
                half_head = int(((x_neck - x_nose) ** 2 + (y_neck - y_nose) ** 2) ** 0.5)
                pos = (x_nose + half_head, y_nose - half_head)
                if pos_track[m] is None:
                    pos_track[m] = pos
                else:
                    new_x = int(pos_track[m][0] + (pos[0] - pos_track[m][0]) * 0.2)
                    new_y = int(pos_track[m][1] + (pos[1] - pos_track[m][1]) * 0.2)
                    pos_track[m] = (new_x, new_y)

            yield skeleton

    def stgcn_rgb_visualize(self, pose, edge, feature, video, label=None, height=1080, fps=None):
        _, T, V, M = pose.shape
        T = len(video)
        pos_track = [None] * M
        for t in range(T):
            frame = video[t]

            # image resize
            H, W, c = frame.shape
            frame = cv2.resize(frame, (height * W // H // 2, height // 2))
            H, W, c = frame.shape
            scale_factor = 2 * height / 1080

            # draw skeleton
            skeleton = frame * 0
            text = frame * 0
            for m in range(M):
                score = pose[2, t, :, m].max()
                if score < 0.3:
                    continue
                for i, j in edge:
                    xi = pose[0, t, i, m]
                    yi = pose[1, t, i, m]
                    xj = pose[0, t, j, m]
                    yj = pose[1, t, j, m]
                    if xi + yi == 0 or xj + yj == 0:
                        continue
                    else:
                        xi = int((xi + 0.5) * W)
                        yi = int((yi + 0.5) * H)
                        xj = int((xj + 0.5) * W)
                        yj = int((yj + 0.5) * H)
                    cv2.line(skeleton, (xi, yi), (xj, yj), (255, 255, 255),
                             int(np.ceil(2 * scale_factor)))  # np.ceil rounds the line width up

                x_nose = int((pose[0, t, 0, m] + 0.5) * W)
                y_nose = int((pose[1, t, 0, m] + 0.5) * H)
                x_neck = int((pose[0, t, 1, m] + 0.5) * W)
                y_neck = int((pose[1, t, 1, m] + 0.5) * H)
                half_head = int(((x_neck - x_nose) ** 2 + (y_neck - y_nose) ** 2) ** 0.5)
                pos = (x_nose + half_head, y_nose - half_head)
                if pos_track[m] is None:
                    pos_track[m] = pos
                else:
                    new_x = int(pos_track[m][0] + (pos[0] - pos_track[m][0]) * 0.2)
                    new_y = int(pos_track[m][1] + (pos[1] - pos_track[m][1]) * 0.2)
                    pos_track[m] = (new_x, new_y)

            # generate an attention mask from the feature intensity
            mask = frame * 0
            feature = np.abs(feature)
            feature = feature / feature.mean()
            for m in range(M):
                score = pose[2, t, :, m].max()
                if score < 0.3:
                    continue
                f = feature[t // 4, :, m] ** 5
                if f.mean() != 0:
                    f = f / f.mean()
                for v in range(V):
                    x = pose[0, t, v, m]
                    y = pose[1, t, v, m]
                    if x + y == 0:
                        continue
                    else:
                        x = int((x + 0.5) * W)
                        y = int((y + 0.5) * H)
                    cv2.circle(mask, (x, y), 0, (255, 255, 255),
                               int(np.ceil(f[v] ** 0.5 * 8 * scale_factor)))
            blurred_mask = cv2.blur(mask, (12, 12))

            # blend mask, skeleton and text, then draw the voted label
            skeleton_result = blurred_mask.astype(float) * 0.75
            skeleton_result += skeleton.astype(float) * 0.25
            skeleton_result += text.astype(float)
            skeleton_result[skeleton_result > 255] = 255
            skeleton_result = skeleton_result.astype(np.uint8)
            if label is not None:
                label_0_name = 'voting result: ' + label
                self.put_text(skeleton_result, label_0_name, (0.1, 0.5))

            yield skeleton_result


class naive_pose_tracker():
    """A simple tracker for recording person poses and generating skeleton sequences.

    For real applications, a more robust tracker is recommended.
    """

    def __init__(self, data_frame=128, num_joint=18, max_frame_dis=np.inf):
        self.data_frame = data_frame
        self.num_joint = num_joint
        self.max_frame_dis = max_frame_dis
        self.latest_frame = 0
        self.trace_info = list()

    def update(self, multi_pose, current_frame):
        # multi_pose.shape: (num_person, num_joint, 3)
        if current_frame <= self.latest_frame:
            return
        if len(multi_pose.shape) != 3:
            return
        score_order = (-multi_pose[:, :, 2].sum(axis=1)).argsort(axis=0)
        for p in multi_pose[score_order]:

            # match existing traces
            matching_trace = None
            matching_dis = None
            for trace_index, (trace, latest_frame) in enumerate(self.trace_info):
                # trace.shape: (num_frame, num_joint, 3)
                if current_frame <= latest_frame:
                    continue
                mean_dis, is_close = self.get_dis(trace, p)
                if is_close:
                    if matching_trace is None:
                        matching_trace = trace_index
                        matching_dis = mean_dis
                    elif matching_dis > mean_dis:
                        matching_trace = trace_index
                        matching_dis = mean_dis

            # update trace information
            if matching_trace is not None:
                trace, latest_frame = self.trace_info[matching_trace]

                # padding zero if the trace is fractured
                pad_mode = 'interp' if latest_frame == self.latest_frame else 'zero'
                pad = current_frame - latest_frame - 1
                new_trace = self.cat_pose(trace, p, pad, pad_mode)
                self.trace_info[matching_trace] = (new_trace, current_frame)
            else:
                new_trace = np.array([p])
                self.trace_info.append((new_trace, current_frame))

        self.latest_frame = current_frame

    def get_skeleton_sequence(self):

        # remove old traces
        valid_trace_index = []
        for trace_index, (trace, latest_frame) in enumerate(self.trace_info):
            if self.latest_frame - latest_frame < self.data_frame:
                valid_trace_index.append(trace_index)
        self.trace_info = [self.trace_info[v] for v in valid_trace_index]

        num_trace = len(self.trace_info)
        if num_trace == 0:
            return None

        data = np.zeros((3, self.data_frame, self.num_joint, num_trace))
        for trace_index, (trace, latest_frame) in enumerate(self.trace_info):
            end = self.data_frame - (self.latest_frame - latest_frame)
            d = trace[-end:]
            beg = end - len(d)
            data[:, beg:end, :, trace_index] = d.transpose((2, 0, 1))

        return data

    # concatenate pose to a trace
    def cat_pose(self, trace, pose, pad, pad_mode):
        # trace.shape: (num_frame, num_joint, 3)
        num_joint = pose.shape[0]
        num_channel = pose.shape[1]
        if pad != 0:
            if pad_mode == 'zero':
                trace = np.concatenate(
                    (trace, np.zeros((pad, num_joint, 3))), 0)
            elif pad_mode == 'interp':
                last_pose = trace[-1]
                coeff = [(p + 1) / (pad + 1) for p in range(pad)]
                interp_pose = [(1 - c) * last_pose + c * pose for c in coeff]
                trace = np.concatenate((trace, interp_pose), 0)
        new_trace = np.concatenate((trace, [pose]), 0)
        return new_trace

    # calculate the distance between an existing trace and the input pose
    def get_dis(self, trace, pose):
        last_pose_xy = trace[-1, :, 0:2]
        curr_pose_xy = pose[:, 0:2]
        mean_dis = ((((last_pose_xy - curr_pose_xy) ** 2).sum(1)) ** 0.5).mean()
        wh = last_pose_xy.max(0) - last_pose_xy.min(0)
        scale = (wh[0] * wh[1]) ** 0.5 + 0.0001
        is_close = mean_dis < scale * self.max_frame_dis
        return mean_dis, is_close


if __name__ == '__main__':
    app = QApplication(sys.argv)
    video_gui = myMainWindow()
    video_gui.show()
    sys.exit(app.exec_())
```
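For reference, the ST-GCN classification step can also be exercised without the GUI. The sketch below is a minimal example under the same assumptions as the code above (the `net.st_gcn.Model` class, the Kinetics label file, and a skeleton array of shape (3, T, 18, M) such as the one returned by `pose_estimation`); the checkpoint and .npy paths are placeholders you must replace:

```python
import numpy as np
import torch
from net.st_gcn import Model

dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Model(3, 400, {'layout': 'openpose', 'strategy': 'spatial'}, True).to(dev)
model.load_state_dict(torch.load('path/to/st_gcn_kinetics_weights.pt', map_location=dev))  # placeholder path
model.eval()

with open('./resource/kinetics_skeleton/label_name.txt') as f:
    label_name = [line.rstrip() for line in f]

data_numpy = np.load('path/to/skeleton.npy')                       # (3, T, 18, M), e.g. from pose_estimation()
data = torch.from_numpy(data_numpy).float().unsqueeze(0).to(dev)   # (1, C, T, V, M)

with torch.no_grad():
    output, feature = model.extract_feature(data)                  # same call the GUI's predict() uses
voting_label = output[0].sum(dim=3).sum(dim=2).sum(dim=1).argmax(dim=0).item()
print('predicted action:', label_name[voting_label])
```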