python-OpenCV:调用视频识别手势进行电脑音量调节

独憩

已于 2022-01-20 13:23:48 修改

阅读量4.2k

点赞数 6

分类专栏： opencv-python 学习文章标签： python opencv 音视频计算机视觉

于 2021-12-24 17:06:19 首次发布

本文链接：https://blog.csdn.net/qq_54517101/article/details/122130967

版权

opencv-python 学习专栏收录该内容

21 篇文章 2 订阅

订阅专栏

写在前面

对于用 Python 调节电脑音量的问题，一般都是直接使用 pycaw 库。但 pycaw 设置音量时使用的数值与系统显示的音量（0-100）并不相同（想把电脑音量设为 40，并不能直接输入 40），而且两者之间没有简单、精确的换算关系，所以只能用逐个数值一一对应（查表）的方式进行映射。详情请参照：

python-使用pycaw设置电脑音量（包含转换）_独憩的博客-CSDN博客

手部识别可以通过mediapipe库进行：

python-OpenCV 视频中的手部跟踪：基于mediapipe库_独憩的博客-CSDN博客

对于调用视频识别手势进行电脑音量调节这个问题，网上有很多教程，一般是直接测量两个手指的距离进行映射。这样做的问题是：一旦我改变了自身与摄像头的相对位置，就很难进行控制。故我本次对其进行改进：先进行标定，标定的目的是获取当前位置下两个手指之间的最大距离，再以这个 len_max 为基础进行映射。

import：

import cv2
import math
import time
import mediapipe as mp
from os import listdir
from datetime import datetime
import time
import datetime

from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

调用电脑发声硬件：

# MediaPipe helpers: drawing utilities and the hand-tracking solution module.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
# Obtain an audio device through pycaw and activate its IAudioEndpointVolume
# interface; `volume` is the handle the main loop uses to set and read the
# master volume level.
# NOTE(review): presumably GetSpeakers() returns the default playback
# device - confirm against the pycaw documentation.
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(
    IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
# Range reported by the endpoint; not used further in this snippet.
volRange = volume.GetVolumeRange()

定义一些函数：

这个函数的作用是将识别到的手的点映射到图像坐标上，为后期的画点及计算距离服务。

def Normalize_landmarks(image, hand_landmarks):
    """Convert every landmark of one hand to image pixel coordinates.

    Each landmark's ``x``/``y`` is passed, together with the image width
    (``image.shape[1]``) and height (``image.shape[0]``), to
    ``mp_drawing._normalized_to_pixel_coordinates``.

    Returns:
        A list with one converted point per landmark, in landmark order.
        Entries are whatever the MediaPipe helper returns.
        NOTE(review): the helper appears to return ``None`` for points
        outside the image - confirm against MediaPipe.
    """
    return [
        mp_drawing._normalized_to_pixel_coordinates(
            point.x, point.y, image.shape[1], image.shape[0])
        for point in hand_landmarks.landmark
    ]

这个函数是画图函数，将食指与拇指的位置单独画出，并连线，至于为什么是landmarks[4]与landmarks[8]，请参照python-OpenCV 视频中的手部跟踪：基于mediapipe库_独憩的博客-CSDN博客

def Draw_hand_points(image, normalized_hand_landmarks):
    """Highlight landmarks 4 and 8 and measure the distance between them.

    Draws a filled circle on each of the two points, a line joining them
    and a marker at their midpoint. The midpoint marker is green while the
    distance is below 100 pixels and magenta otherwise.

    Args:
        image: Image to draw on (modified in place).
        normalized_hand_landmarks: Pixel-coordinate landmarks; indices 4 and
            8 are used (thumb and index finger per the surrounding article).

    Returns:
        Tuple ``(image, length)`` where ``length`` is the Euclidean distance
        between the two points in pixels.
    """
    thumb_tip = normalized_hand_landmarks[4]
    index_tip = normalized_hand_landmarks[8]
    magenta = (255, 0, 255)

    for tip in (thumb_tip, index_tip):
        cv2.circle(image, tip, 12, magenta, -1, cv2.LINE_AA)
    cv2.line(image, thumb_tip, index_tip, magenta, 3)

    centre = ((thumb_tip[0] + index_tip[0]) // 2,
              (thumb_tip[1] + index_tip[1]) // 2)
    dx = index_tip[0] - thumb_tip[0]
    dy = index_tip[1] - thumb_tip[1]
    length = math.sqrt(dx**2 + dy**2)  # thumb-to-index distance

    marker_colour = (0, 255, 0) if length < 100 else magenta
    cv2.circle(image, centre, 12, marker_colour, cv2.FILLED)
    return image, length

这两个函数的作用是将电脑的音量数字（0-100）与 pycaw 库中的数值相互对应，方法很笨但是很有效。由于反向对应做不到一一对应，只能找到误差最小的点进行对应：

# Lookup table: system volume percentage (0-100) -> level value handed to
# pycaw's SetMasterVolumeLevel. Built once instead of on every call.
# NOTE(review): values look like decibels measured on the author's machine -
# confirm they match the target audio device.
_PERCENT_TO_LEVEL = {
    0: -65.25, 1: -56.99, 2: -51.67, 3: -47.74, 4: -44.62, 5: -42.03, 6: -39.82, 7: -37.89, 8: -36.17,
    9: -34.63, 10: -33.24,
    11: -31.96, 12: -30.78, 13: -29.68, 14: -28.66, 15: -27.7, 16: -26.8, 17: -25.95, 18: -25.15, 19: -24.38,
    20: -23.65,
    21: -22.96, 22: -22.3, 23: -21.66, 24: -21.05, 25: -20.46, 26: -19.9, 27: -19.35, 28: -18.82, 29: -18.32,
    30: -17.82,
    31: -17.35, 32: -16.88, 33: -16.44, 34: -16.0, 35: -15.58, 36: -15.16, 37: -14.76, 38: -14.37, 39: -13.99,
    40: -13.62,
    41: -13.26, 42: -12.9, 43: -12.56, 44: -12.22, 45: -11.89, 46: -11.56, 47: -11.24, 48: -10.93, 49: -10.63,
    50: -10.33,
    51: -10.04, 52: -9.75, 53: -9.47, 54: -9.19, 55: -8.92, 56: -8.65, 57: -8.39, 58: -8.13, 59: -7.88,
    60: -7.63,
    61: -7.38, 62: -7.14, 63: -6.9, 64: -6.67, 65: -6.44, 66: -6.21, 67: -5.99, 68: -5.76, 69: -5.55, 70: -5.33,
    71: -5.12, 72: -4.91, 73: -4.71, 74: -4.5, 75: -4.3, 76: -4.11, 77: -3.91, 78: -3.72, 79: -3.53, 80: -3.34,
    81: -3.15, 82: -2.97, 83: -2.79, 84: -2.61, 85: -2.43, 86: -2.26, 87: -2.09, 88: -1.91, 89: -1.75,
    90: -1.58,
    91: -1.41, 92: -1.25, 93: -1.09, 94: -0.93, 95: -0.77, 96: -0.61, 97: -0.46, 98: -0.3, 99: -0.15, 100: 0.0,
}


def vol_tansfer(x):
    """Translate a volume percentage into the matching table level.

    Args:
        x: Integer volume percentage in the range 0-100.

    Returns:
        The float level stored in the table for ``x``.

    Raises:
        KeyError: If ``x`` is not one of the integers 0-100.
    """
    return _PERCENT_TO_LEVEL[x]

# Lookup table: system volume percentage (0-100) -> level value reported by
# pycaw's GetMasterVolumeLevel. Built once instead of on every call.
# NOTE(review): values look like decibels measured on the author's machine -
# confirm they match the target audio device.
_LEVEL_BY_PERCENT = {
    0: -65.25, 1: -56.99, 2: -51.67, 3: -47.74, 4: -44.62, 5: -42.03, 6: -39.82, 7: -37.89, 8: -36.17,
    9: -34.63, 10: -33.24,
    11: -31.96, 12: -30.78, 13: -29.68, 14: -28.66, 15: -27.7, 16: -26.8, 17: -25.95, 18: -25.15, 19: -24.38,
    20: -23.65,
    21: -22.96, 22: -22.3, 23: -21.66, 24: -21.05, 25: -20.46, 26: -19.9, 27: -19.35, 28: -18.82, 29: -18.32,
    30: -17.82,
    31: -17.35, 32: -16.88, 33: -16.44, 34: -16.0, 35: -15.58, 36: -15.16, 37: -14.76, 38: -14.37, 39: -13.99,
    40: -13.62,
    41: -13.26, 42: -12.9, 43: -12.56, 44: -12.22, 45: -11.89, 46: -11.56, 47: -11.24, 48: -10.93, 49: -10.63,
    50: -10.33,
    51: -10.04, 52: -9.75, 53: -9.47, 54: -9.19, 55: -8.92, 56: -8.65, 57: -8.39, 58: -8.13, 59: -7.88,
    60: -7.63,
    61: -7.38, 62: -7.14, 63: -6.9, 64: -6.67, 65: -6.44, 66: -6.21, 67: -5.99, 68: -5.76, 69: -5.55, 70: -5.33,
    71: -5.12, 72: -4.91, 73: -4.71, 74: -4.5, 75: -4.3, 76: -4.11, 77: -3.91, 78: -3.72, 79: -3.53, 80: -3.34,
    81: -3.15, 82: -2.97, 83: -2.79, 84: -2.61, 85: -2.43, 86: -2.26, 87: -2.09, 88: -1.91, 89: -1.75,
    90: -1.58,
    91: -1.41, 92: -1.25, 93: -1.09, 94: -0.93, 95: -0.77, 96: -0.61, 97: -0.46, 98: -0.3, 99: -0.15, 100: 0.0,
}


def vol_tansfer_reverse(x):
    """Return the volume percentage whose table level is closest to ``x``.

    All 101 table entries (0-100) are considered; the previous version
    stopped at 99, so a level of 0.0 could never be reported as 100.

    Args:
        x: Level value as read back from the audio endpoint.

    Returns:
        Integer percentage in 0-100 with the smallest absolute difference
        to ``x``. On a tie the lower percentage wins.
    """
    return min(
        _LEVEL_BY_PERCENT,
        key=lambda percent: abs(_LEVEL_BY_PERCENT[percent] - x),
    )

主循环：

主要的逻辑是在大循环下设置两个小循环，第一个循环是标定循环，持续5秒，可以得到5秒内的len_max。以此为依据映射到电脑音量（0-100）：

vol = int((length) / len_max * 100)

# Main script: calibrate the largest thumb-index distance for 5 seconds, then
# map the live distance onto the system volume (0-100 %).
#
# Keys: 'q' restarts calibration, ESC quits (in either phase).
ESC_KEY = 27
CALIBRATION_SECONDS = 5
THUMB_TIP = 4
INDEX_TIP = 8
TEXT_STYLE = (cv2.FONT_HERSHEY_COMPLEX, 1.2, (255, 0, 0), 2)

hands = mp_hands.Hands(
    min_detection_confidence=0.5, min_tracking_confidence=0.5)
cap = cv2.VideoCapture(0)
len_max = 0  # largest thumb-index distance seen while calibrating (pixels)
len_min = 0  # unused; kept so the module-level name stays available
num = 0  # 0: calibration pending/running, 1: calibrated
calibration_deadline = None  # time.monotonic() value ending the calibration

try:
    while cap.isOpened():
        if num == 0:
            if calibration_deadline is None:
                calibration_deadline = time.monotonic() + CALIBRATION_SECONDS
            elif time.monotonic() >= calibration_deadline:
                calibration_deadline = None
                num = 1
        calibrating = num == 0

        success, image = cap.read()
        if not success:
            print("camera frame is empty!")
            continue

        # Mirror the frame and hand an RGB, read-only copy to the detector,
        # then switch back to BGR for drawing and display.
        image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = hands.process(image)
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        for hand_landmarks in results.multi_hand_landmarks or ():
            mp_drawing.draw_landmarks(
                image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            normalized_landmarks = Normalize_landmarks(image, hand_landmarks)
            if (normalized_landmarks[THUMB_TIP] is None
                    or normalized_landmarks[INDEX_TIP] is None):
                # No pixel position for a fingertip: nothing to measure.
                continue
            image, length = Draw_hand_points(image, normalized_landmarks)

            if calibrating:
                len_max = max(len_max, length)
                cv2.putText(image, 'Start calibration', (10, 410), *TEXT_STYLE)
                cv2.putText(image, 'max length = %d' % len_max, (10, 110),
                            *TEXT_STYLE)
                continue

            if len_max <= 0:
                # Calibration never saw a hand, so there is no scale to map
                # against; press 'q' to calibrate again.
                continue

            cv2.rectangle(image, (50, 150), (85, 350), (255, 0, 0), 1)
            # Clamp to the calibrated maximum so vol stays within 0-100.
            vol = int(min(length, len_max) / len_max * 100)
            try:
                volume.SetMasterVolumeLevel(vol_tansfer(vol), None)
                vol_now = vol_tansfer_reverse(volume.GetMasterVolumeLevel())
            except Exception as exc:
                # Best effort: report audio failures instead of hiding them,
                # and keep the video loop alive.
                print("volume update failed:", exc)
                continue

            cv2.rectangle(image, (50, 150 + 200 - 2 * vol), (85, 350),
                          (255, 0, 0), cv2.FILLED)
            cv2.putText(image, str(vol) + '%', (40, 410), *TEXT_STYLE)
            cv2.putText(image, 'Current volume is %d' % vol_now, (10, 470),
                        *TEXT_STYLE)

        cv2.imshow('result', image)
        # Poll the keyboard once per frame; a second waitKey call would miss
        # any key already consumed by the first.
        key = cv2.waitKey(10) & 0xFF
        if key == ESC_KEY:
            break
        if key == ord('q'):
            len_max = 0
            num = 0
            calibration_deadline = None
finally:
    # Release the window, detector and camera even if the loop raised.
    cv2.destroyAllWindows()
    hands.close()
    cap.release()

最后，敲入"q"可以重新标定，敲入"esc"可以退出。

结果：

全部代码：

import cv2
import math
import time
import mediapipe as mp
from os import listdir
from datetime import datetime
import time
import datetime



from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume


# MediaPipe helpers: drawing utilities and the hand-tracking solution module.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
# Obtain an audio device through pycaw and activate its IAudioEndpointVolume
# interface; `volume` is the handle the main loop uses to set and read the
# master volume level.
# NOTE(review): presumably GetSpeakers() returns the default playback
# device - confirm against the pycaw documentation.
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(
    IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
volRange = volume.GetVolumeRange()
# The first two entries of the reported range are taken as the minimum and
# maximum level and printed for reference; they are not used afterwards.
minVol = volRange[0]
maxVol = volRange[1]
print(minVol, maxVol)





def Normalize_landmarks(image, hand_landmarks):
    """Convert every landmark of one hand to image pixel coordinates.

    Each landmark's ``x``/``y`` is passed, together with the image width
    (``image.shape[1]``) and height (``image.shape[0]``), to
    ``mp_drawing._normalized_to_pixel_coordinates``.

    Returns:
        A list with one converted point per landmark, in landmark order.
        Entries are whatever the MediaPipe helper returns.
        NOTE(review): the helper appears to return ``None`` for points
        outside the image - confirm against MediaPipe.
    """
    return [
        mp_drawing._normalized_to_pixel_coordinates(
            point.x, point.y, image.shape[1], image.shape[0])
        for point in hand_landmarks.landmark
    ]


def Draw_hand_points(image, normalized_hand_landmarks):
    """Highlight landmarks 4 and 8 and measure the distance between them.

    Draws a filled circle on each of the two points, a line joining them
    and a marker at their midpoint. The midpoint marker is green while the
    distance is below 100 pixels and magenta otherwise.

    Args:
        image: Image to draw on (modified in place).
        normalized_hand_landmarks: Pixel-coordinate landmarks; indices 4 and
            8 are used (thumb and index finger per the surrounding article).

    Returns:
        Tuple ``(image, length)`` where ``length`` is the Euclidean distance
        between the two points in pixels.
    """
    thumb_tip = normalized_hand_landmarks[4]
    index_tip = normalized_hand_landmarks[8]
    magenta = (255, 0, 255)

    for tip in (thumb_tip, index_tip):
        cv2.circle(image, tip, 12, magenta, -1, cv2.LINE_AA)
    cv2.line(image, thumb_tip, index_tip, magenta, 3)

    centre = ((thumb_tip[0] + index_tip[0]) // 2,
              (thumb_tip[1] + index_tip[1]) // 2)
    dx = index_tip[0] - thumb_tip[0]
    dy = index_tip[1] - thumb_tip[1]
    length = math.sqrt(dx**2 + dy**2)  # thumb-to-index distance

    marker_colour = (0, 255, 0) if length < 100 else magenta
    cv2.circle(image, centre, 12, marker_colour, cv2.FILLED)
    return image, length

# Lookup table: system volume percentage (0-100) -> level value handed to
# pycaw's SetMasterVolumeLevel. Built once instead of on every call.
# NOTE(review): values look like decibels measured on the author's machine -
# confirm they match the target audio device.
_PERCENT_TO_LEVEL = {
    0: -65.25, 1: -56.99, 2: -51.67, 3: -47.74, 4: -44.62, 5: -42.03, 6: -39.82, 7: -37.89, 8: -36.17,
    9: -34.63, 10: -33.24,
    11: -31.96, 12: -30.78, 13: -29.68, 14: -28.66, 15: -27.7, 16: -26.8, 17: -25.95, 18: -25.15, 19: -24.38,
    20: -23.65,
    21: -22.96, 22: -22.3, 23: -21.66, 24: -21.05, 25: -20.46, 26: -19.9, 27: -19.35, 28: -18.82, 29: -18.32,
    30: -17.82,
    31: -17.35, 32: -16.88, 33: -16.44, 34: -16.0, 35: -15.58, 36: -15.16, 37: -14.76, 38: -14.37, 39: -13.99,
    40: -13.62,
    41: -13.26, 42: -12.9, 43: -12.56, 44: -12.22, 45: -11.89, 46: -11.56, 47: -11.24, 48: -10.93, 49: -10.63,
    50: -10.33,
    51: -10.04, 52: -9.75, 53: -9.47, 54: -9.19, 55: -8.92, 56: -8.65, 57: -8.39, 58: -8.13, 59: -7.88,
    60: -7.63,
    61: -7.38, 62: -7.14, 63: -6.9, 64: -6.67, 65: -6.44, 66: -6.21, 67: -5.99, 68: -5.76, 69: -5.55, 70: -5.33,
    71: -5.12, 72: -4.91, 73: -4.71, 74: -4.5, 75: -4.3, 76: -4.11, 77: -3.91, 78: -3.72, 79: -3.53, 80: -3.34,
    81: -3.15, 82: -2.97, 83: -2.79, 84: -2.61, 85: -2.43, 86: -2.26, 87: -2.09, 88: -1.91, 89: -1.75,
    90: -1.58,
    91: -1.41, 92: -1.25, 93: -1.09, 94: -0.93, 95: -0.77, 96: -0.61, 97: -0.46, 98: -0.3, 99: -0.15, 100: 0.0,
}


def vol_tansfer(x):
    """Translate a volume percentage into the matching table level.

    Args:
        x: Integer volume percentage in the range 0-100.

    Returns:
        The float level stored in the table for ``x``.

    Raises:
        KeyError: If ``x`` is not one of the integers 0-100.
    """
    return _PERCENT_TO_LEVEL[x]

# Lookup table: system volume percentage (0-100) -> level value reported by
# pycaw's GetMasterVolumeLevel. Built once instead of on every call.
# NOTE(review): values look like decibels measured on the author's machine -
# confirm they match the target audio device.
_LEVEL_BY_PERCENT = {
    0: -65.25, 1: -56.99, 2: -51.67, 3: -47.74, 4: -44.62, 5: -42.03, 6: -39.82, 7: -37.89, 8: -36.17,
    9: -34.63, 10: -33.24,
    11: -31.96, 12: -30.78, 13: -29.68, 14: -28.66, 15: -27.7, 16: -26.8, 17: -25.95, 18: -25.15, 19: -24.38,
    20: -23.65,
    21: -22.96, 22: -22.3, 23: -21.66, 24: -21.05, 25: -20.46, 26: -19.9, 27: -19.35, 28: -18.82, 29: -18.32,
    30: -17.82,
    31: -17.35, 32: -16.88, 33: -16.44, 34: -16.0, 35: -15.58, 36: -15.16, 37: -14.76, 38: -14.37, 39: -13.99,
    40: -13.62,
    41: -13.26, 42: -12.9, 43: -12.56, 44: -12.22, 45: -11.89, 46: -11.56, 47: -11.24, 48: -10.93, 49: -10.63,
    50: -10.33,
    51: -10.04, 52: -9.75, 53: -9.47, 54: -9.19, 55: -8.92, 56: -8.65, 57: -8.39, 58: -8.13, 59: -7.88,
    60: -7.63,
    61: -7.38, 62: -7.14, 63: -6.9, 64: -6.67, 65: -6.44, 66: -6.21, 67: -5.99, 68: -5.76, 69: -5.55, 70: -5.33,
    71: -5.12, 72: -4.91, 73: -4.71, 74: -4.5, 75: -4.3, 76: -4.11, 77: -3.91, 78: -3.72, 79: -3.53, 80: -3.34,
    81: -3.15, 82: -2.97, 83: -2.79, 84: -2.61, 85: -2.43, 86: -2.26, 87: -2.09, 88: -1.91, 89: -1.75,
    90: -1.58,
    91: -1.41, 92: -1.25, 93: -1.09, 94: -0.93, 95: -0.77, 96: -0.61, 97: -0.46, 98: -0.3, 99: -0.15, 100: 0.0,
}


def vol_tansfer_reverse(x):
    """Return the volume percentage whose table level is closest to ``x``.

    All 101 table entries (0-100) are considered; the previous version
    stopped at 99, so a level of 0.0 could never be reported as 100.

    Args:
        x: Level value as read back from the audio endpoint.

    Returns:
        Integer percentage in 0-100 with the smallest absolute difference
        to ``x``. On a tie the lower percentage wins.
    """
    return min(
        _LEVEL_BY_PERCENT,
        key=lambda percent: abs(_LEVEL_BY_PERCENT[percent] - x),
    )






# Main script: calibrate the largest thumb-index distance for 5 seconds, then
# map the live distance onto the system volume (0-100 %).
#
# Keys: 'q' restarts calibration, ESC quits (in either phase).
ESC_KEY = 27
CALIBRATION_SECONDS = 5
THUMB_TIP = 4
INDEX_TIP = 8
TEXT_STYLE = (cv2.FONT_HERSHEY_COMPLEX, 1.2, (255, 0, 0), 2)

hands = mp_hands.Hands(
    min_detection_confidence=0.5, min_tracking_confidence=0.5)
cap = cv2.VideoCapture(0)
len_max = 0  # largest thumb-index distance seen while calibrating (pixels)
len_min = 0  # unused; kept so the module-level name stays available
num = 0  # 0: calibration pending/running, 1: calibrated
calibration_deadline = None  # time.monotonic() value ending the calibration

try:
    while cap.isOpened():
        if num == 0:
            if calibration_deadline is None:
                calibration_deadline = time.monotonic() + CALIBRATION_SECONDS
            elif time.monotonic() >= calibration_deadline:
                calibration_deadline = None
                num = 1
        calibrating = num == 0

        success, image = cap.read()
        if not success:
            print("camera frame is empty!")
            continue

        # Mirror the frame and hand an RGB, read-only copy to the detector,
        # then switch back to BGR for drawing and display.
        image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = hands.process(image)
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        for hand_landmarks in results.multi_hand_landmarks or ():
            mp_drawing.draw_landmarks(
                image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            normalized_landmarks = Normalize_landmarks(image, hand_landmarks)
            if (normalized_landmarks[THUMB_TIP] is None
                    or normalized_landmarks[INDEX_TIP] is None):
                # No pixel position for a fingertip: nothing to measure.
                continue
            image, length = Draw_hand_points(image, normalized_landmarks)

            if calibrating:
                len_max = max(len_max, length)
                cv2.putText(image, 'Start calibration', (10, 410), *TEXT_STYLE)
                cv2.putText(image, 'max length = %d' % len_max, (10, 110),
                            *TEXT_STYLE)
                continue

            if len_max <= 0:
                # Calibration never saw a hand, so there is no scale to map
                # against; press 'q' to calibrate again.
                continue

            cv2.rectangle(image, (50, 150), (85, 350), (255, 0, 0), 1)
            # Clamp to the calibrated maximum so vol stays within 0-100.
            vol = int(min(length, len_max) / len_max * 100)
            try:
                volume.SetMasterVolumeLevel(vol_tansfer(vol), None)
                vol_now = vol_tansfer_reverse(volume.GetMasterVolumeLevel())
            except Exception as exc:
                # Best effort: report audio failures instead of hiding them,
                # and keep the video loop alive.
                print("volume update failed:", exc)
                continue

            cv2.rectangle(image, (50, 150 + 200 - 2 * vol), (85, 350),
                          (255, 0, 0), cv2.FILLED)
            cv2.putText(image, str(vol) + '%', (40, 410), *TEXT_STYLE)
            cv2.putText(image, 'Current volume is %d' % vol_now, (10, 470),
                        *TEXT_STYLE)

        cv2.imshow('result', image)
        # Poll the keyboard once per frame; a second waitKey call would miss
        # any key already consumed by the first.
        key = cv2.waitKey(10) & 0xFF
        if key == ESC_KEY:
            break
        if key == ord('q'):
            len_max = 0
            num = 0
            calibration_deadline = None
finally:
    # Release the window, detector and camera even if the loop raised.
    cv2.destroyAllWindows()
    hands.close()
    cap.release()

独憩

关注

6
点赞
踩
30

收藏

觉得还不错? 一键收藏
4
评论
python-OpenCV:调用视频识别手势进行电脑音量调节

写在前面对于python调节音量的问题，一般都是直接使用pycaw库进行调节，但是当我们要设定电脑音量的时候，不能实现精确映射（我想让我的电脑音量是40，不能直接输入40），但是由于内部确实没有精确的关系，只能用一对一映射的方式。详情请参照：python-使用pycaw设置电脑音量（包含转换）_独憩的博客-CSDN博客手部识别可以通过mediapipe库进行：python-OpenCV 视频中的手部跟踪：基于mediapipe库_独憩的博客-CSDN博客对于调用视频识别手势进行电脑音量
复制链接

扫一扫