Project Overview:
Orchestra Conductor is a Python application that uses hand-gesture recognition to control music playback. MediaPipe captures hand gestures in real time, OpenCV renders the visualization, and users can adjust playback, pause, volume, and tempo with their hands.
Conductors and non-conductors alike are left speechless (a symphony-conducting simulator).
Features

- Real-time gesture recognition: MediaPipe detects both hands and recognizes the gestures that drive playback control.
- Music playback control: VLC handles play, pause, volume adjustment, and tempo control of the music file.
- Visual interface: OpenCV shows the live hand-tracking feed, alongside charts of how volume and tempo change over time.
- Gesture commands:
  - Left hand: adjusts volume and tempo.
  - Right hand: pauses and resumes playback.
Data Processing Pipeline

The data processing pipeline works as follows:

- Data Smoothing (Kalman filter): To suppress hand jitter and detection error, a Kalman filter smooths the detected gesture data, keeping the recognition stable and accurate. The filter is recursive, alternating two steps: Prediction and Update.

  Prediction step: predict the next state and its covariance matrix:

  $$\hat{x}_{k|k-1} = F\,\hat{x}_{k-1|k-1}, \qquad P_{k|k-1} = F\,P_{k-1|k-1}\,F^{\top} + Q$$

  Update step: correct the prediction with the new measurement $z_k$:

  $$K_k = P_{k|k-1}\,H^{\top}\,\big(H\,P_{k|k-1}\,H^{\top} + R\big)^{-1}$$
  $$\hat{x}_{k|k} = \hat{x}_{k|k-1} + K_k\,\big(z_k - H\,\hat{x}_{k|k-1}\big), \qquad P_{k|k} = (I - K_k H)\,P_{k|k-1}$$
- Feature Extraction (BPM, cue gestures): After smoothing, features are extracted from the gesture data.
  - Extensivity Calculation: the distance between the fingertips and the palm, used for volume control.
  - BPM Detection: beats per minute estimated from the hand's motion, used to speed the music up or slow it down (see the sketch after this list).
  - Cue Gesture Recognition: detects the gesture signal for pausing or resuming playback.
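The BPM detector itself is not reproduced in this document, so here is a minimal sketch of the idea, assuming a beat occurs at the bottom of each conducting stroke (the point where the smoothed wrist y-coordinate stops moving down and turns upward). The class name `BPMDetector` and the window size are illustrative, not the project's actual implementation.

```python
import time
from collections import deque
import numpy as np

class BPMDetector:
    """Hypothetical BPM estimator driven by the smoothed wrist y-coordinate."""

    def __init__(self, max_beats=8):
        self.beat_times = deque(maxlen=max_beats)  # timestamps of recent beats
        self.prev_y = None
        self.prev_dy = 0.0

    def update(self, wrist_y, t=None):
        t = time.time() if t is None else t
        if self.prev_y is not None:
            dy = wrist_y - self.prev_y
            # In image coordinates y grows downward, so a positive-to-negative
            # flip of dy marks the bottom of a stroke, i.e. a beat.
            if self.prev_dy > 0 and dy <= 0:
                self.beat_times.append(t)
            self.prev_dy = dy
        self.prev_y = wrist_y
        return self.bpm()

    def bpm(self):
        if len(self.beat_times) < 2:
            return None  # need at least two beats to measure an interval
        intervals = np.diff(self.beat_times)
        return 60.0 / float(np.mean(intervals))
```

Each frame, the Kalman-smoothed wrist y-coordinate is fed to `update()`; once two or more beats have been observed, `bpm()` returns 60 divided by the mean beat interval.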
Terminology

- BPM (beats per minute): the unit of musical tempo; it counts how many beats occur per minute. For example, 120 BPM means 120 beats every minute, a moderate tempo.
- Volume: the loudness of the music, controlled here with left-hand gestures.
- Cue: a signal in music that triggers an action. In this application, the cue gesture pauses and resumes playback (a recognition sketch follows this list).
- Tempo: the speed of the music, usually changed by adjusting the BPM: the higher the BPM, the faster the playback; the lower, the slower.
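As a concrete illustration of cue recognition, here is a hedged sketch. It assumes an open right palm means "resume" and a closed fist means "pause", counting extended fingers the same way the full example at the end does; the actual project may bind different poses.

```python
# Hypothetical cue-gesture rule: open palm -> play, fist -> pause.
def detect_cue(hand_landmarks):
    finger_tips = [8, 12, 16, 20]   # index, middle, ring, pinky tips
    finger_pips = [6, 10, 14, 18]   # corresponding PIP joints
    extended = sum(
        1 for tip, pip in zip(finger_tips, finger_pips)
        if hand_landmarks.landmark[tip].y < hand_landmarks.landmark[pip].y
    )
    if extended >= 4:
        return "play"
    if extended == 0:
        return "pause"
    return None  # ambiguous pose: no cue
```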
Key Code Reference

1. Gesture detection and MediaPipe initialization

This snippet shows how MediaPipe is used for hand detection.
```python
import mediapipe as mp
import cv2

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Open the webcam
cap = cv2.VideoCapture(0)

# Detect hands with MediaPipe
with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7, min_tracking_confidence=0.7) as hands:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Mirror the image and convert it to RGB
        frame = cv2.flip(frame, 1)
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Run MediaPipe on the frame
        results = hands.process(image_rgb)
        # Draw the detected landmarks
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        # Show the video
        cv2.imshow('Hand Tracking', frame)
        # Press 'q' to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()
```
2. Kalman filter for gesture smoothing

This snippet smooths the detected hand positions with a Kalman filter so that hand jitter does not introduce errors.
```python
from filterpy.kalman import KalmanFilter
import numpy as np

# Initialize a constant-velocity Kalman filter for one coordinate
def initialize_kalman_filter(initial_value, measurement_noise=1.0, process_noise=1.0):
    kf = KalmanFilter(dim_x=2, dim_z=1)
    kf.x = np.array([[initial_value], [0]])  # state: value and its rate of change
    kf.F = np.array([[1, 1], [0, 1]])        # state-transition matrix
    kf.H = np.array([[1, 0]])                # measurement matrix
    kf.P *= 1000.0                           # initial state covariance
    kf.R = measurement_noise                 # measurement-noise covariance
    kf.Q = np.array([[process_noise, 0], [0, process_noise]])  # process-noise covariance
    return kf

# Smooth one hand coordinate with a predict/update cycle
def smooth_hand_position(x, kf):
    kf.predict()
    kf.update([x])
    return kf.x[0, 0]  # return the smoothed value as a scalar
```
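A minimal usage sketch, assuming the MediaPipe loop from section 1 is running: one filter per coordinate, created once, then fed every frame. The helper name `smooth_wrist` is illustrative.

```python
# One filter per wrist coordinate (created before the capture loop starts).
kf_x = initialize_kalman_filter(initial_value=0.0)
kf_y = initialize_kalman_filter(initial_value=0.0)

def smooth_wrist(hand_landmarks, frame_width, frame_height):
    # Convert normalized landmark coordinates to pixels, then smooth.
    wrist = hand_landmarks.landmark[mp_hands.HandLandmark.WRIST]
    sx = smooth_hand_position(wrist.x * frame_width, kf_x)
    sy = smooth_hand_position(wrist.y * frame_height, kf_y)
    return sx, sy  # jitter-reduced pixel coordinates
```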
3. Gesture feature extraction and control

This snippet extracts the hand's extensivity, then maps extensivity and BPM onto volume and playback speed.
```python
# Extensivity: mean fingertip-to-palm distance, used for volume control
def calculate_extensivity(hand_landmarks, frame_width, frame_height):
    palm_coords = np.array([hand_landmarks.landmark[mp_hands.HandLandmark.WRIST].x * frame_width,
                            hand_landmarks.landmark[mp_hands.HandLandmark.WRIST].y * frame_height])
    finger_tips = [mp_hands.HandLandmark.THUMB_TIP,
                   mp_hands.HandLandmark.INDEX_FINGER_TIP,
                   mp_hands.HandLandmark.MIDDLE_FINGER_TIP,
                   mp_hands.HandLandmark.RING_FINGER_TIP,
                   mp_hands.HandLandmark.PINKY_TIP]
    distances = []
    for tip in finger_tips:
        tip_coords = np.array([hand_landmarks.landmark[tip].x * frame_width,
                               hand_landmarks.landmark[tip].y * frame_height])
        distance = np.linalg.norm(tip_coords - palm_coords)
        distances.append(distance)
    # Average spread of the fingers
    extensivity = np.mean(distances)
    return extensivity

# Map the gesture features onto volume and tempo (python-vlc MediaPlayer)
def control_music(player, extensivity, bpm):
    # Volume control: ~200 px is treated as a fully open hand
    volume = min(100, max(0, int(extensivity / 200 * 100)))
    player.audio_set_volume(volume)  # python-vlc volume range is 0-100
    # Tempo control, assuming the track's native BPM is 120
    tempo = bpm / 120
    player.set_rate(tempo)
```
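A minimal usage sketch with python-vlc (`pip install python-vlc`), assuming it runs inside the detection loop from section 1; `'music.mp3'` is a placeholder path, and the BPM value would come from the gesture pipeline.

```python
import vlc  # python-vlc bindings for libVLC

player = vlc.MediaPlayer('music.mp3')  # placeholder media path
player.play()

# Inside the per-frame loop, once a hand has been detected:
extensivity = calculate_extensivity(hand_landmarks, frame_width, frame_height)
control_music(player, extensivity, bpm=130)  # wide, quick gestures -> louder, faster
```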
4. A more comprehensive example you can run for fun. It is an older version (the current project is too long to reproduce here), but running it shows the data denoising and feature extraction at work, and how well the Gaussian-style mapping smooths the curves.
```python
import cv2
import mediapipe as mp
import numpy as np
import pygame
import threading
import time
import math
from collections import deque
from pydub import AudioSegment

# Load and convert the music file with Pydub (for tempo manipulation)
original_music = AudioSegment.from_file(
    'Replace_with your music file path')  # Replace with your music file path
current_music = original_music
volume = 0.5
previous_volume = 0.5  # Track previous volume
tempo = 1.0  # Ensure starting tempo is 1.0
stop_music = False
music_thread = None
start_time = time.time()

# Set the music BPM (you can detect this with a library like librosa, but for now, set it manually)
MUSIC_BPM = 120  # Beats per minute
SECONDS_PER_BEAT = 60.0 / MUSIC_BPM  # Time duration for each beat

# Hand speed threshold for normal tempo (arbitrary units)
NORMAL_HAND_SPEED = 0.5  # You can adjust this based on experimentation

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# To keep track of volume and tempo values for visualization
volume_history = deque(maxlen=100)  # Store last 100 frames of volume
tempo_history = deque(maxlen=100)   # Store last 100 frames of tempo

# Smooth sudden volume changes (despite the name, exp(-sigma) is a constant
# blend factor here, so this is first-order exponential smoothing)
def gaussian_smooth(current_volume, previous_volume, sigma=0.1):
    smoothed_volume = previous_volume + (current_volume - previous_volume) * math.exp(-sigma)
    return smoothed_volume

# Function to adjust tempo using Pydub
# (takes effect the next time play_music() re-exports the track)
def adjust_tempo(speed):
    global current_music
    speed = max(speed, 0.3)  # Ensure minimum speed factor is 0.3x
    playback_speed = 1 / speed
    current_music = original_music._spawn(original_music.raw_data, overrides={
        "frame_rate": int(original_music.frame_rate * playback_speed)
    }).set_frame_rate(original_music.frame_rate)

# Function to control the volume using pygame.mixer
def adjust_volume(target_volume):
    global volume, previous_volume
    # Smooth the volume to avoid sudden jumps
    smoothed_volume = gaussian_smooth(target_volume, previous_volume)
    previous_volume = smoothed_volume  # Update the previous volume
    volume = smoothed_volume
    pygame.mixer.music.set_volume(volume)

# Function to play music in a separate thread
def play_music():
    global current_music, stop_music
    pygame.mixer.init()
    temp_music = current_music.export(format="wav")
    pygame.mixer.music.load(temp_music)
    pygame.mixer.music.set_volume(volume)
    pygame.mixer.music.play()
    while not stop_music:
        time.sleep(0.1)
    pygame.mixer.music.stop()

# Function to count extended fingers
def count_extended_fingers(hand_landmarks):
    extended_fingers = 0
    finger_tips = [8, 12, 16, 20]  # Index, Middle, Ring, Pinky tips
    finger_pips = [6, 10, 14, 18]  # Corresponding PIP joints
    for tip, pip in zip(finger_tips, finger_pips):
        if hand_landmarks.landmark[tip].y < hand_landmarks.landmark[pip].y:
            extended_fingers += 1
    # Check thumb separately
    if hand_landmarks.landmark[4].x < hand_landmarks.landmark[3].x:
        extended_fingers += 1
    return extended_fingers

# Function to calculate hand speed from wrist movement
def calculate_hand_speed(wrist_positions, current_time, frame_width):
    if len(wrist_positions) >= 2:
        y_positions = [pos[0] for pos in wrist_positions]
        x_positions = [pos[2] for pos in wrist_positions]  # Horizontal position for weighting
        times = [pos[1] for pos in wrist_positions]
        dy = y_positions[-1] - y_positions[0]
        dx = x_positions[-1] - x_positions[0]
        dt = times[-1] - times[0]
        if dt > 0:
            # Speed is primarily vertical movement, weighted by x-position
            normalized_x = x_positions[-1] / frame_width  # Normalize x position to [0, 1]
            # Sinusoidal weighting to mimic a conductor's wave: the weight
            # peaks at the center of the frame and falls to zero at the edges
            weight = math.sin(normalized_x * math.pi) ** 2
            return weight * abs(dy / dt)
    return 0

# Function to adjust tempo based on hand speed
def adjust_tempo_by_hand_speed(hand_speed):
    global tempo
    if hand_speed > NORMAL_HAND_SPEED:
        # Hand is moving faster than normal, increase tempo
        adjustment_factor = min(0.5, (hand_speed - NORMAL_HAND_SPEED) / 50.0)  # Limit to +0.5x max
        tempo = 1.0 + adjustment_factor
    elif hand_speed < NORMAL_HAND_SPEED:
        # Hand is moving slower than normal, decrease tempo
        adjustment_factor = min(0.7, (NORMAL_HAND_SPEED - hand_speed) / 50.0)  # Limit to -0.7x max
        tempo = 1.0 - adjustment_factor
    else:
        # Keep normal tempo
        tempo = 1.0
    # Ensure the tempo stays within reasonable bounds
    tempo = max(0.3, min(1.5, tempo))  # Tempo stays between 0.3x and 1.5x
    adjust_tempo(tempo)

# Function to visualize volume and tempo curves alongside the real-time video
def visualize_volume_and_tempo(frame, volume_history, tempo_history):
    height, width, _ = frame.shape
    graph_frame = np.zeros((height, 500, 3), dtype=np.uint8)
    # Normalization bounds for the volume and tempo values
    max_volume = 1.5
    min_volume = 0.3
    max_tempo = 1.5
    min_tempo = 0.3
    volume_color = (0, 255, 0)  # Green (BGR)
    tempo_color = (0, 0, 255)   # Red (BGR)
    # Drawing volume curve
    for i in range(1, len(volume_history)):
        cv2.line(graph_frame,
                 (int((i - 1) * 4), int(350 - 300 * (volume_history[i - 1] - min_volume) / (max_volume - min_volume))),
                 (int(i * 4), int(350 - 300 * (volume_history[i] - min_volume) / (max_volume - min_volume))),
                 volume_color, 2)
    # Drawing tempo curve
    for i in range(1, len(tempo_history)):
        cv2.line(graph_frame,
                 (int((i - 1) * 4), int(350 - 300 * (tempo_history[i - 1] - min_tempo) / (max_tempo - min_tempo))),
                 (int(i * 4), int(350 - 300 * (tempo_history[i] - min_tempo) / (max_tempo - min_tempo))),
                 tempo_color, 2)
    # Keep the labels away from the curves to prevent overlap
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.rectangle(graph_frame, (0, 0), (500, 100), (0, 0, 0), 2)
    cv2.putText(graph_frame, "Volume", (10, 40), font, 1, volume_color, 2, cv2.LINE_AA)
    cv2.putText(graph_frame, "Tempo", (10, 80), font, 1, tempo_color, 2, cv2.LINE_AA)
    # Combine the original frame with the graph frame
    combined_frame = np.hstack((frame, graph_frame))
    return combined_frame

# Start capturing video from the webcam
cap = cv2.VideoCapture(0)

# Initialize variables for gesture control
prev_time = time.time()
right_wrist_positions = []
left_wrist_positions = []
hand_speed = 0  # Initial hand speed

with mp_hands.Hands(
        max_num_hands=2,
        min_detection_confidence=0.7,
        min_tracking_confidence=0.7) as hands:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue
        # Flip the image for a selfie-view display
        frame = cv2.flip(frame, 1)
        frame_height, frame_width, _ = frame.shape
        # Convert the BGR image to RGB and process it with MediaPipe Hands
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(image_rgb)
        # Draw hand annotations on the frame
        if results.multi_hand_landmarks:
            right_hand_landmarks = None
            left_hand_landmarks = None
            for hand_landmarks, hand_info in zip(results.multi_hand_landmarks, results.multi_handedness):
                hand_label = hand_info.classification[0].label
                # Draw landmarks
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                if hand_label == 'Right':
                    right_hand_landmarks = hand_landmarks
                elif hand_label == 'Left':
                    left_hand_landmarks = hand_landmarks
            current_time = time.time()
            # Tempo control using both hands
            if right_hand_landmarks:
                right_wrist = right_hand_landmarks.landmark[mp_hands.HandLandmark.WRIST]
                right_wrist_y = right_wrist.y * frame_height
                right_wrist_x = right_wrist.x * frame_width
                right_wrist_positions.append((right_wrist_y, current_time, right_wrist_x))
            if left_hand_landmarks:
                left_wrist = left_hand_landmarks.landmark[mp_hands.HandLandmark.WRIST]
                left_wrist_y = left_wrist.y * frame_height
                left_wrist_x = left_wrist.x * frame_width
                left_wrist_positions.append((left_wrist_y, current_time, left_wrist_x))
            # Keep only the last 0.5 seconds of data for both hands
            right_wrist_positions = [pos for pos in right_wrist_positions if current_time - pos[1] <= 0.5]
            left_wrist_positions = [pos for pos in left_wrist_positions if current_time - pos[1] <= 0.5]
            # Calculate combined hand speed (mean speed) with x-position weighting
            right_hand_speed = calculate_hand_speed(right_wrist_positions, current_time, frame_width)
            left_hand_speed = calculate_hand_speed(left_wrist_positions, current_time, frame_width)
            combined_hand_speed = (right_hand_speed + left_hand_speed) / 2
            # Adjust tempo based on hand speed
            adjust_tempo_by_hand_speed(combined_hand_speed)
            # Volume control with the left hand
            if left_hand_landmarks:
                finger_count = count_extended_fingers(left_hand_landmarks)
                # Scale between a 30% floor and a 100% ceiling
                target_volume = 0.3 + (finger_count / 5) * 0.7
                print(f"Target Volume set to {target_volume * 100:.0f}%")
                # Adjust the volume smoothly
                threading.Thread(target=adjust_volume, args=(target_volume,)).start()
        # Start music if not playing
        if music_thread is None or not music_thread.is_alive():
            stop_music = False
            music_thread = threading.Thread(target=play_music)
            music_thread.start()
        # Append current volume and tempo to history for visualization
        volume_history.append(volume)
        tempo_history.append(tempo)
        # Visualize the volume and tempo curves alongside the real-time video
        combined_frame = visualize_volume_and_tempo(frame, volume_history, tempo_history)
        # Display the webcam feed with hand annotations and the curves
        cv2.imshow('Orchestra Conductor', combined_frame)
        # Exit on 'Esc' key
        if cv2.waitKey(5) & 0xFF == 27:
            stop_music = True
            break

# Clean up
cap.release()
cv2.destroyAllWindows()
pygame.mixer.quit()
```
This version of the code has been open-sourced on GitHub.