TensorFlow + JavaScript. The most popular, cutting-edge AI framework now supports the most widely used programming language on the planet, so let's make some deep learning magic happen right in the web browser, GPU-accelerated via WebGL, using TensorFlow.js!
This is the final article in our six-part series:
- Getting Started with Deep Learning in the Browser Using TensorFlow.js
- Dogs and Pizza: Computer Vision in the Browser with TensorFlow.js
- Fluffy Animal Detector: Recognizing Custom Objects in the Browser via Transfer Learning in TensorFlow.js
- Face Touch Detection with TensorFlow.js Part 1: Using Real-Time Webcam Data with Deep Learning
- Face Touch Detection with TensorFlow.js Part 2: Using BodyPix
- Interpreting Hand Gestures and Sign Language in the Webcam with AI Using TensorFlow.js
In this article, we will take photos of different hand gestures through the webcam and use transfer learning on a pre-trained MobileNet model to build a computer vision AI that recognizes the various gestures in real time.
Starting Point
To recognize multiple gestures, we will take some nearly ready-to-use starter code and extend it to detect more categories of objects. The code will:
- Import TensorFlow.js and TensorFlow's tf-data.js
- Define the gesture category labels
- Add a video element for the webcam
- Run a model prediction every 200 ms, once the model has been trained for the first time
- Display the prediction result
- Load the pre-trained MobileNet model and prepare it for transfer learning with as many output categories as there are labels
- Train on and classify various custom objects in the images
- Skip disposing of the image and target sample tensors during training so they are kept around for multiple training runs (see the sketch after this list)
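As a quick note on that last point, here is a minimal sketch (not part of the starter code itself) of the memory-management pattern the code follows: per-frame prediction tensors are created inside tf.tidy() or disposed of as soon as their values are read, while captured training samples are deliberately left undisposed so they can be stacked into a training batch again and again.
// Sketch only: illustrates the two tensor lifetimes used by the code below.
async function memoryManagementSketch() {
    // Kept on purpose: the captured sample is stored without being disposed,
    // so trainModel() can reuse it across multiple training runs.
    trainingData.push( { image: await getWebcamImage(), category: 0 } );
    // Temporary: intermediates created inside tf.tidy() are freed automatically,
    // and the frame and prediction tensors are disposed once they have been read.
    const frame = await getWebcamImage();
    const result = tf.tidy( () => model.predict( frame.reshape( [ 1, 224, 224, 3 ] ) ) );
    const scores = await result.data();
    frame.dispose();
    result.dispose();
    return scores;
}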
Here is the starting point for this project:
<html>
<head>
    <meta charset="UTF-8">
    <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
    <style>
        img, video {
            object-fit: cover;
        }
    </style>
</head>
<body>
    <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
    <div id="buttons">
        <button onclick="captureSample(0)">None</button>
        <button onclick="captureSample(1)">✊ (Rock)</button>
        <button onclick="captureSample(2)">🖐 (Paper)</button>
        <button onclick="captureSample(3)">✌️ (Scissors)</button>
        <button onclick="trainModel()">Train</button>
    </div>
    <h1 id="status">Loading...</h1>
    <script>
    let trainingData = [];

    const labels = [
        "None",
        "✊ (Rock)",
        "🖐 (Paper)",
        "✌️ (Scissors)",
    ];

    function setText( text ) {
        document.getElementById( "status" ).innerText = text;
    }

    async function predictImage() {
        if( !hasTrained ) { return; } // Skip prediction until trained
        const img = await getWebcamImage();
        let result = tf.tidy( () => {
            const input = img.reshape( [ 1, 224, 224, 3 ] );
            return model.predict( input );
        });
        img.dispose();
        let prediction = await result.data();
        result.dispose();
        // Get the index of the highest value in the prediction
        let id = prediction.indexOf( Math.max( ...prediction ) );
        setText( labels[ id ] );
    }

    function createTransferModel( model ) {
        // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
        const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
        const baseModel = tf.model({
            inputs: model.inputs,
            outputs: bottleneck.output
        });
        // Freeze the convolutional base
        for( const layer of baseModel.layers ) {
            layer.trainable = false;
        }
        // Add a classification head
        const newHead = tf.sequential();
        newHead.add( tf.layers.flatten( {
            inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
        } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( {
            units: labels.length,
            kernelInitializer: 'varianceScaling',
            useBias: false,
            activation: 'softmax'
        } ) );
        // Build the new model
        const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
        const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
        return newModel;
    }

    async function trainModel() {
        hasTrained = false;
        setText( "Training..." );
        // Setup training data
        const imageSamples = [];
        const targetSamples = [];
        trainingData.forEach( sample => {
            imageSamples.push( sample.image );
            let cat = [];
            for( let c = 0; c < labels.length; c++ ) {
                cat.push( c === sample.category ? 1 : 0 );
            }
            targetSamples.push( tf.tensor1d( cat ) );
        });
        const xs = tf.stack( imageSamples );
        const ys = tf.stack( targetSamples );
        // Train the model on new image samples
        model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
        await model.fit( xs, ys, {
            epochs: 30,
            shuffle: true,
            callbacks: {
                onEpochEnd: ( epoch, logs ) => {
                    console.log( "Epoch #", epoch, logs );
                }
            }
        });
        hasTrained = true;
    }

    // Mobilenet v1 0.25 224x224 model
    const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

    let model = null;
    let hasTrained = false;

    async function setupWebcam() {
        return new Promise( ( resolve, reject ) => {
            const webcamElement = document.getElementById( "webcam" );
            const navigatorAny = navigator;
            navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
            if( navigator.getUserMedia ) {
                navigator.getUserMedia( { video: true },
                    stream => {
                        webcamElement.srcObject = stream;
                        webcamElement.addEventListener( "loadeddata", resolve, false );
                    },
                    error => reject());
            }
            else {
                reject();
            }
        });
    }

    async function getWebcamImage() {
        const img = ( await webcam.capture() ).toFloat();
        const normalized = img.div( 127 ).sub( 1 );
        return normalized;
    }

    async function captureSample( category ) {
        trainingData.push( {
            image: await getWebcamImage(),
            category: category
        });
        setText( "Captured: " + labels[ category ] );
    }

    let webcam = null;

    (async () => {
        // Load the model
        model = await tf.loadLayersModel( mobilenet );
        model = createTransferModel( model );
        await setupWebcam();
        webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
        // Setup prediction every 200 ms
        setInterval( predictImage, 200 );
    })();
    </script>
</body>
</html>
Detecting Gestures
The starting point is built to detect four different categories: none, rock (fist), paper (open hand), and scissors. You can try it out with your webcam by holding each gesture while clicking the corresponding category button to capture a handful of photos (5-6 each works well), then clicking the Train button to run transfer learning on the neural network. Afterward, you can improve the model by capturing more photos and clicking Train again.
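Because a handful of samples per category is usually enough, it can help to check how many you have captured before training. Here is a small, hypothetical helper (not part of the article's code) that logs the per-label sample counts from the trainingData array used above:
// Hypothetical helper: log how many samples have been captured for each label.
function logSampleCounts() {
    labels.forEach( ( label, i ) => {
        const count = trainingData.filter( sample => sample.category === i ).length;
        console.log( label + ": " + count + " sample(s)" );
    });
}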
More Gestures and Sign Language
As you might expect, adding more categories makes the task harder for the AI and takes more time to train. Still, the results are interesting, and the AI performs well even with only a few photos per category. Let's try adding some American Sign Language (ASL) gestures.
To add more, include additional buttons in the input list, update the numbers passed to captureSample(), and modify the labels array to match.
You can add any signs you like. I tried adding these four, which are part of the emoji set (see the snippet after this list):
- 👌 (Letter D)
- 👍 (Thumb Up)
- 🖖 (Vulcan)
- 🤟 (ILY - I Love You)
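For example, with those four added, the extra capture buttons and the expanded labels array look like this (the complete listing appears further below):
<button onclick="captureSample(4)">👌 (Letter D)</button>
<button onclick="captureSample(5)">👍 (Thumb Up)</button>
<button onclick="captureSample(6)">🖖 (Vulcan)</button>
<button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>

const labels = [
    "None",
    "✊ (Rock)",
    "🖐 (Paper)",
    "✌️ (Scissors)",
    "👌 (Letter D)",
    "👍 (Thumb Up)",
    "🖖 (Vulcan)",
    "🤟 (ILY - I Love You)"
];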
Technical Footnotes
- If the AI doesn't seem to recognize your gestures well, try capturing more photos and training the model a few more times.
- When training the model on various gestures, remember that it sees the full image; it doesn't necessarily know that the hand itself is what distinguishes the categories. Without plenty of samples from different hands, it may have a hard time recognizing the different gestures accurately.
- The model sometimes learns to distinguish between the left and right hand and sometimes doesn't, which can affect predictions after several rounds of training.
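One way to make the model less sensitive to which hand you use, which the article's code does not do, would be to also store a horizontally mirrored copy of each captured sample. A minimal sketch, assuming the same getWebcamImage(), trainingData, labels, and setText() from the code above:
// Hypothetical variant of captureSample() (not in the article's code):
// stores each captured sample plus a horizontally mirrored copy.
async function captureSampleWithFlip( category ) {
    const img = await getWebcamImage();  // shape [ 224, 224, 3 ]
    const flipped = img.reverse( 1 );    // mirror along the width axis
    trainingData.push( { image: img, category: category } );
    trainingData.push( { image: flipped, category: category } );
    setText( "Captured (with mirror): " + labels[ category ] );
}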
Finish Line
For your reference, here is the complete code for this project:
<html>
<head>
    <meta charset="UTF-8">
    <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
    <style>
        img, video {
            object-fit: cover;
        }
    </style>
</head>
<body>
    <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
    <div id="buttons">
        <button onclick="captureSample(0)">None</button>
        <button onclick="captureSample(1)">✊ (Rock)</button>
        <button onclick="captureSample(2)">🖐 (Paper)</button>
        <button onclick="captureSample(3)">✌️ (Scissors)</button>
        <button onclick="captureSample(4)">👌 (Letter D)</button>
        <button onclick="captureSample(5)">👍 (Thumb Up)</button>
        <button onclick="captureSample(6)">🖖 (Vulcan)</button>
        <button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>
        <button onclick="trainModel()">Train</button>
    </div>
    <h1 id="status">Loading...</h1>
    <script>
    let trainingData = [];

    const labels = [
        "None",
        "✊ (Rock)",
        "🖐 (Paper)",
        "✌️ (Scissors)",
        "👌 (Letter D)",
        "👍 (Thumb Up)",
        "🖖 (Vulcan)",
        "🤟 (ILY - I Love You)"
    ];

    function setText( text ) {
        document.getElementById( "status" ).innerText = text;
    }

    async function predictImage() {
        if( !hasTrained ) { return; } // Skip prediction until trained
        const img = await getWebcamImage();
        let result = tf.tidy( () => {
            const input = img.reshape( [ 1, 224, 224, 3 ] );
            return model.predict( input );
        });
        img.dispose();
        let prediction = await result.data();
        result.dispose();
        // Get the index of the highest value in the prediction
        let id = prediction.indexOf( Math.max( ...prediction ) );
        setText( labels[ id ] );
    }

    function createTransferModel( model ) {
        // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
        const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
        const baseModel = tf.model({
            inputs: model.inputs,
            outputs: bottleneck.output
        });
        // Freeze the convolutional base
        for( const layer of baseModel.layers ) {
            layer.trainable = false;
        }
        // Add a classification head
        const newHead = tf.sequential();
        newHead.add( tf.layers.flatten( {
            inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
        } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( {
            units: labels.length,
            kernelInitializer: 'varianceScaling',
            useBias: false,
            activation: 'softmax'
        } ) );
        // Build the new model
        const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
        const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
        return newModel;
    }

    async function trainModel() {
        hasTrained = false;
        setText( "Training..." );
        // Setup training data
        const imageSamples = [];
        const targetSamples = [];
        trainingData.forEach( sample => {
            imageSamples.push( sample.image );
            let cat = [];
            for( let c = 0; c < labels.length; c++ ) {
                cat.push( c === sample.category ? 1 : 0 );
            }
            targetSamples.push( tf.tensor1d( cat ) );
        });
        const xs = tf.stack( imageSamples );
        const ys = tf.stack( targetSamples );
        // Train the model on new image samples
        model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
        await model.fit( xs, ys, {
            epochs: 30,
            shuffle: true,
            callbacks: {
                onEpochEnd: ( epoch, logs ) => {
                    console.log( "Epoch #", epoch, logs );
                }
            }
        });
        hasTrained = true;
    }

    // Mobilenet v1 0.25 224x224 model
    const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

    let model = null;
    let hasTrained = false;

    async function setupWebcam() {
        return new Promise( ( resolve, reject ) => {
            const webcamElement = document.getElementById( "webcam" );
            const navigatorAny = navigator;
            navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
            if( navigator.getUserMedia ) {
                navigator.getUserMedia( { video: true },
                    stream => {
                        webcamElement.srcObject = stream;
                        webcamElement.addEventListener( "loadeddata", resolve, false );
                    },
                    error => reject());
            }
            else {
                reject();
            }
        });
    }

    async function getWebcamImage() {
        const img = ( await webcam.capture() ).toFloat();
        const normalized = img.div( 127 ).sub( 1 );
        return normalized;
    }

    async function captureSample( category ) {
        trainingData.push( {
            image: await getWebcamImage(),
            category: category
        });
        setText( "Captured: " + labels[ category ] );
    }

    let webcam = null;

    (async () => {
        // Load the model
        model = await tf.loadLayersModel( mobilenet );
        model = createTransferModel( model );
        await setupWebcam();
        webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
        // Setup prediction every 200 ms
        setInterval( predictImage, 200 );
    })();
    </script>
</body>
</html>
What's Next?
This project showed you how to start training your own computer vision AI to recognize a potentially unlimited range of gestures, objects, animal species, and even types of food. The rest is up to you; the future of deep learning and AI might just begin right in your browser.
I hope you enjoyed following along with these examples. As you try out more ideas of your own, don't forget to have fun!
https://www.codeproject.com/Articles/5272777/Interpreting-Hand-Gestures-and-Sign-Language-in-th