TensorFlow + JavaScript. The most popular, cutting-edge AI framework now supports the most widely used programming language on the planet, so let's make some deep learning magic happen right in the web browser, GPU-accelerated via WebGL, using TensorFlow.js!
This is the final article in our six-part series:
- Getting Started with Deep Learning in the Browser Using TensorFlow.js
- Dogs and Pizza: Computer Vision in the Browser with TensorFlow.js
- Fluffy Animal Detector: Recognizing Custom Objects in the Browser via Transfer Learning in TensorFlow.js
- Face Touch Detection with TensorFlow.js Part 1: Using Real-Time Webcam Data with Deep Learning
- Face Touch Detection with TensorFlow.js Part 2: Using BodyPix
- Interpreting Hand Gestures and Sign Language in the Webcam with AI Using TensorFlow.js
In this article, we will take photos of different hand gestures through the webcam and use transfer learning on a pre-trained MobileNet model to build a computer vision AI that recognizes the various gestures in real time.
Starting Point
To recognize multiple gestures, we will take some nearly ready-to-use starter code and extend it to detect more categories of objects. The code will:
- Import TensorFlow.js and TensorFlow's tf-data.js
- Define the gesture category labels
- Add a video element for the webcam
- Run a model prediction every 200 ms, once the model has been trained for the first time
- Display the prediction result
- Load the pre-trained MobileNet model and prepare it for transfer learning with as many output categories as there are labels
- Train on and classify various custom objects in the images
- Skip disposing of the image and target sample tensors during training so they are kept around for multiple training runs (see the sketch after this list)
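As a quick note on that last point, here is a minimal sketch (not part of the starter code itself) of the memory-management pattern the code follows: per-frame prediction tensors are created inside tf.tidy() or disposed of as soon as their values are read, while captured training samples are deliberately left undisposed so they can be stacked into a training batch again and again.
// Sketch only: illustrates the two tensor lifetimes used by the code below.
async function memoryManagementSketch() {
    // Kept on purpose: the captured sample is stored without being disposed,
    // so trainModel() can reuse it across multiple training runs.
    trainingData.push( { image: await getWebcamImage(), category: 0 } );
    // Temporary: intermediates created inside tf.tidy() are freed automatically,
    // and the frame and prediction tensors are disposed once they have been read.
    const frame = await getWebcamImage();
    const result = tf.tidy( () => model.predict( frame.reshape( [ 1, 224, 224, 3 ] ) ) );
    const scores = await result.data();
    frame.dispose();
    result.dispose();
    return scores;
}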
Here is the starting point for this project:
<html>
<head>
    <meta charset="UTF-8">
    <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
    <style>
        img, video {
            object-fit: cover;
        }
    </style>
</head>
<body>
    <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
    <div id="buttons">
        <button onclick="captureSample(0)">None</button>
        <button onclick="captureSample(1)">✊ (Rock)</button>
        <button onclick="captureSample(2)">🖐 (Paper)</button>
        <button onclick="captureSample(3)">✌️ (Scissors)</button>
        <button onclick="trainModel()">Train</button>
    </div>
    <h1 id="status">Loading...</h1>
    <script>
    let trainingData = [];

    const labels = [
        "None",
        "✊ (Rock)",
        "🖐 (Paper)",
        "✌️ (Scissors)",
    ];

    function setText( text ) {
        document.getElementById( "status" ).innerText = text;
    }

    async function predictImage() {
        if( !hasTrained ) { return; } // Skip prediction until trained
        const img = await getWebcamImage();
        let result = tf.tidy( () => {
            const input = img.reshape( [ 1, 224, 224, 3 ] );
            return model.predict( input );
        });
        img.dispose();
        let prediction = await result.data();
        result.dispose();
        // Get the index of the highest value in the prediction
        let id = prediction.indexOf( Math.max( ...prediction ) );
        setText( labels[ id ] );
    }

    function createTransferModel( model ) {
        // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
        const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
        const baseModel = tf.model({
            inputs: model.inputs,
            outputs: bottleneck.output
        });
        // Freeze the convolutional base
        for( const layer of baseModel.layers ) {
            layer.trainable = false;
        }
        // Add a classification head
        const newHead = tf.sequential();
        newHead.add( tf.layers.flatten( {
            inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
        } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( {
            units: labels.length,
            kernelInitializer: 'varianceScaling',
            useBias: false,
            activation: 'softmax'
        } ) );
        // Build the new model
        const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
        const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
        return newModel;
    }

    async function trainModel() {
        hasTrained = false;
        setText( "Training..." );
        // Setup training data
        const imageSamples = [];
        const targetSamples = [];
        trainingData.forEach( sample => {
            imageSamples.push( sample.image );
            let cat = [];
            for( let c = 0; c < labels.length; c++ ) {
                cat.push( c === sample.category ? 1 : 0 );
            }
            targetSamples.push( tf.tensor1d( cat ) );
        });
        const xs = tf.stack( imageSamples );
        const ys = tf.stack( targetSamples );
        // Train the model on new image samples
        model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
        await model.fit( xs, ys, {
            epochs: 30,
            shuffle: true,
            callbacks: {
                onEpochEnd: ( epoch, logs ) => {
                    console.log( "Epoch #", epoch, logs );
                }
            }
        });
        hasTrained = true;
    }

    // Mobilenet v1 0.25 224x224 model
    const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

    let model = null;
    let hasTrained = false;

    async function setupWebcam() {
        return new Promise( ( resolve, reject ) => {
            const webcamElement = document.getElementById( "webcam" );
            const navigatorAny = navigator;
            navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
            if( navigator.getUserMedia ) {
                navigator.getUserMedia( { video: true },
                    stream => {
                        webcamElement.srcObject = stream;
                        webcamElement.addEventListener( "loadeddata", resolve, false );
                    },
                    error => reject());
            }
            else {
                reject();
            }
        });
    }

    async function getWebcamImage() {
        const img = ( await webcam.capture() ).toFloat();
        const normalized = img.div( 127 ).sub( 1 );
        return normalized;
    }

    async function captureSample( category ) {
        trainingData.push( {
            image: await getWebcamImage(),
            category: category
        });
        setText( "Captured: " + labels[ category ] );
    }

    let webcam = null;

    (async () => {
        // Load the model
        model = await tf.loadLayersModel( mobilenet );
        model = createTransferModel( model );
        await setupWebcam();
        webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
        // Setup prediction every 200 ms
        setInterval( predictImage, 200 );
    })();
    </script>
</body>
</html>
Detecting Gestures
The starting point is built to detect four different categories: none, rock (fist), paper (open hand), and scissors. You can try it out with your webcam by holding each gesture while clicking the corresponding category button to capture a handful of photos (5-6 each works well), then clicking the Train button to run transfer learning on the neural network. Afterward, you can improve the model by capturing more photos and clicking Train again.
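Because a handful of samples per category is usually enough, it can help to check how many you have captured before training. Here is a small, hypothetical helper (not part of the article's code) that logs the per-label sample counts from the trainingData array used above:
// Hypothetical helper: log how many samples have been captured for each label.
function logSampleCounts() {
    labels.forEach( ( label, i ) => {
        const count = trainingData.filter( sample => sample.category === i ).length;
        console.log( label + ": " + count + " sample(s)" );
    });
}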
More Gestures and Sign Language
As you might expect, adding more categories makes the task harder for the AI and takes more time to train. Still, the results are interesting, and the AI performs well even with only a few photos per category. Let's try adding some American Sign Language (ASL) gestures.
To add more, include additional buttons in the input list, update the numbers passed to captureSample(), and modify the labels array to match.
You can add any signs you like. I tried adding these four, which are part of the emoji set (see the snippet after this list):
- 👌 (Letter D)
- 👍 (Thumb Up)
- 🖖 (Vulcan)
- 🤟 (ILY - I Love You)
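For example, with those four added, the extra capture buttons and the expanded labels array look like this (the complete listing appears further below):
<button onclick="captureSample(4)">👌 (Letter D)</button>
<button onclick="captureSample(5)">👍 (Thumb Up)</button>
<button onclick="captureSample(6)">🖖 (Vulcan)</button>
<button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>

const labels = [
    "None",
    "✊ (Rock)",
    "🖐 (Paper)",
    "✌️ (Scissors)",
    "👌 (Letter D)",
    "👍 (Thumb Up)",
    "🖖 (Vulcan)",
    "🤟 (ILY - I Love You)"
];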
Technical Footnotes
- If the AI doesn't seem to recognize your gestures well, try capturing more photos and training the model a few more times.
- When training the model on various gestures, remember that it sees the full image; it doesn't necessarily know that the hand itself is what distinguishes the categories. Without plenty of samples from different hands, it may have a hard time recognizing the different gestures accurately.
- The model sometimes learns to distinguish between the left and right hand and sometimes doesn't, which can affect predictions after several rounds of training.
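One way to make the model less sensitive to which hand you use, which the article's code does not do, would be to also store a horizontally mirrored copy of each captured sample. A minimal sketch, assuming the same getWebcamImage(), trainingData, labels, and setText() from the code above:
// Hypothetical variant of captureSample() (not in the article's code):
// stores each captured sample plus a horizontally mirrored copy.
async function captureSampleWithFlip( category ) {
    const img = await getWebcamImage();  // shape [ 224, 224, 3 ]
    const flipped = img.reverse( 1 );    // mirror along the width axis
    trainingData.push( { image: img, category: category } );
    trainingData.push( { image: flipped, category: category } );
    setText( "Captured (with mirror): " + labels[ category ] );
}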
Finish Line
For your reference, here is the complete code for this project:
<html>
<head>
    <meta charset="UTF-8">
    <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
    <style>
        img, video {
            object-fit: cover;
        }
    </style>
</head>
<body>
    <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
    <div id="buttons">
        <button onclick="captureSample(0)">None</button>
        <button onclick="captureSample(1)">✊ (Rock)</button>
        <button onclick="captureSample(2)">🖐 (Paper)</button>
        <button onclick="captureSample(3)">✌️ (Scissors)</button>
        <button onclick="captureSample(4)">👌 (Letter D)</button>
        <button onclick="captureSample(5)">👍 (Thumb Up)</button>
        <button onclick="captureSample(6)">🖖 (Vulcan)</button>
        <button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>
        <button onclick="trainModel()">Train</button>
    </div>
    <h1 id="status">Loading...</h1>
    <script>
    let trainingData = [];

    const labels = [
        "None",
        "✊ (Rock)",
        "🖐 (Paper)",
        "✌️ (Scissors)",
        "👌 (Letter D)",
        "👍 (Thumb Up)",
        "🖖 (Vulcan)",
        "🤟 (ILY - I Love You)"
    ];

    function setText( text ) {
        document.getElementById( "status" ).innerText = text;
    }

    async function predictImage() {
        if( !hasTrained ) { return; } // Skip prediction until trained
        const img = await getWebcamImage();
        let result = tf.tidy( () => {
            const input = img.reshape( [ 1, 224, 224, 3 ] );
            return model.predict( input );
        });
        img.dispose();
        let prediction = await result.data();
        result.dispose();
        // Get the index of the highest value in the prediction
        let id = prediction.indexOf( Math.max( ...prediction ) );
        setText( labels[ id ] );
    }

    function createTransferModel( model ) {
        // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
        const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
        const baseModel = tf.model({
            inputs: model.inputs,
            outputs: bottleneck.output
        });
        // Freeze the convolutional base
        for( const layer of baseModel.layers ) {
            layer.trainable = false;
        }
        // Add a classification head
        const newHead = tf.sequential();
        newHead.add( tf.layers.flatten( {
            inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
        } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
        newHead.add( tf.layers.dense( {
            units: labels.length,
            kernelInitializer: 'varianceScaling',
            useBias: false,
            activation: 'softmax'
        } ) );
        // Build the new model
        const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
        const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
        return newModel;
    }

    async function trainModel() {
        hasTrained = false;
        setText( "Training..." );
        // Setup training data
        const imageSamples = [];
        const targetSamples = [];
        trainingData.forEach( sample => {
            imageSamples.push( sample.image );
            let cat = [];
            for( let c = 0; c < labels.length; c++ ) {
                cat.push( c === sample.category ? 1 : 0 );
            }
            targetSamples.push( tf.tensor1d( cat ) );
        });
        const xs = tf.stack( imageSamples );
        const ys = tf.stack( targetSamples );
        // Train the model on new image samples
        model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
        await model.fit( xs, ys, {
            epochs: 30,
            shuffle: true,
            callbacks: {
                onEpochEnd: ( epoch, logs ) => {
                    console.log( "Epoch #", epoch, logs );
                }
            }
        });
        hasTrained = true;
    }

    // Mobilenet v1 0.25 224x224 model
    const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

    let model = null;
    let hasTrained = false;

    async function setupWebcam() {
        return new Promise( ( resolve, reject ) => {
            const webcamElement = document.getElementById( "webcam" );
            const navigatorAny = navigator;
            navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
            if( navigator.getUserMedia ) {
                navigator.getUserMedia( { video: true },
                    stream => {
                        webcamElement.srcObject = stream;
                        webcamElement.addEventListener( "loadeddata", resolve, false );
                    },
                    error => reject());
            }
            else {
                reject();
            }
        });
    }

    async function getWebcamImage() {
        const img = ( await webcam.capture() ).toFloat();
        const normalized = img.div( 127 ).sub( 1 );
        return normalized;
    }

    async function captureSample( category ) {
        trainingData.push( {
            image: await getWebcamImage(),
            category: category
        });
        setText( "Captured: " + labels[ category ] );
    }

    let webcam = null;

    (async () => {
        // Load the model
        model = await tf.loadLayersModel( mobilenet );
        model = createTransferModel( model );
        await setupWebcam();
        webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
        // Setup prediction every 200 ms
        setInterval( predictImage, 200 );
    })();
    </script>
</body>
</html>
What's Next?
This project showed you how to start training your own computer vision AI to recognize a potentially unlimited range of gestures, objects, animal species, and even types of food. The rest is up to you; the future of deep learning and AI might just begin right in your browser.
I hope you enjoyed following along with these examples. As you try out more ideas of your own, don't forget to have fun!
https://www.codeproject.com/Articles/5272777/Interpreting-Hand-Gestures-and-Sign-Language-in-th