Skip to content

Commit

Permalink
feat: HandTrackingService now allows selecting any suitable video input device
Browse files Browse the repository at this point in the history
Loading branch information
AlmasB committed Jan 24, 2025
1 parent 0adee8e commit 878533a
Show file tree
Hide file tree
Showing 4 changed files with 201 additions and 86 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,111 +6,93 @@

package com.almasb.fxgl.intelligence.gesturerecog

import com.almasb.fxgl.core.EngineService
import com.almasb.fxgl.core.concurrent.Async
import com.almasb.fxgl.core.util.EmptyRunnable
import com.almasb.fxgl.intelligence.WebAPI
import com.almasb.fxgl.intelligence.WebAPIService
import com.almasb.fxgl.logging.Logger
import com.almasb.fxgl.net.ws.LocalWebSocketServer
import com.almasb.fxgl.net.ws.VideoInputDeviceInfo
import javafx.geometry.Point3D
import org.openqa.selenium.WebDriver
import org.openqa.selenium.chrome.ChromeDriver
import org.openqa.selenium.chrome.ChromeOptions
import java.util.function.Consumer

/**
* Service that provides access to hand tracking.
*
* @author Almas Baim (https://github.com/AlmasB)
*/
class HandTrackingService : EngineService() {
class HandTrackingService : WebAPIService(
LocalWebSocketServer("HandTrackingServer", WebAPI.GESTURE_RECOGNITION_PORT),
WebAPI.GESTURE_RECOGNITION_API
) {

private val log = Logger.get(HandTrackingService::class.java)
private val server = LocalWebSocketServer("HandTrackingServer", WebAPI.GESTURE_RECOGNITION_PORT)

private var webDriver: WebDriver? = null
private val videoInputDevices = arrayListOf<VideoInputDeviceInfo>()

private val handDataHandlers = arrayListOf<Consumer<Hand>>()

var onMediaDeviceDetectionCompleted: Runnable = EmptyRunnable

val videoDevices: List<VideoInputDeviceInfo>
get() = videoInputDevices.toList()

val landmarksView: HandLandmarksView by lazy {
HandLandmarksView().also { addInputHandler(it) }
}

override fun onInit() {
server.addMessageHandler { message ->
try {
val rawData = message.split(",").filter { it.isNotEmpty() }
// kind: audioinput, videoinput, audiooutput
private fun onMediaDeviceDetected(kind: String, label: String, deviceID: String) {
log.debug("New media device detected: $kind,$label,$deviceID")

val id = rawData[0].toInt()
val points = ArrayList<Point3D>()
if (kind == "videoinput") {
videoInputDevices += VideoInputDeviceInfo(label, deviceID)
}
}

var i = 1
while (i < rawData.size) {
val x = rawData[i + 0].toDouble()
val y = rawData[i + 1].toDouble()
val z = rawData[i + 2].toDouble()
// Invoked via RPC from the JS side once every media device has been reported.
// The user-supplied callback is re-dispatched onto the JavaFX thread.
private fun onMediaDeviceDetectionComplete() {
Async.startAsyncFX {
onMediaDeviceDetectionCompleted.run()
}
}

points.add(Point3D(x, y, z))
private fun initService() {
log.debug("initService()")

i += 3
}
setReady()
}

Async.startAsyncFX {
handDataHandlers.forEach { it.accept(Hand(id, points)) }
}
private fun onHandInput(message: String) {
try {
val rawData = message.split(",").filter { it.isNotEmpty() }

} catch (e: Exception) {
log.warning("Failed to parse message.", e)
}
}
val id = rawData[0].toInt()
val points = ArrayList<Point3D>()

server.start()
}
var i = 1
while (i < rawData.size) {
val x = rawData[i + 0].toDouble()
val y = rawData[i + 1].toDouble()
val z = rawData[i + 2].toDouble()

/**
* Starts this service in a background thread.
* Can be called after stop() to restart the service.
* If the service has already started, then calls stop() and restarts it.
*/
fun start() {
Async.startAsync {
try {
if (webDriver != null) {
stop()
}

val options = ChromeOptions()
options.addArguments("--headless=new")

// for modules
options.addArguments("--allow-file-access-from-files")
// for webcam, audio input
options.addArguments("--use-fake-ui-for-media-stream")

webDriver = ChromeDriver(options)
webDriver!!.get(WebAPI.GESTURE_RECOGNITION_API.toExternalForm())

// we are ready to use the web api service
} catch (e: Exception) {
log.warning("Failed to start Chrome web driver. Ensure Chrome is installed in default location")
log.warning("Error data", e)
points.add(Point3D(x, y, z))

i += 3
}
}
}

/**
* Stops this service.
* No-op if it has not started via start() before.
*/
fun stop() {
try {
if (webDriver != null) {
webDriver!!.quit()
webDriver = null
Async.startAsyncFX {
handDataHandlers.forEach { it.accept(Hand(id, points)) }
}

} catch (e: Exception) {
log.warning("Failed to quit web driver", e)
log.warning("Failed to parse message.", e)
}
}

/**
* Add input handler for hand tracking data.
* Input handlers are called on the JavaFX thread.
*/
fun addInputHandler(handler: Consumer<Hand>) {
handDataHandlers += handler
}
Expand All @@ -119,8 +101,9 @@ class HandTrackingService : EngineService() {
handDataHandlers -= handler
}

override fun onExit() {
stop()
server.stop()
/**
 * Asks the web side (via rpcRun "setVideoInputDevice") to switch the camera
 * stream to [videoDevice]. The switch is asynchronous; tracking data resumes
 * once the JS side has acquired the new stream.
 */
fun setVideoDevice(videoDevice: VideoInputDeviceInfo) {
log.debug("setting video device = $videoDevice")

rpcRun("setVideoInputDevice", videoDevice.id)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* FXGL - JavaFX Game Library. The MIT License (MIT).
* Copyright (c) AlmasB ([email protected]).
* See LICENSE for details.
*/

package com.almasb.fxgl.net.ws

/**
 * Models a media device entry as reported by the MediaDevices Web API:
 * https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia
 *
 * @author Almas Baim (https://github.com/AlmasB)
 */
open class MediaDeviceInfo
internal constructor(
    // one of: audioinput, videoinput, audiooutput
    val kind: String,
    // human-readable name; may be empty if the page lacks permission
    val label: String,
    // browser-assigned device identifier
    val id: String
)

class AudioInputDeviceInfo(label: String, id: String)
    : MediaDeviceInfo("audioinput", label, id) {

    // prefer the human-readable label, fall back to the raw device id
    override fun toString(): String = "(audio)" + label.ifEmpty { id }
}

class VideoInputDeviceInfo(label: String, id: String)
    : MediaDeviceInfo("videoinput", label, id) {

    // prefer the human-readable label, fall back to the raw device id
    override fun toString(): String = "(video)" + label.ifEmpty { id }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@
<body>
<h2>Hand tracking service</h2>

<section id="demos" class="invisible">
<section id="demos" class="invisible" hidden>
<div id="liveView" class="videoView">
<div style="position: relative;">
<video id="webcam" autoplay playsinline></video>
<video id="webcam" autoplay></video>
</div>
</div>
</section>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,36 @@ import {
} from "https://cdn.jsdelivr.net/npm/@mediapipe/[email protected]";

let handLandmarker = undefined;
let nextVideoDeviceId = "";
let isReady = false;

const socket = new WebSocket('ws://localhost:55560');

socket.addEventListener('open', function (event) {
createHandLandmarker();
});

// Routes RPC function-call messages from the Kotlin server to local functions.
// Fix: use the shared SEPARATOR constant instead of duplicating the magic
// string '*,,*' (rpcRun below already builds messages with SEPARATOR).
socket.addEventListener('message', function (event) {
    let message = event.data;

    if (message.startsWith(FUNCTION_CALL_TAG)) {
        let func = message.substring(FUNCTION_CALL_TAG.length);
        let tokens = func.split(SEPARATOR);
        let funcName = tokens[0];

        if (funcName === "setVideoInputDevice") {
            let deviceId = tokens[1];

            // TODO: window["functionName"](arguments);

            setVideoInputDevice(deviceId);
        }
    }
});

const video = document.getElementById("webcam");
let lastVideoTime = -1;

// Before we can use HandLandmarker class we must wait for it to finish
// loading. Machine Learning models can be large and take a moment to
// get everything needed to run.
Expand All @@ -37,21 +64,29 @@ const createHandLandmarker = async () => {
numHands: 2
});

enableCam();
};
createHandLandmarker();
checkMediaDevices();
enableWebcam();

/********************************************************************
// Continuously grab image from webcam stream and detect it.
********************************************************************/
rpcRun("initService");
};

const video = document.getElementById("webcam");
let lastVideoTime = -1;
// Enumerates all media devices known to the browser and reports each one to
// the server via rpcRun, followed by a completion notification.
// Enumeration failures are deliberately ignored (best-effort detection).
function checkMediaDevices() {
    navigator.mediaDevices
        .enumerateDevices()
        .then((deviceList) => {
            for (const d of deviceList) {
                rpcRun("onMediaDeviceDetected", `${d.kind}`, `${d.label}`, `${d.deviceId}`);
            }

            rpcRun("onMediaDeviceDetectionComplete");
        })
        .catch((err) => {
            // ignore `${err.name}: ${err.message}`
        });
}

// Enable the live webcam view and start detection.
function enableCam() {

// getUsermedia parameters.
// Attaches the default webcam stream to the <video> element and starts the
// prediction loop once the first frame is available.
function enableWebcam() {
    const constraints = { video: true };

    // Activate the webcam stream.
    navigator.mediaDevices.getUserMedia(constraints).then((stream) => {
        video.srcObject = stream;
        video.addEventListener("loadeddata", predictWebcam);

        isReady = true;
    });
}

// Switches the live camera stream to the device with the given id.
// Marks the pipeline not-ready while the swap is in progress so
// predictWebcam() does not read from a stopped stream.
// Fix: guard against video.srcObject being null (no stream attached yet),
// which previously threw on getVideoTracks().
function setVideoInputDevice(selectedDeviceId) {
    nextVideoDeviceId = selectedDeviceId;
    isReady = false;

    const constraints = {
        video: {
            deviceId: nextVideoDeviceId,
        },
    };

    // Activate the webcam stream.
    navigator.mediaDevices.getUserMedia(constraints).then((stream) => {
        // stop previous tracks, if any stream was already active
        if (video.srcObject) {
            video.srcObject.getVideoTracks().forEach((track) => {
                track.stop();
            });
        }

        // set new stream
        video.srcObject = stream;
        video.addEventListener("loadeddata", predictWebcam);

        console.log("set stream: " + stream);
        isReady = true;
    });
}

async function predictWebcam() {
if (!isReady)
return;

let startTimeMs = performance.now();
if (lastVideoTime !== video.currentTime) {
lastVideoTime = video.currentTime;
Expand All @@ -80,7 +146,7 @@ async function predictWebcam() {
data += point.x + "," + point.y + "," + point.z + ",";
});

socket.send(data);
rpcRun("onHandInput", data);

id++;
}
Expand All @@ -90,3 +156,27 @@ async function predictWebcam() {
// Call this function again to keep predicting when the browser is ready.
window.requestAnimationFrame(predictWebcam);
}

// the below is a copy-paste since this js file is a module but ../rpc-common.js is not
// include ../rpc-common.js

const SEPARATOR = "*,,*";
const FUNCTION_CALL_TAG = "F_CALL:";
const FUNCTION_RETURN_TAG = "F_RETURN:";

// Sends a function-call message to the server:
// F_CALL:<funcName>*,,*<arg0>*,,*<arg1>*,,*...  (trailing separator included)
function rpcRun(funcName, ...args) {
    const argsString = args.map((a) => a + SEPARATOR).join("");

    socket.send(`${FUNCTION_CALL_TAG}${funcName}${SEPARATOR}${argsString}`);
}

// Placeholder for returning RPC results back to the server; not implemented yet.
function rpcReturn(funcName) {
// TODO: unique id?
//socket.send(`${FUNCTION_RETURN_TAG}${funcName}.F_RESULT:${names}`);
}

0 comments on commit 878533a

Please sign in to comment.