diff --git a/sdk/runanywhere-kotlin/.gitignore b/sdk/runanywhere-kotlin/.gitignore index abb51cae..1b87b619 100644 --- a/sdk/runanywhere-kotlin/.gitignore +++ b/sdk/runanywhere-kotlin/.gitignore @@ -42,3 +42,7 @@ build/jniLibs/ # Local environment local.properties + +# Secrets (NEVER commit real credentials) +secrets.properties +*.secrets.properties diff --git a/sdk/runanywhere-kotlin/build.gradle.kts b/sdk/runanywhere-kotlin/build.gradle.kts index 22fab85b..02e1d556 100644 --- a/sdk/runanywhere-kotlin/build.gradle.kts +++ b/sdk/runanywhere-kotlin/build.gradle.kts @@ -163,6 +163,14 @@ kotlin { // Android target androidTarget { + // Enable publishing Android AAR to Maven + publishLibraryVariants("release") + + // Set correct artifact ID for Android publication + mavenPublication { + artifactId = "runanywhere-sdk-android" + } + compilations.all { compilerOptions.configure { jvmTarget.set(org.jetbrains.kotlin.gradle.dsl.JvmTarget.JVM_17) diff --git a/sdk/runanywhere-kotlin/docs/Documentation.md b/sdk/runanywhere-kotlin/docs/Documentation.md index 5eaeaadb..8cbdc6a9 100644 --- a/sdk/runanywhere-kotlin/docs/Documentation.md +++ b/sdk/runanywhere-kotlin/docs/Documentation.md @@ -6,16 +6,439 @@ Complete API reference for the RunAnywhere Kotlin SDK. All public APIs are acces ## Table of Contents -1. [Core API](#core-api) -2. [Text Generation (LLM)](#text-generation-llm) -3. [Speech-to-Text (STT)](#speech-to-text-stt) -4. [Text-to-Speech (TTS)](#text-to-speech-tts) -5. [Voice Activity Detection (VAD)](#voice-activity-detection-vad) -6. [Voice Agent](#voice-agent) -7. [Model Management](#model-management) -8. [Event System](#event-system) -9. [Types & Enums](#types--enums) -10. [Error Handling](#error-handling) +1. [Quick Start](#quick-start) +2. [Core API](#core-api) +3. [Text Generation (LLM)](#text-generation-llm) +4. [Speech-to-Text (STT)](#speech-to-text-stt) +5. [Text-to-Speech (TTS)](#text-to-speech-tts) +6. 
[Voice Activity Detection (VAD)](#voice-activity-detection-vad) +7. [Voice Agent](#voice-agent) +8. [Model Management](#model-management) +9. [Event System](#event-system) +10. [Types & Enums](#types--enums) +11. [Error Handling](#error-handling) + +--- + +## Quick Start + +### Installation (Maven Central) + +```kotlin +// build.gradle.kts +dependencies { + // Core SDK with native libraries + implementation("io.github.sanchitmonga22:runanywhere-sdk-android:0.16.1") + + // LlamaCPP backend for LLM text generation + implementation("io.github.sanchitmonga22:runanywhere-llamacpp-android:0.16.1") + + // ONNX backend for STT/TTS/VAD + implementation("io.github.sanchitmonga22:runanywhere-onnx-android:0.16.1") +} +``` + +```kotlin +// settings.gradle.kts - add repositories +dependencyResolutionManagement { + repositoriesMode.set(RepositoriesMode.PREFER_SETTINGS) + repositories { + google() + mavenCentral() + // JitPack for transitive dependencies (android-vad, PRDownloader) + maven { url = uri("https://jitpack.io") } + } +} +``` + +### Initialize SDK + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.SDKEnvironment + +// In your Application.onCreate() or Activity +RunAnywhere.initialize(environment = SDKEnvironment.DEVELOPMENT) +``` + +### Register & Load Models + +The starter app uses these specific model IDs and URLs: + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.registerModel +import com.runanywhere.sdk.public.extensions.downloadModel +import com.runanywhere.sdk.public.extensions.loadLLMModel +import com.runanywhere.sdk.public.extensions.loadSTTModel +import com.runanywhere.sdk.public.extensions.loadTTSVoice +import com.runanywhere.sdk.public.extensions.Models.ModelCategory +import com.runanywhere.sdk.core.types.InferenceFramework + +// LLM Model - SmolLM2 360M (small, fast, good for demos) +RunAnywhere.registerModel( + id = "smollm2-360m-instruct-q8_0", + name = 
"SmolLM2 360M Instruct Q8_0", + url = "https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf", + framework = InferenceFramework.LLAMA_CPP, + modality = ModelCategory.LANGUAGE, + memoryRequirement = 400_000_000 // ~400MB +) + +// STT Model - Whisper Tiny English (fast transcription) +RunAnywhere.registerModel( + id = "sherpa-onnx-whisper-tiny.en", + name = "Sherpa Whisper Tiny (ONNX)", + url = "https://github.com/RunanywhereAI/sherpa-onnx/releases/download/runanywhere-models-v1/sherpa-onnx-whisper-tiny.en.tar.gz", + framework = InferenceFramework.ONNX, + modality = ModelCategory.SPEECH_RECOGNITION +) + +// TTS Model - Piper TTS (US English - Medium quality) +RunAnywhere.registerModel( + id = "vits-piper-en_US-lessac-medium", + name = "Piper TTS (US English - Medium)", + url = "https://github.com/RunanywhereAI/sherpa-onnx/releases/download/runanywhere-models-v1/vits-piper-en_US-lessac-medium.tar.gz", + framework = InferenceFramework.ONNX, + modality = ModelCategory.SPEECH_SYNTHESIS +) + +// Download model (returns Flow) +RunAnywhere.downloadModel("smollm2-360m-instruct-q8_0") + .catch { e -> println("Download failed: ${e.message}") } + .collect { progress -> + println("Download: ${(progress.progress * 100).toInt()}%") + } + +// Load model +RunAnywhere.loadLLMModel("smollm2-360m-instruct-q8_0") +``` + +### Text Generation (LLM) + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.chat + +// Simple chat - returns String directly +val response = RunAnywhere.chat("What is AI?") +println(response) +``` + +### Speech-to-Text (STT) + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.transcribe + +// Load STT model +RunAnywhere.loadSTTModel("sherpa-onnx-whisper-tiny.en") + +// Transcribe audio (16kHz, mono, 16-bit PCM ByteArray) +val transcription = RunAnywhere.transcribe(audioData) +println("You said: 
$transcription") +``` + +### Text-to-Speech (TTS) + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.synthesize +import com.runanywhere.sdk.public.extensions.TTS.TTSOptions + +// Load TTS voice +RunAnywhere.loadTTSVoice("vits-piper-en_US-lessac-medium") + +// Synthesize audio - returns TTSOutput with audioData +val output = RunAnywhere.synthesize("Hello, world!", TTSOptions()) +// output.audioData contains WAV audio bytes + +// Play with Android AudioTrack (see example below) +``` + +### Voice Pipeline (STT → LLM → TTS) + +#### Option 1: Streaming Voice Session (Recommended) + +The `streamVoiceSession()` API handles everything automatically: +- Audio level calculation for visualization +- Speech detection (when audio level > threshold) +- Automatic silence detection (triggers processing after 1.5s of silence) +- Full STT → LLM → TTS orchestration +- Continuous conversation mode + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.streamVoiceSession +import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionConfig +import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionEvent +import kotlinx.coroutines.* +import kotlinx.coroutines.flow.* + +// Ensure all 3 models are loaded first +RunAnywhere.loadSTTModel("sherpa-onnx-whisper-tiny.en") +RunAnywhere.loadLLMModel("smollm2-360m-instruct-q8_0") +RunAnywhere.loadTTSVoice("vits-piper-en_US-lessac-medium") + +// Your audio capture Flow (16kHz, mono, 16-bit PCM) +// See AudioCaptureService example below +val audioChunks: Flow<ByteArray> = audioCaptureService.startCapture() + +// Configure voice session +val config = VoiceSessionConfig( + silenceDuration = 1.5, // 1.5 seconds of silence triggers processing + speechThreshold = 0.1f, // Audio level threshold for speech detection + autoPlayTTS = false, // We'll handle playback ourselves + continuousMode = true // Auto-resume listening after each turn +) + +// 
Start the SDK voice session - all business logic is handled by the SDK +sessionJob = scope.launch { + try { + RunAnywhere.streamVoiceSession(audioChunks, config).collect { event -> + when (event) { + is VoiceSessionEvent.Started -> { + sessionState = VoiceSessionState.LISTENING + } + + is VoiceSessionEvent.Listening -> { + audioLevel = event.audioLevel + } + + is VoiceSessionEvent.SpeechStarted -> { + sessionState = VoiceSessionState.SPEECH_DETECTED + } + + is VoiceSessionEvent.Processing -> { + sessionState = VoiceSessionState.PROCESSING + audioLevel = 0f + } + + is VoiceSessionEvent.Transcribed -> { + // User's speech was transcribed + showTranscript(event.text) + } + + is VoiceSessionEvent.Responded -> { + // LLM generated a response + showResponse(event.text) + } + + is VoiceSessionEvent.Speaking -> { + sessionState = VoiceSessionState.SPEAKING + } + + is VoiceSessionEvent.TurnCompleted -> { + // Play the synthesized audio + event.audio?.let { audio -> + sessionState = VoiceSessionState.SPEAKING + playWavAudio(audio) + } + // Resume listening state + sessionState = VoiceSessionState.LISTENING + audioLevel = 0f + } + + is VoiceSessionEvent.Stopped -> { + sessionState = VoiceSessionState.IDLE + audioLevel = 0f + } + + is VoiceSessionEvent.Error -> { + errorMessage = event.message + sessionState = VoiceSessionState.IDLE + } + } + } + } catch (e: CancellationException) { + // Expected when stopping + } catch (e: Exception) { + errorMessage = "Session error: ${e.message}" + sessionState = VoiceSessionState.IDLE + } +} + +// To stop the session: +fun stopSession() { + sessionJob?.cancel() + sessionJob = null + audioCaptureService.stopCapture() + sessionState = VoiceSessionState.IDLE +} +``` + +#### Audio Capture Service (Required for Voice Pipeline) + +```kotlin +import android.media.AudioFormat +import android.media.AudioRecord +import android.media.MediaRecorder +import kotlinx.coroutines.* +import kotlinx.coroutines.channels.awaitClose +import 
kotlinx.coroutines.flow.* + +class AudioCaptureService { + private var audioRecord: AudioRecord? = null + + @Volatile + private var isCapturing = false + + companion object { + const val SAMPLE_RATE = 16000 + const val CHUNK_SIZE_MS = 100 // Emit chunks every 100ms + } + + fun startCapture(): Flow<ByteArray> = callbackFlow { + val bufferSize = AudioRecord.getMinBufferSize( + SAMPLE_RATE, + AudioFormat.CHANNEL_IN_MONO, + AudioFormat.ENCODING_PCM_16BIT + ) + val chunkSize = (SAMPLE_RATE * 2 * CHUNK_SIZE_MS) / 1000 + + try { + audioRecord = AudioRecord( + MediaRecorder.AudioSource.MIC, + SAMPLE_RATE, + AudioFormat.CHANNEL_IN_MONO, + AudioFormat.ENCODING_PCM_16BIT, + maxOf(bufferSize, chunkSize * 2) + ) + + if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) { + close(IllegalStateException("AudioRecord initialization failed")) + return@callbackFlow + } + + audioRecord?.startRecording() + isCapturing = true + + val readJob = launch(Dispatchers.IO) { + val buffer = ByteArray(chunkSize) + while (isActive && isCapturing) { + val bytesRead = audioRecord?.read(buffer, 0, chunkSize) ?: -1 + if (bytesRead > 0) { + trySend(buffer.copyOf(bytesRead)) + } + } + } + + awaitClose { + readJob.cancel() + stopCapture() + } + } catch (e: Exception) { + stopCapture() + close(e) + } + } + + fun stopCapture() { + isCapturing = false + try { + audioRecord?.stop() + audioRecord?.release() + } catch (_: Exception) {} + audioRecord = null + } +} +``` + +#### Play WAV Audio (Required for Voice Pipeline) + +```kotlin +import android.media.AudioAttributes +import android.media.AudioFormat +import android.media.AudioTrack +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.delay +import kotlinx.coroutines.withContext + +suspend fun playWavAudio(wavData: ByteArray) = withContext(Dispatchers.IO) { + if (wavData.size < 44) return@withContext + + val headerSize = if (wavData.size > 44 && + wavData[0] == 'R'.code.toByte() && + wavData[1] == 'I'.code.toByte()) 44 else 0 + + val pcmData = 
wavData.copyOfRange(headerSize, wavData.size) + val sampleRate = 22050 // Piper TTS default sample rate + + val bufferSize = AudioTrack.getMinBufferSize( + sampleRate, AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT + ) + + val audioTrack = AudioTrack.Builder() + .setAudioAttributes( + AudioAttributes.Builder() + .setUsage(AudioAttributes.USAGE_MEDIA) + .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .build() + ) + .setAudioFormat( + AudioFormat.Builder() + .setSampleRate(sampleRate) + .setEncoding(AudioFormat.ENCODING_PCM_16BIT) + .setChannelMask(AudioFormat.CHANNEL_OUT_MONO) + .build() + ) + .setBufferSizeInBytes(maxOf(bufferSize, pcmData.size)) + .setTransferMode(AudioTrack.MODE_STATIC) + .build() + + audioTrack.write(pcmData, 0, pcmData.size) + audioTrack.play() + + val durationMs = (pcmData.size.toLong() * 1000) / (sampleRate * 2) + delay(durationMs + 100) + + audioTrack.stop() + audioTrack.release() +} +``` + +#### Option 2: Manual Processing + +For more control, use `processVoice()` with your own silence detection: + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.processVoice + +// Record audio (app responsibility - use AudioRecord) +val audioData: ByteArray = recordAudio() // 16kHz, mono, 16-bit PCM + +// Process through full pipeline - SDK handles orchestration +val result = RunAnywhere.processVoice(audioData) + +if (result.speechDetected) { + println("You said: ${result.transcription}") + println("AI response: ${result.response}") + + // Play synthesized audio (app responsibility) + result.synthesizedAudio?.let { playWavAudio(it) } +} +``` + +### Voice Session Events + +| Event | Description | +|-------|-------------| +| `Started` | Session started and ready | +| `Listening(audioLevel)` | Listening with real-time audio level (0.0 - 1.0) | +| `SpeechStarted` | Speech detected, accumulating audio | +| `Processing` | Silence detected, processing audio | +| `Transcribed(text)` | STT 
completed | +| `Responded(text)` | LLM response generated | +| `Speaking` | Playing TTS audio | +| `TurnCompleted(transcript, response, audio)` | Full turn complete with audio | +| `Stopped` | Session ended | +| `Error(message)` | Error occurred | + +### Complete Voice Pipeline Example + +See the Kotlin Starter Example app for a complete working implementation: +`starter_apps/kotlinstarterexample/app/src/main/java/com/runanywhere/kotlin_starter_example/ui/screens/VoicePipelineScreen.kt` --- @@ -714,12 +1137,38 @@ data class VoiceAgentConfiguration( ```kotlin sealed class VoiceSessionEvent { - object Listening : VoiceSessionEvent() + /** Session started and ready */ + data object Started : VoiceSessionEvent() + + /** Listening for speech with current audio level (0.0 - 1.0) */ + data class Listening(val audioLevel: Float) : VoiceSessionEvent() + + /** Speech detected, started accumulating audio */ + data object SpeechStarted : VoiceSessionEvent() + + /** Speech ended, processing audio */ + data object Processing : VoiceSessionEvent() + + /** Got transcription from STT */ data class Transcribed(val text: String) : VoiceSessionEvent() - object Thinking : VoiceSessionEvent() + + /** Got response from LLM */ data class Responded(val text: String) : VoiceSessionEvent() - object Speaking : VoiceSessionEvent() - object Idle : VoiceSessionEvent() + + /** Playing TTS audio */ + data object Speaking : VoiceSessionEvent() + + /** Complete turn result with transcript, response, and audio */ + data class TurnCompleted( + val transcript: String, + val response: String, + val audio: ByteArray? 
+ ) : VoiceSessionEvent() + + /** Session stopped */ + data object Stopped : VoiceSessionEvent() + + /** Error occurred */ data class Error(val message: String) : VoiceSessionEvent() } ``` @@ -728,13 +1177,14 @@ sealed class VoiceSessionEvent { ```kotlin data class VoiceAgentResult( - val transcription: String, - val response: String, - val audioData: ByteArray?, - val totalLatencyMs: Double, - val sttLatencyMs: Double, - val llmLatencyMs: Double, - val ttsLatencyMs: Double + /** Whether speech was detected in the input audio */ + val speechDetected: Boolean = false, + /** Transcribed text from STT */ + val transcription: String? = null, + /** Generated response text from LLM */ + val response: String? = null, + /** Synthesized audio data from TTS (WAV format) */ + val synthesizedAudio: ByteArray? = null ) ``` @@ -1181,57 +1631,142 @@ Common error codes include: ## Usage Examples -### Complete LLM Chat +### Complete LLM Chat (Matching Starter App) ```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.SDKEnvironment +import com.runanywhere.sdk.public.extensions.* +import com.runanywhere.sdk.public.extensions.Models.ModelCategory +import com.runanywhere.sdk.core.types.InferenceFramework + // Initialize RunAnywhere.initialize(environment = SDKEnvironment.DEVELOPMENT) -// Register and download model -val model = RunAnywhere.registerModel( - name = "Qwen 0.5B", - url = "https://huggingface.co/...", - framework = InferenceFramework.LLAMA_CPP +// Register model (same as starter app) +RunAnywhere.registerModel( + id = "smollm2-360m-instruct-q8_0", + name = "SmolLM2 360M Instruct Q8_0", + url = "https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf", + framework = InferenceFramework.LLAMA_CPP, + modality = ModelCategory.LANGUAGE, + memoryRequirement = 400_000_000 ) -RunAnywhere.downloadModel(model.id).collect { progress -> - println("Download: ${(progress.progress * 100).toInt()}%") 
-} +// Download model +RunAnywhere.downloadModel("smollm2-360m-instruct-q8_0") + .catch { e -> println("Download failed: ${e.message}") } + .collect { progress -> + println("Download: ${(progress.progress * 100).toInt()}%") + } // Load and use -RunAnywhere.loadLLMModel(model.id) +RunAnywhere.loadLLMModel("smollm2-360m-instruct-q8_0") -val result = RunAnywhere.generate( - prompt = "Explain AI in simple terms", - options = LLMGenerationOptions(maxTokens = 200) -) -println("Response: ${result.text}") -println("Speed: ${result.tokensPerSecond} tok/s") +// Simple chat (returns String) +val response = RunAnywhere.chat("Explain AI in simple terms") +println("Response: $response") // Cleanup RunAnywhere.unloadLLMModel() ``` -### Voice Agent Session +### Complete STT Example (Matching Starter App) ```kotlin -// Configure -RunAnywhere.configureVoiceAgent(VoiceAgentConfiguration( - sttModelId = "whisper-tiny", - llmModelId = "qwen-0.5b", - ttsVoiceId = "en-us-default" -)) +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.* + +// Register STT model +RunAnywhere.registerModel( + id = "sherpa-onnx-whisper-tiny.en", + name = "Sherpa Whisper Tiny (ONNX)", + url = "https://github.com/RunanywhereAI/sherpa-onnx/releases/download/runanywhere-models-v1/sherpa-onnx-whisper-tiny.en.tar.gz", + framework = InferenceFramework.ONNX, + modality = ModelCategory.SPEECH_RECOGNITION +) -// Start session -lifecycleScope.launch { - RunAnywhere.startVoiceSession().collect { event -> - when (event) { - is VoiceSessionEvent.Listening -> updateUI("Listening...") - is VoiceSessionEvent.Transcribed -> updateUI("You: ${event.text}") - is VoiceSessionEvent.Thinking -> updateUI("Thinking...") - is VoiceSessionEvent.Responded -> updateUI("AI: ${event.text}") - is VoiceSessionEvent.Speaking -> updateUI("Speaking...") - is VoiceSessionEvent.Error -> showError(event.message) +// Download and load +RunAnywhere.downloadModel("sherpa-onnx-whisper-tiny.en").collect { 
progress -> + println("Download: ${(progress.progress * 100).toInt()}%") +} +RunAnywhere.loadSTTModel("sherpa-onnx-whisper-tiny.en") + +// Transcribe audio (16kHz, mono, 16-bit PCM) +val transcription = RunAnywhere.transcribe(audioData) +println("You said: $transcription") +``` + +### Complete TTS Example (Matching Starter App) + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.* +import com.runanywhere.sdk.public.extensions.TTS.TTSOptions + +// Register TTS model +RunAnywhere.registerModel( + id = "vits-piper-en_US-lessac-medium", + name = "Piper TTS (US English - Medium)", + url = "https://github.com/RunanywhereAI/sherpa-onnx/releases/download/runanywhere-models-v1/vits-piper-en_US-lessac-medium.tar.gz", + framework = InferenceFramework.ONNX, + modality = ModelCategory.SPEECH_SYNTHESIS +) + +// Download and load +RunAnywhere.downloadModel("vits-piper-en_US-lessac-medium").collect { progress -> + println("Download: ${(progress.progress * 100).toInt()}%") +} +RunAnywhere.loadTTSVoice("vits-piper-en_US-lessac-medium") + +// Synthesize audio +val output = RunAnywhere.synthesize("Hello, world!", TTSOptions()) +// output.audioData contains WAV audio bytes + +// Play with playWavAudio() helper (see Voice Pipeline section) +playWavAudio(output.audioData) +``` + +### Voice Pipeline Session (Matching Starter App) + +```kotlin +import com.runanywhere.sdk.public.RunAnywhere +import com.runanywhere.sdk.public.extensions.* +import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionConfig +import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionEvent + +// Ensure all 3 models are loaded +val allModelsLoaded = RunAnywhere.isLLMModelLoaded() && + RunAnywhere.isSTTModelLoaded() && + RunAnywhere.isTTSVoiceLoaded() + +if (allModelsLoaded) { + // Create audio capture flow + val audioCaptureService = AudioCaptureService() + val audioChunks = audioCaptureService.startCapture() + + // Configure and start 
session + val config = VoiceSessionConfig( + silenceDuration = 1.5, + speechThreshold = 0.1f, + autoPlayTTS = false, + continuousMode = true + ) + + scope.launch { + RunAnywhere.streamVoiceSession(audioChunks, config).collect { event -> + when (event) { + is VoiceSessionEvent.Listening -> updateAudioLevel(event.audioLevel) + is VoiceSessionEvent.SpeechStarted -> showSpeechDetected() + is VoiceSessionEvent.Processing -> showProcessing() + is VoiceSessionEvent.Transcribed -> showTranscript(event.text) + is VoiceSessionEvent.Responded -> showResponse(event.text) + is VoiceSessionEvent.TurnCompleted -> { + event.audio?.let { playWavAudio(it) } + } + is VoiceSessionEvent.Error -> showError(event.message) + else -> { } + } } } } diff --git a/sdk/runanywhere-kotlin/docs/KOTLIN_MAVEN_CENTRAL_PUBLISHING.md b/sdk/runanywhere-kotlin/docs/KOTLIN_MAVEN_CENTRAL_PUBLISHING.md new file mode 100644 index 00000000..40549819 --- /dev/null +++ b/sdk/runanywhere-kotlin/docs/KOTLIN_MAVEN_CENTRAL_PUBLISHING.md @@ -0,0 +1,104 @@ +# Kotlin SDK - Maven Central Publishing Guide + +Quick reference for publishing RunAnywhere Kotlin SDK to Maven Central. + +--- + +## Published Artifacts + +| Artifact | Description | +|----------|-------------| +| `io.github.sanchitmonga22:runanywhere-sdk-android` | Core SDK (AAR with native libs) | +| `io.github.sanchitmonga22:runanywhere-llamacpp-android` | LLM backend (AAR with native libs) | +| `io.github.sanchitmonga22:runanywhere-onnx-android` | STT/TTS backend (AAR with native libs) | + +--- + +## Quick Release (CI/CD) + +1. Go to **GitHub Actions** → **Publish to Maven Central** +2. Click **Run workflow** +3. Enter version (e.g., `0.17.0`) +4. Click **Run workflow** +5. Monitor progress, then verify on [central.sonatype.com](https://central.sonatype.com/search?q=io.github.sanchitmonga22) + +--- + +## Local Release + +### 1. 
Setup (One-Time) + +Copy credentials to `~/.gradle/gradle.properties`: +```properties +mavenCentral.username=YOUR_SONATYPE_USERNAME +mavenCentral.password=YOUR_SONATYPE_TOKEN +signing.gnupg.executable=gpg +signing.gnupg.useLegacyGpg=false +signing.gnupg.keyName=YOUR_GPG_KEY_ID +signing.gnupg.passphrase=YOUR_GPG_PASSPHRASE +``` + +### 2. Publish + +```bash +cd sdks/sdk/runanywhere-kotlin + +# Set version and publish +export SDK_VERSION=0.17.0 +./gradlew clean publishAllPublicationsToMavenCentralRepository +./gradlew :modules:runanywhere-core-llamacpp:publishAllPublicationsToMavenCentralRepository +./gradlew :modules:runanywhere-core-onnx:publishAllPublicationsToMavenCentralRepository +``` + +### 3. Verify + +Check [central.sonatype.com](https://central.sonatype.com/search?q=io.github.sanchitmonga22) (may take 30 min to sync). + +--- + +## GitHub Secrets Required + +| Secret | Description | +|--------|-------------| +| `MAVEN_CENTRAL_USERNAME` | Sonatype Central Portal token username | +| `MAVEN_CENTRAL_PASSWORD` | Sonatype Central Portal token | +| `GPG_KEY_ID` | Last 8 chars of GPG key fingerprint | +| `GPG_SIGNING_KEY` | Full armored GPG private key | +| `GPG_SIGNING_PASSWORD` | GPG key passphrase | + +--- + +## Consumer Usage + +```kotlin +// settings.gradle.kts +repositories { + mavenCentral() + maven { url = uri("https://jitpack.io") } // for transitive deps +} + +// build.gradle.kts +dependencies { + implementation("io.github.sanchitmonga22:runanywhere-sdk-android:0.17.0") + implementation("io.github.sanchitmonga22:runanywhere-llamacpp-android:0.17.0") + implementation("io.github.sanchitmonga22:runanywhere-onnx-android:0.17.0") +} +``` + +--- + +## Troubleshooting + +| Error | Fix | +|-------|-----| +| GPG signature verification failed | Upload key to `keys.openpgp.org` AND verify email | +| 403 Forbidden | Verify namespace at central.sonatype.com | +| Missing native libs in AAR | Ensure `publishLibraryVariants("release")` in build.gradle.kts | + +--- + +## 
Key URLs + +- **Central Portal**: https://central.sonatype.com +- **Search Artifacts**: https://central.sonatype.com/search?q=io.github.sanchitmonga22 +- **GPG Keyserver**: https://keys.openpgp.org diff --git a/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/build.gradle.kts b/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/build.gradle.kts index 3f65d011..7ee6f723 100644 --- a/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/build.gradle.kts +++ b/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/build.gradle.kts @@ -77,6 +77,14 @@ kotlin { } androidTarget { + // Enable publishing Android AAR to Maven + publishLibraryVariants("release") + + // Set correct artifact ID for Android publication + mavenPublication { + artifactId = "runanywhere-llamacpp-android" + } + compilations.all { kotlinOptions.jvmTarget = "17" } diff --git a/sdk/runanywhere-kotlin/modules/runanywhere-core-onnx/build.gradle.kts b/sdk/runanywhere-kotlin/modules/runanywhere-core-onnx/build.gradle.kts index 519daf64..001c241e 100644 --- a/sdk/runanywhere-kotlin/modules/runanywhere-core-onnx/build.gradle.kts +++ b/sdk/runanywhere-kotlin/modules/runanywhere-core-onnx/build.gradle.kts @@ -81,6 +81,14 @@ kotlin { } androidTarget { + // Enable publishing Android AAR to Maven + publishLibraryVariants("release") + + // Set correct artifact ID for Android publication + mavenPublication { + artifactId = "runanywhere-onnx-android" + } + compilations.all { kotlinOptions.jvmTarget = "17" } diff --git a/sdk/runanywhere-kotlin/secrets.template.properties b/sdk/runanywhere-kotlin/secrets.template.properties new file mode 100644 index 00000000..284c490d --- /dev/null +++ b/sdk/runanywhere-kotlin/secrets.template.properties @@ -0,0 +1,78 @@ +# ============================================================================= +# RunAnywhere SDK - Maven Central Publishing Secrets +# ============================================================================= +# +# INSTRUCTIONS: +# 1. 
Copy this file to ~/.gradle/gradle.properties (for local publishing) +# 2. Or add these as GitHub Secrets (for CI/CD publishing) +# 3. NEVER commit this file with real values to git! +# +# ============================================================================= + +# ----------------------------------------------------------------------------- +# MAVEN CENTRAL (Sonatype Central Portal) +# ----------------------------------------------------------------------------- +# Get these from: https://central.sonatype.com → Settings → Generate User Token +# +mavenCentral.username=REPLACE_WITH_SONATYPE_TOKEN_USERNAME +mavenCentral.password=REPLACE_WITH_SONATYPE_TOKEN_PASSWORD + +# ----------------------------------------------------------------------------- +# GPG SIGNING +# ----------------------------------------------------------------------------- +# Key ID: Last 8 characters of your GPG key fingerprint +# Run: gpg --list-secret-keys --keyid-format LONG +# Example output: sec rsa4096/ABCD1234EFGH5678 → Key ID is EFGH5678 +# +signing.gnupg.executable=gpg +signing.gnupg.useLegacyGpg=false +signing.gnupg.keyName=REPLACE_WITH_GPG_KEY_ID +signing.gnupg.passphrase=REPLACE_WITH_GPG_PASSPHRASE + +# ----------------------------------------------------------------------------- +# GITHUB SECRETS (for CI/CD - copy values to GitHub repo secrets) +# ----------------------------------------------------------------------------- +# Secret Name | Value +# ---------------------------|-------------------------------------------------- +# MAVEN_CENTRAL_USERNAME | (same as mavenCentral.username above) +# MAVEN_CENTRAL_PASSWORD | (same as mavenCentral.password above) +# GPG_KEY_ID | (same as signing.gnupg.keyName above) +# GPG_SIGNING_PASSWORD | (same as signing.gnupg.passphrase above) +# GPG_SIGNING_KEY | Full armored GPG private key (see below) +# +# To export GPG_SIGNING_KEY: +# gpg --armor --export-secret-keys YOUR_KEY_ID +# +# The output looks like: +# -----BEGIN PGP PRIVATE KEY 
BLOCK----- +# ...base64 encoded key... +# -----END PGP PRIVATE KEY BLOCK----- +# +# Copy the ENTIRE output (including BEGIN/END lines) as the secret value. + +# ----------------------------------------------------------------------------- +# GITHUB PACKAGES (Optional - backup repository) +# ----------------------------------------------------------------------------- +# gpr.user=REPLACE_WITH_GITHUB_USERNAME +# gpr.token=REPLACE_WITH_GITHUB_PAT + +# ============================================================================= +# HOW TO GET CREDENTIALS +# ============================================================================= +# +# 1. SONATYPE CENTRAL PORTAL: +# - Go to https://central.sonatype.com +# - Sign in (use GitHub OAuth recommended) +# - Settings → Generate User Token +# - Save both username and password/token +# +# 2. GPG KEY: +# - Install GPG: brew install gnupg +# - Generate key: gpg --full-generate-key (RSA 4096, no expiry) +# - List keys: gpg --list-secret-keys --keyid-format LONG +# - Upload to keyservers: +# gpg --keyserver keyserver.ubuntu.com --send-keys YOUR_KEY_ID +# gpg --keyserver keys.openpgp.org --send-keys YOUR_KEY_ID +# - IMPORTANT: Verify email at keys.openpgp.org for Maven Central +# +# ============================================================================= diff --git a/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.kt b/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.kt index efb252b2..96b370c1 100644 --- a/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.kt +++ b/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.kt @@ -93,6 +93,54 @@ expect fun RunAnywhere.startVoiceSession( config: VoiceSessionConfig = VoiceSessionConfig.DEFAULT, ): Flow +/** + * Stream a voice session with automatic silence 
detection. + * + * This is the recommended API for voice pipelines. It handles: + * - Audio level calculation for visualization + * - Speech detection (when audio level > threshold) + * - Automatic silence detection (triggers processing after silence duration) + * - STT → LLM → TTS pipeline orchestration + * - Continuous conversation mode (auto-resumes listening after TTS) + * + * The app only needs to: + * 1. Capture audio and emit chunks to the input Flow + * 2. Collect events to update UI + * 3. Play audio when TurnCompleted event is received (if autoPlayTTS is false) + * + * Example: + * ```kotlin + * // Audio capture Flow from your audio service + * val audioChunks: Flow<ByteArray> = audioCaptureService.startCapture() + * + * RunAnywhere.streamVoiceSession(audioChunks) + *     .collect { event -> + *         when (event) { + *             is VoiceSessionEvent.Started -> showListeningUI() + *             is VoiceSessionEvent.Listening -> updateAudioLevel(event.audioLevel) + *             is VoiceSessionEvent.SpeechStarted -> showSpeechDetected() + *             is VoiceSessionEvent.Processing -> showProcessingUI() + *             is VoiceSessionEvent.Transcribed -> showTranscript(event.text) + *             is VoiceSessionEvent.Responded -> showResponse(event.text) + *             is VoiceSessionEvent.TurnCompleted -> { + *                 // Play audio if autoPlayTTS is false + *                 event.audio?.let { playAudio(it) } + *             } + *             is VoiceSessionEvent.Stopped -> showIdleUI() + *             is VoiceSessionEvent.Error -> showError(event.message) + *         } + *     } + * ``` + * + * @param audioChunks Flow of audio chunks (16kHz, mono, 16-bit PCM) + * @param config Session configuration (silence duration, speech threshold, etc.) + * @return Flow of voice session events + */ +expect fun RunAnywhere.streamVoiceSession( + audioChunks: Flow<ByteArray>, + config: VoiceSessionConfig = VoiceSessionConfig.DEFAULT, +): Flow<VoiceSessionEvent> + +/** + * Stop the current voice session. 
*/ diff --git a/sdk/runanywhere-kotlin/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.jvmAndroid.kt b/sdk/runanywhere-kotlin/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.jvmAndroid.kt index 945d44db..9a6f1e2e 100644 --- a/sdk/runanywhere-kotlin/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.jvmAndroid.kt +++ b/sdk/runanywhere-kotlin/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/public/extensions/RunAnywhere+VoiceAgent.jvmAndroid.kt @@ -22,8 +22,17 @@ import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceAgentConfiguration import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceAgentResult import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionConfig import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionEvent +import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.channelFlow import kotlinx.coroutines.flow.flow +import kotlinx.coroutines.isActive +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext +import java.io.ByteArrayOutputStream +import java.nio.ByteBuffer +import java.nio.ByteOrder +import kotlin.math.sqrt private val voiceAgentLogger = SDKLogger.voiceAgent @@ -251,3 +260,204 @@ actual suspend fun RunAnywhere.setVoiceSystemPrompt(prompt: String) { currentSystemPrompt = prompt } + +/** + * Stream a voice session with automatic silence detection. 
+ * + * This implementation handles: + * - Audio level calculation (RMS) for visualization + * - Speech detection when audio level exceeds threshold + * - Automatic silence detection - triggers processing after configured silence duration + * - Full STT → LLM → TTS pipeline orchestration + * - Continuous conversation mode - auto-resumes after TTS completion + */ +actual fun RunAnywhere.streamVoiceSession( + audioChunks: Flow, + config: VoiceSessionConfig, +): Flow = channelFlow { + if (!isInitialized) { + send(VoiceSessionEvent.Error("SDK not initialized")) + return@channelFlow + } + + // Check if all components are loaded + if (!areAllComponentsLoaded()) { + val missing = getMissingComponents() + send(VoiceSessionEvent.Error("Models not loaded: ${missing.joinToString(", ")}")) + return@channelFlow + } + + voiceAgentLogger.info("Starting streaming voice session with auto-silence detection") + send(VoiceSessionEvent.Started) + + // Session state + val audioBuffer = ByteArrayOutputStream() + var isSpeechActive = false + var lastSpeechTime = 0L + var isProcessingTurn = false + val minAudioBytes = 16000 // ~0.5s at 16kHz, 16-bit + val silenceDurationMs = (config.silenceDuration * 1000).toLong() + + /** + * Calculate RMS (Root Mean Square) for audio level visualization + */ + fun calculateRMS(audioData: ByteArray): Float { + if (audioData.isEmpty()) return 0f + val shorts = ByteBuffer.wrap(audioData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer() + var sum = 0.0 + while (shorts.hasRemaining()) { + val sample = shorts.get().toFloat() / Short.MAX_VALUE + sum += sample * sample + } + return sqrt(sum / (audioData.size / 2)).toFloat() + } + + /** + * Normalize audio level for visualization (0.0 to 1.0) + */ + fun normalizeAudioLevel(rms: Float): Float = (rms * 3.0f).coerceIn(0f, 1f) + + /** + * Process accumulated audio through the voice pipeline + */ + suspend fun processAudio(): Boolean { + if (isProcessingTurn) return false + isProcessingTurn = true + + val audioData = 
synchronized(audioBuffer) { + val data = audioBuffer.toByteArray() + audioBuffer.reset() + data + } + + if (audioData.size < minAudioBytes) { + voiceAgentLogger.debug("Audio too short to process (${audioData.size} bytes)") + isProcessingTurn = false + return false + } + + voiceAgentLogger.info("Processing ${audioData.size} bytes through voice pipeline") + send(VoiceSessionEvent.Processing) + + try { + // Step 1: Transcribe audio using STT + val transcriptionResult = withContext(Dispatchers.Default) { + CppBridgeSTT.transcribe(audioData) + } + val transcriptionText = transcriptionResult.text + + if (transcriptionText.isBlank()) { + voiceAgentLogger.debug("No speech detected in audio") + isProcessingTurn = false + return false + } + + voiceAgentLogger.info("Transcription: ${transcriptionText.take(100)}") + send(VoiceSessionEvent.Transcribed(transcriptionText)) + + // Step 2: Generate response using LLM + val systemPrompt = currentSystemPrompt ?: "You are a helpful voice assistant." + val chatPrompt = "$systemPrompt\n\nUser: $transcriptionText\n\nAssistant:" + val generationResult = withContext(Dispatchers.Default) { + CppBridgeLLM.generate(chatPrompt) + } + val responseText = generationResult.text + + voiceAgentLogger.info("Response: ${responseText.take(100)}") + send(VoiceSessionEvent.Responded(responseText)) + + // Step 3: Synthesize speech using TTS + var audioOutput: ByteArray? 
= null + if (responseText.isNotBlank()) { + send(VoiceSessionEvent.Speaking) + val synthesisResult = withContext(Dispatchers.Default) { + CppBridgeTTS.synthesize(responseText) + } + audioOutput = synthesisResult.audioData + voiceAgentLogger.debug("TTS synthesis complete: ${audioOutput.size} bytes") + } + + // Emit turn completed with audio for app to play + send(VoiceSessionEvent.TurnCompleted(transcriptionText, responseText, audioOutput)) + + isProcessingTurn = false + return true + } catch (e: Exception) { + voiceAgentLogger.error("Voice processing error: ${e.message}", throwable = e) + send(VoiceSessionEvent.Error("Processing error: ${e.message}")) + isProcessingTurn = false + return false + } + } + + // Main audio processing loop + var lastCheckTime = System.currentTimeMillis() + + try { + audioChunks.collect { chunk -> + if (!isActive || isProcessingTurn) return@collect + + // Accumulate audio + synchronized(audioBuffer) { + audioBuffer.write(chunk) + } + + // Calculate and emit audio level + val rms = calculateRMS(chunk) + val normalizedLevel = normalizeAudioLevel(rms) + send(VoiceSessionEvent.Listening(normalizedLevel)) + + // Speech detection + if (normalizedLevel > config.speechThreshold) { + if (!isSpeechActive) { + voiceAgentLogger.debug("Speech started (level: $normalizedLevel)") + isSpeechActive = true + send(VoiceSessionEvent.SpeechStarted) + } + lastSpeechTime = System.currentTimeMillis() + } + + // Silence detection - check periodically + val now = System.currentTimeMillis() + if (now - lastCheckTime >= 50) { // Check every 50ms + lastCheckTime = now + + if (isSpeechActive && lastSpeechTime > 0) { + if (normalizedLevel <= config.speechThreshold) { + val silenceTime = now - lastSpeechTime + if (silenceTime > silenceDurationMs) { + voiceAgentLogger.debug("Speech ended after ${silenceTime}ms of silence") + isSpeechActive = false + + // Process accumulated audio + val processed = processAudio() + + // If continuous mode, reset for next turn + if 
(config.continuousMode && processed) { + lastSpeechTime = 0L + // Continue collecting audio for next turn + } + } + } + } + } + } + } catch (e: kotlinx.coroutines.CancellationException) { + voiceAgentLogger.debug("Voice session cancelled") + } catch (e: Exception) { + voiceAgentLogger.error("Voice session error: ${e.message}", throwable = e) + send(VoiceSessionEvent.Error("Session error: ${e.message}")) + } + + // Process any remaining audio when stream ends + if (!isProcessingTurn) { + val remainingSize = synchronized(audioBuffer) { audioBuffer.size() } + if (remainingSize >= minAudioBytes) { + voiceAgentLogger.info("Processing remaining audio on stream end") + processAudio() + } + } + + send(VoiceSessionEvent.Stopped) + voiceAgentLogger.info("Voice session ended") +}