# Fix CI: install ffmpeg dev libraries for ffmpeg-sys-next #7
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Integration test for the Python audio server started via python/main.py.
#
# The job starts the server five times with different combinations of the
# TTS_CUSTOMVOICE_MODEL_PATH / TTS_BASE_MODEL_PATH / ASR_MODEL_PATH environment
# variables and exercises the HTTP API on localhost:8000 with curl:
#   Phase 1: CustomVoice + Base  -> named-voice speech and voice cloning work
#   Phase 2: CustomVoice only    -> named voice works, audio_sample gets HTTP 400
#   Phase 3: Base only           -> cloning works, named voice gets HTTP 400
#   Phase 4: ASR only            -> transcription works, TTS gets HTTP 400
#   Phase 5: CustomVoice + ASR   -> text -> speech -> text round trip
# Generated audio and transcriptions are uploaded as artifacts even on failure.
name: CI (Python)

on:
  push:
  pull_request:

jobs:
  test-api:
    runs-on: ubuntu-latest
    timeout-minutes: 90
    env:
      # Passed to every step; presumably read by the server to select
      # CPU / float32 inference -- confirm against python/main.py.
      QWEN_TTS_DEVICE: cpu
      QWEN_TTS_DTYPE: float32
      QWEN_TTS_ATTN: ""
    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v6
        with:
          version: "latest"

      - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y ffmpeg

      - name: Install dependencies
        run: uv sync --project python

      # The three model directories are cached under one static key; bump the
      # key suffix to force a fresh download.
      - name: Cache models
        id: cache-models
        uses: actions/cache@v4
        with:
          path: |
            models/Qwen3-TTS-12Hz-0.6B-CustomVoice
            models/Qwen3-TTS-12Hz-0.6B-Base
            models/Qwen3-ASR-0.6B
          key: qwen-audio-models-0.6B-v2

      - name: Download models
        if: steps.cache-models.outputs.cache-hit != 'true'
        run: |
          mkdir -p models
          uv run --project python huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice \
            --local-dir ./models/Qwen3-TTS-12Hz-0.6B-CustomVoice
          uv run --project python huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-Base \
            --local-dir ./models/Qwen3-TTS-12Hz-0.6B-Base
          uv run --project python huggingface-cli download Qwen/Qwen3-ASR-0.6B \
            --local-dir ./models/Qwen3-ASR-0.6B

      - name: Create output directory
        run: mkdir -p artifacts

      # ==================================================================
      # Phase 1: Both TTS models loaded
      # ==================================================================
      - name: "Phase 1: Start server (both TTS models)"
        run: |
          # Start the server in the background and remember its PID so the
          # matching "Stop server" step can terminate it.
          TTS_CUSTOMVOICE_MODEL_PATH=./models/Qwen3-TTS-12Hz-0.6B-CustomVoice \
          TTS_BASE_MODEL_PATH=./models/Qwen3-TTS-12Hz-0.6B-Base \
            uv run --project python python python/main.py &
          echo "$!" > /tmp/server.pid
          echo "Waiting for server..."
          # Poll /health up to 120 times, 2 seconds apart.
          for i in $(seq 1 120); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server is ready"
              break
            fi
            sleep 2
          done
          # Final probe: fails the step if the server never became healthy.
          curl -sf http://localhost:8000/health

      - name: "Phase 1: Generate English speech with Vivian"
        run: |
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{
              "model": "qwen3-tts",
              "input": "Hello, this is Vivian speaking English for the integration test.",
              "voice": "Vivian",
              "language": "English",
              "response_format": "wav"
            }' \
            --output artifacts/phase1_vivian_english.wav
          echo "Generated phase1_vivian_english.wav ($(stat --format=%s artifacts/phase1_vivian_english.wav) bytes)"

      - name: "Phase 1: Generate Chinese speech with Vivian"
        run: |
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{
              "model": "qwen3-tts",
              "input": "你好,这是Vivian的中文语音合成测试。",
              "voice": "Vivian",
              "language": "Chinese",
              "response_format": "wav"
            }' \
            --output artifacts/phase1_vivian_chinese.wav
          echo "Generated phase1_vivian_chinese.wav ($(stat --format=%s artifacts/phase1_vivian_chinese.wav) bytes)"

      # Cloning requests are multipart forms: the reference audio is the file
      # produced by the preceding generation step, with its transcript.
      - name: "Phase 1: Clone English voice from Vivian sample"
        run: |
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -F model=qwen3-tts \
            -F "input=This sentence clones the Vivian English voice using audio_sample." \
            -F audio_sample=@artifacts/phase1_vivian_english.wav \
            -F "audio_sample_text=Hello, this is Vivian speaking English for the integration test." \
            -F language=English \
            -F response_format=wav \
            --output artifacts/phase1_clone_english.wav
          echo "Generated phase1_clone_english.wav ($(stat --format=%s artifacts/phase1_clone_english.wav) bytes)"

      - name: "Phase 1: Clone Chinese voice from Vivian sample"
        run: |
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -F model=qwen3-tts \
            -F "input=这段语音使用了Vivian的中文音频样本进行声音克隆。" \
            -F audio_sample=@artifacts/phase1_vivian_chinese.wav \
            -F "audio_sample_text=你好,这是Vivian的中文语音合成测试。" \
            -F language=Chinese \
            -F response_format=wav \
            --output artifacts/phase1_clone_chinese.wav
          echo "Generated phase1_clone_chinese.wav ($(stat --format=%s artifacts/phase1_clone_chinese.wav) bytes)"

      - name: "Phase 1: Stop server"
        run: |
          pid="$(cat /tmp/server.pid)"
          kill "$pid"
          # Wait (up to 30 s) for the process to exit rather than sleeping a
          # fixed time, so the next phase cannot be answered by a server that
          # is still shutting down on port 8000.
          for _ in $(seq 1 30); do
            kill -0 "$pid" 2>/dev/null || break
            sleep 1
          done

      # ==================================================================
      # Phase 2: CustomVoice model only
      # ==================================================================
      - name: "Phase 2: Start server (CustomVoice only)"
        run: |
          TTS_CUSTOMVOICE_MODEL_PATH=./models/Qwen3-TTS-12Hz-0.6B-CustomVoice \
            uv run --project python python python/main.py &
          echo "$!" > /tmp/server.pid
          echo "Waiting for server..."
          # Poll /health up to 120 times, 2 seconds apart.
          for i in $(seq 1 120); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server is ready"
              break
            fi
            sleep 2
          done
          # Final probe: fails the step if the server never became healthy.
          curl -sf http://localhost:8000/health

      - name: "Phase 2: Generate English speech with Ryan"
        run: |
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{
              "model": "qwen3-tts",
              "input": "Hello, this is Ryan speaking English with only the CustomVoice model loaded.",
              "voice": "Ryan",
              "language": "English",
              "response_format": "wav"
            }' \
            --output artifacts/phase2_ryan_english.wav
          echo "Generated phase2_ryan_english.wav ($(stat --format=%s artifacts/phase2_ryan_english.wav) bytes)"

      - name: "Phase 2: Generate Chinese speech with Ryan"
        run: |
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{
              "model": "qwen3-tts",
              "input": "你好,这是Ryan的中文语音,仅加载了CustomVoice模型。",
              "voice": "Ryan",
              "language": "Chinese",
              "response_format": "wav"
            }' \
            --output artifacts/phase2_ryan_chinese.wav
          echo "Generated phase2_ryan_chinese.wav ($(stat --format=%s artifacts/phase2_ryan_chinese.wav) bytes)"

      # Negative test: no -f here, so curl succeeds on an error response and
      # the HTTP status code can be inspected.
      - name: "Phase 2: Verify audio_sample returns error"
        run: |
          status=$(curl -s -o /tmp/response.json -w "%{http_code}" --max-time 30 \
            -X POST http://localhost:8000/v1/audio/speech \
            -F model=qwen3-tts \
            -F "input=This should fail." \
            -F audio_sample=@artifacts/phase1_vivian_english.wav \
            -F language=English \
            -F response_format=wav)
          echo "HTTP status: $status"
          cat /tmp/response.json
          echo
          if [ "$status" -ne 400 ]; then
            echo "FAIL: Expected HTTP 400 but got $status"
            exit 1
          fi
          echo "PASS: audio_sample correctly rejected without Base model"

      - name: "Phase 2: Stop server"
        run: |
          pid="$(cat /tmp/server.pid)"
          kill "$pid"
          # Wait (up to 30 s) for the process to exit rather than sleeping a
          # fixed time, so the next phase cannot be answered by a server that
          # is still shutting down on port 8000.
          for _ in $(seq 1 30); do
            kill -0 "$pid" 2>/dev/null || break
            sleep 1
          done

      # ==================================================================
      # Phase 3: Base model only
      # ==================================================================
      - name: "Phase 3: Start server (Base only)"
        run: |
          TTS_BASE_MODEL_PATH=./models/Qwen3-TTS-12Hz-0.6B-Base \
            uv run --project python python python/main.py &
          echo "$!" > /tmp/server.pid
          echo "Waiting for server..."
          # Poll /health up to 120 times, 2 seconds apart.
          for i in $(seq 1 120); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server is ready"
              break
            fi
            sleep 2
          done
          # Final probe: fails the step if the server never became healthy.
          curl -sf http://localhost:8000/health

      - name: "Phase 3: Clone voice from Ryan English sample"
        run: |
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -F model=qwen3-tts \
            -F "input=This clones Ryan voice with only the Base model loaded." \
            -F audio_sample=@artifacts/phase2_ryan_english.wav \
            -F "audio_sample_text=Hello, this is Ryan speaking English with only the CustomVoice model loaded." \
            -F language=English \
            -F response_format=wav \
            --output artifacts/phase3_clone_ryan.wav
          echo "Generated phase3_clone_ryan.wav ($(stat --format=%s artifacts/phase3_clone_ryan.wav) bytes)"

      - name: "Phase 3: Verify voice name returns error"
        run: |
          status=$(curl -s -o /tmp/response.json -w "%{http_code}" --max-time 30 \
            -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{
              "model": "qwen3-tts",
              "input": "This should fail.",
              "voice": "Ryan",
              "language": "English",
              "response_format": "wav"
            }')
          echo "HTTP status: $status"
          cat /tmp/response.json
          echo
          if [ "$status" -ne 400 ]; then
            echo "FAIL: Expected HTTP 400 but got $status"
            exit 1
          fi
          echo "PASS: voice name correctly rejected without CustomVoice model"

      - name: "Phase 3: Stop server"
        run: |
          pid="$(cat /tmp/server.pid)"
          kill "$pid"
          # Wait (up to 30 s) for the process to exit rather than sleeping a
          # fixed time, so the next phase cannot be answered by a server that
          # is still shutting down on port 8000.
          for _ in $(seq 1 30); do
            kill -0 "$pid" 2>/dev/null || break
            sleep 1
          done

      # ==================================================================
      # Phase 4: ASR model only
      # ==================================================================
      - name: "Phase 4: Start server (ASR only)"
        run: |
          ASR_MODEL_PATH=./models/Qwen3-ASR-0.6B \
            uv run --project python python python/main.py &
          echo "$!" > /tmp/server.pid
          echo "Waiting for server..."
          # Poll /health up to 120 times, 2 seconds apart.
          for i in $(seq 1 120); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server is ready"
              break
            fi
            sleep 2
          done
          # Final probe: fails the step if the server never became healthy.
          curl -sf http://localhost:8000/health

      - name: "Phase 4: Transcribe English audio"
        run: |
          result=$(curl -sf --max-time 300 -X POST http://localhost:8000/v1/audio/transcriptions \
            -F file=@artifacts/phase1_vivian_english.wav \
            -F model=qwen3-asr \
            -F language=English)
          echo "Transcription result: $result"
          echo "$result" > artifacts/phase4_transcribe_english.txt
          # Verify we got a text response
          echo "$result" | python3 -c "import sys, json; d = json.load(sys.stdin); assert 'text' in d and len(d['text']) > 0, 'No text in response'"
          echo "PASS: English transcription successful"

      - name: "Phase 4: Transcribe Chinese audio"
        run: |
          result=$(curl -sf --max-time 300 -X POST http://localhost:8000/v1/audio/transcriptions \
            -F file=@artifacts/phase1_vivian_chinese.wav \
            -F model=qwen3-asr \
            -F language=Chinese)
          echo "Transcription result: $result"
          echo "$result" > artifacts/phase4_transcribe_chinese.txt
          # Verify we got a text response
          echo "$result" | python3 -c "import sys, json; d = json.load(sys.stdin); assert 'text' in d and len(d['text']) > 0, 'No text in response'"
          echo "PASS: Chinese transcription successful"

      - name: "Phase 4: Verify TTS returns error"
        run: |
          status=$(curl -s -o /tmp/response.json -w "%{http_code}" --max-time 30 \
            -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{
              "model": "qwen3-tts",
              "input": "This should fail.",
              "voice": "Vivian",
              "language": "English",
              "response_format": "wav"
            }')
          echo "HTTP status: $status"
          cat /tmp/response.json
          echo
          if [ "$status" -ne 400 ]; then
            echo "FAIL: Expected HTTP 400 but got $status"
            exit 1
          fi
          echo "PASS: TTS correctly rejected without TTS model"

      - name: "Phase 4: Stop server"
        run: |
          pid="$(cat /tmp/server.pid)"
          kill "$pid"
          # Wait (up to 30 s) for the process to exit rather than sleeping a
          # fixed time, so the next phase cannot be answered by a server that
          # is still shutting down on port 8000.
          for _ in $(seq 1 30); do
            kill -0 "$pid" 2>/dev/null || break
            sleep 1
          done

      # ==================================================================
      # Phase 5: TTS + ASR Round-Trip Test
      # ==================================================================
      - name: "Phase 5: Start server (TTS + ASR)"
        run: |
          TTS_CUSTOMVOICE_MODEL_PATH=./models/Qwen3-TTS-12Hz-0.6B-CustomVoice \
          ASR_MODEL_PATH=./models/Qwen3-ASR-0.6B \
            uv run --project python python python/main.py &
          echo "$!" > /tmp/server.pid
          echo "Waiting for server..."
          # Poll /health up to 120 times, 2 seconds apart.
          for i in $(seq 1 120); do
            if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
              echo "Server is ready"
              break
            fi
            sleep 2
          done
          # Final probe: fails the step if the server never became healthy.
          curl -sf http://localhost:8000/health

      - name: "Phase 5: TTS->ASR Round-Trip (English)"
        run: |
          INPUT_TEXT="Hello, this is a test of the Qwen text to speech system."
          # Generate speech. INPUT_TEXT is spliced into the JSON body as-is,
          # so it must not contain double quotes or backslashes.
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d "{
              \"model\": \"qwen3-tts\",
              \"input\": \"$INPUT_TEXT\",
              \"voice\": \"Vivian\",
              \"language\": \"English\",
              \"response_format\": \"wav\"
            }" \
            --output artifacts/phase5_tts_english.wav
          echo "Generated phase5_tts_english.wav ($(stat --format=%s artifacts/phase5_tts_english.wav) bytes)"
          # Transcribe
          result=$(curl -sf --max-time 300 -X POST http://localhost:8000/v1/audio/transcriptions \
            -F file=@artifacts/phase5_tts_english.wav \
            -F model=qwen3-asr \
            -F language=English)
          OUTPUT_TEXT=$(echo "$result" | python3 -c "import sys, json; print(json.load(sys.stdin)['text'])")
          # Group the whole report so that tee writes all of it, not just the
          # closing separator, into the uploaded artifact.
          {
            echo "=========================================="
            echo "ENGLISH ROUND-TRIP TEST"
            echo "=========================================="
            echo "Input text: $INPUT_TEXT"
            echo "Output text: $OUTPUT_TEXT"
            echo "=========================================="
          } | tee artifacts/phase5_roundtrip_english.txt
          # The texts should be similar (not exact due to ASR limitations), so
          # only a non-empty transcription is enforced; compare the report by eye.
          if [ -z "$OUTPUT_TEXT" ]; then
            echo "FAIL: ASR returned an empty transcription"
            exit 1
          fi
          echo "PASS: English round-trip completed"

      - name: "Phase 5: TTS->ASR Round-Trip (Chinese)"
        run: |
          INPUT_TEXT="你好,这是一个语音合成和语音识别的测试。"
          # Generate speech. INPUT_TEXT is spliced into the JSON body as-is,
          # so it must not contain double quotes or backslashes.
          curl -sf --max-time 600 -X POST http://localhost:8000/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d "{
              \"model\": \"qwen3-tts\",
              \"input\": \"$INPUT_TEXT\",
              \"voice\": \"Vivian\",
              \"language\": \"Chinese\",
              \"response_format\": \"wav\"
            }" \
            --output artifacts/phase5_tts_chinese.wav
          echo "Generated phase5_tts_chinese.wav ($(stat --format=%s artifacts/phase5_tts_chinese.wav) bytes)"
          # Transcribe
          result=$(curl -sf --max-time 300 -X POST http://localhost:8000/v1/audio/transcriptions \
            -F file=@artifacts/phase5_tts_chinese.wav \
            -F model=qwen3-asr \
            -F language=Chinese)
          OUTPUT_TEXT=$(echo "$result" | python3 -c "import sys, json; print(json.load(sys.stdin)['text'])")
          # Group the whole report so that tee writes all of it, not just the
          # closing separator, into the uploaded artifact.
          {
            echo "=========================================="
            echo "CHINESE ROUND-TRIP TEST"
            echo "=========================================="
            echo "Input text: $INPUT_TEXT"
            echo "Output text: $OUTPUT_TEXT"
            echo "=========================================="
          } | tee artifacts/phase5_roundtrip_chinese.txt
          # The texts should be similar (not exact due to ASR limitations), so
          # only a non-empty transcription is enforced; compare the report by eye.
          if [ -z "$OUTPUT_TEXT" ]; then
            echo "FAIL: ASR returned an empty transcription"
            exit 1
          fi
          echo "PASS: Chinese round-trip completed"

      - name: "Phase 5: Stop server"
        run: |
          pid="$(cat /tmp/server.pid)"
          kill "$pid"
          # Wait (up to 30 s) for the process to exit rather than sleeping a
          # fixed time.
          for _ in $(seq 1 30); do
            kill -0 "$pid" 2>/dev/null || break
            sleep 1
          done

      # ==================================================================
      # Upload artifacts
      # ==================================================================
      # always() keeps whatever was generated available for debugging when an
      # earlier step failed.
      - name: Upload audio artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: generated-audio
          path: artifacts/*.wav

      - name: Upload transcription artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: transcriptions
          path: artifacts/*.txt