diff --git a/.wordlist.txt b/.wordlist.txt index 19e20f6759..96d2c99b80 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -4184,4 +4184,20 @@ subgenre submodule subword techcrunch -transformative \ No newline at end of file +transformative +Aude +Gian +Iodice +SmolLM +VME +Vuilliomenet +cpus +fLO +invalidations +libtensorflowlite +macos +multithreaded +Wix's +ngrok's +qs +qu \ No newline at end of file diff --git a/assets/contributors.csv b/assets/contributors.csv index 2e0883a8c1..1bd3df8895 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -85,5 +85,9 @@ Yiyang Fan,Arm,,,, Julien Jayat,Arm,,,, Geremy Cohen,Arm,geremyCohen,geremyinanutshell,, Barbara Corriero,Arm,,,, -Nina Drozd,Arm,,ninadrozd,, +Nina Drozd,Arm,NinaARM,ninadrozd,, Jun He,Arm,JunHe77,jun-he-91969822,, +Gian Marco Iodice,Arm,,,, +Aude Vuilliomenet,Arm,,,, +Andrew Kilroy,Arm,,,, +Peter Harris,Arm,,,, diff --git a/content/learning-paths/cross-platform/mcp-ai-agent/intro-to-mcp-uv.md b/content/learning-paths/cross-platform/mcp-ai-agent/intro-to-mcp-uv.md index e1bd041ca8..94ff72c1f7 100644 --- a/content/learning-paths/cross-platform/mcp-ai-agent/intro-to-mcp-uv.md +++ b/content/learning-paths/cross-platform/mcp-ai-agent/intro-to-mcp-uv.md @@ -17,7 +17,7 @@ The Model Context Protocol (MCP) is an open specification designed to connect La - **Security by design:** MCP encourages running servers inside your own infrastructure, so sensitive data stays within your infrastructure unless explicitly shared. -- **Cross-ecosystem momentum:** recent roll-outs from an official C# SDK to Wix’s production MCP server and Microsoft’s Azure support show the MCP spec is gathering real-world traction. +- **Cross-ecosystem momentum:** recent roll-outs from an official C# SDK to Wix's production MCP server and Microsoft’s Azure support show the MCP spec is gathering real-world traction. ## What is uv? diff --git a/content/learning-paths/cross-platform/mcp-ai-agent/mcp-server.md b/content/learning-paths/cross-platform/mcp-ai-agent/mcp-server.md index caada03d31..a05eb66864 100644 --- a/content/learning-paths/cross-platform/mcp-ai-agent/mcp-server.md +++ b/content/learning-paths/cross-platform/mcp-ai-agent/mcp-server.md @@ -115,7 +115,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) You will now use ngrok to expose your locally running MCP server to the public internet over HTTPS. -1. Add ngrok’s repo to the apt package manager and install: +1. 
Add ngrok's repo to the apt package manager and install: ```bash curl -sSL https://ngrok-agent.s3.amazonaws.com/ngrok.asc \ | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null \ diff --git a/content/learning-paths/embedded-and-microcontrollers/_index.md b/content/learning-paths/embedded-and-microcontrollers/_index.md index 6cf07146bb..3c103cb3c0 100644 --- a/content/learning-paths/embedded-and-microcontrollers/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/_index.md @@ -61,6 +61,7 @@ tools_software_languages_filter: - GitHub: 3 - GitLab: 1 - Himax SDK: 1 +- Hugging Face: 3 - IP Explorer: 4 - Jupyter Notebook: 1 - K3s: 1 diff --git a/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md b/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md index 89150c9af4..7f43f5a1dc 100644 --- a/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/llama-python-cpu/_index.md @@ -28,7 +28,7 @@ tools_software_languages: - GenAI - Raspberry Pi - Python - + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md b/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md index 03cb02bf51..83920389c6 100644 --- a/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/rpi-llama3/_index.md @@ -31,6 +31,7 @@ tools_software_languages: - LLM - GenAI - Raspberry Pi + - Hugging Face diff --git a/content/learning-paths/embedded-and-microcontrollers/yolo-on-himax/_index.md b/content/learning-paths/embedded-and-microcontrollers/yolo-on-himax/_index.md index 71ee4e97ad..54c9844ecf 100644 --- a/content/learning-paths/embedded-and-microcontrollers/yolo-on-himax/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/yolo-on-himax/_index.md @@ -32,6 +32,8 @@ armips: tools_software_languages: - Himax SDK - Python + - Hugging Face + operatingsystems: - Linux - macOS diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index d061cdb95b..b3aa2da78f 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -10,11 +10,11 @@ operatingsystems_filter: - Android: 2 - ChromeOS: 1 - Linux: 31 -- macOS: 7 -- Windows: 43 +- macOS: 8 +- Windows: 44 subjects_filter: - CI-CD: 5 -- Containers and Virtualization: 5 +- Containers and Virtualization: 6 - Migration to Arm: 28 - ML: 2 - Performance and Architecture: 25 @@ -39,7 +39,7 @@ tools_software_languages_filter: - Coding: 16 - CSS: 1 - Daytona: 1 -- Docker: 4 +- Docker: 5 - GCC: 10 - Git: 1 - GitHub: 3 @@ -52,6 +52,7 @@ tools_software_languages_filter: - JavaScript: 2 - Kubernetes: 1 - Linux: 1 +- LLM: 1 - LLVM: 1 - llvm-mca: 1 - MSBuild: 1 @@ -62,7 +63,7 @@ tools_software_languages_filter: - ONNX Runtime: 1 - OpenCV: 1 - perf: 4 -- Python: 5 +- Python: 6 - Qt: 2 - Remote.It: 1 - RME: 1 diff --git a/content/learning-paths/laptops-and-desktops/docker-models/_index.md b/content/learning-paths/laptops-and-desktops/docker-models/_index.md new file mode 100644 index 0000000000..1c8661c8e9 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/docker-models/_index.md @@ -0,0 +1,53 @@ +--- +title: Learn how to use Docker Model Runner in AI applications + +draft: true +cascade: + draft: true + 
+minutes_to_complete: 45 + +who_is_this_for: This is for software developers and AI enthusiasts who want to run AI models using Docker Model Runner. + +learning_objectives: + - Run AI models locally using Docker Model Runner. + - Easily build containerized applications with LLMs. + +prerequisites: + - A computer with at least 16GB of RAM (recommended) and Docker Desktop installed (version 4.40 or later). + - Basic understanding of Docker. + - Familiarity with Large Language Model (LLM) concepts. + +author: Jason Andrews + +### Tags +skilllevels: Introductory +subjects: Containers and Virtualization +armips: + - Neoverse + - Cortex-A +operatingsystems: + - Windows + - macOS +tools_software_languages: + - Docker + - Python + - LLM + +further_reading: + - resource: + title: Docker Model Runner Documentation + link: https://docs.docker.com/model-runner/ + type: documentation + - resource: + title: Introducing Docker Model Runner + link: https://www.docker.com/blog/introducing-docker-model-runner/ + type: blog + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- + diff --git a/content/learning-paths/laptops-and-desktops/docker-models/_next-steps.md b/content/learning-paths/laptops-and-desktops/docker-models/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/docker-models/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/laptops-and-desktops/docker-models/compose-app.png b/content/learning-paths/laptops-and-desktops/docker-models/compose-app.png new file mode 100644 index 0000000000..41bbd49fa1 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/docker-models/compose-app.png differ diff --git a/content/learning-paths/laptops-and-desktops/docker-models/compose.md b/content/learning-paths/laptops-and-desktops/docker-models/compose.md new file mode 100644 index 0000000000..fcc3657c08 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/docker-models/compose.md @@ -0,0 +1,123 @@ +--- +title: "Run a containerized AI chat app with Docker Compose" +weight: 3 +layout: "learningpathall" +--- + +Docker Compose makes it easy to run multi-container applications. Docker Compose can also include AI models in your project. + +In this section, you'll learn how to use Docker Compose to deploy a web-based AI chat application that uses Docker Model Runner as the backend for AI inference. + +## Clone the example project + +The example project, named [docker-model-runner-chat](https://github.com/jasonrandrews/docker-model-runner-chat) is available on GitHub. It provides a simple web interface to interact with local AI models such as Llama 3.2 or Gemma 3. 
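+
+Before you start, you can optionally confirm that Docker Model Runner is enabled, as covered in the previous section:
+
+```console
+docker model status
+```
+
+If the command reports that Docker Model Runner is not running, revisit the setup steps before continuing.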
+ +First, clone the example repository: + +```console +git clone https://github.com/jasonrandrews/docker-model-runner-chat.git +cd docker-model-runner-chat +``` + +## Review the Docker Compose file + +The `compose.yaml` file defines how the application is deployed using Docker Compose. + +It sets up two services: + +- **ai-chat**: A Flask-based web application that provides the chat user interface. It is built from the local directory, exposes port 5000 for browser access, mounts the project directory as a volume for live code updates, loads environment variables from `vars.env`, and waits for the `ai-runner` service to be ready before starting. +- **ai-runner**: This service uses the Docker Model Runner provider to run the selected AI model (for example, `ai/gemma3`). The configuration under `provider` tells Docker to use the model runner extension and specifies which model to load. + +The setup allows the web app to communicate with the model runner service as if it were an OpenAI-compatible API, making it easy to swap models or update endpoints by changing environment variables or compose options. + +Review the `compose.yaml` file to see the two services. + +```yaml +services: + ai-chat: + build: + context: . + ports: + - "5000:5000" + volumes: + - ./:/app + env_file: + - vars.env + depends_on: + - ai-runner + ai-runner: + provider: + type: model + options: + model: ai/gemma3 +``` + +## Start the application + +From the project directory, start the app with: + +```console +docker compose up --build +``` + +Docker Compose will build the web app image and start both services. + +## Access the chat interface + +Open your browser and copy and paste the local URL below: + +```console +http://localhost:5000 +``` + +You can now chat with the AI model using the web interface. Enter your prompt and view the response in real time. + +![Compose #center](compose-app.png) + +## Configuration + +You can change the AI model or endpoint by editing the `vars.env` file before starting the containers. The file contains environment variables used by the web application: + +- `BASE_URL`: The base URL for the AI model API. By default, it is set to `http://model-runner.docker.internal/engines/v1/`, which allows the web app to communicate with the Docker Model Runner service. This is the default endpoint setup by Docker to access the model. +- `MODEL`: The AI model to use (for example, `ai/gemma3` or `ai/llama3.2`). + +The `vars.env` file is shown below. + +```console +BASE_URL=http://model-runner.docker.internal/engines/v1/ +MODEL=ai/gemma3 +``` + +To use a different model, change the `MODEL` value. For example: + +```console +MODEL=ai/llama3.2 +``` + +Make sure to change the model in the `compose.yaml` file also. + +You can also change the `temperature` and `max_tokens` values in `app.py` to further customize the application. + +## Stop the application + +To stop the services, press `Ctrl+C` in the terminal. + +You can also run the command below in another terminal to stop the services. + +```console +docker compose down +``` + +## Troubleshooting + +Use the steps below if you have any issues running the application: + +- Ensure Docker and Docker Compose are installed and running +- Make sure port 5000 is not in use by another application +- Check logs with: + +```console +docker compose logs +``` + +In this section, you learned how to use Docker Compose to run a containerized AI chat application with a web interface and local model inference from Docker Model Runner. 
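+
+As an optional follow-up, the sketch below shows the general pattern the web app uses to talk to the `ai-runner` service through its OpenAI-compatible API. It is an illustration only and is not taken from the project's `app.py`, whose implementation may differ; the `BASE_URL` and `MODEL` defaults and the endpoint path mirror the `vars.env` file and the Docker Model Runner endpoint described in this Learning Path, while the prompt, `temperature`, and `max_tokens` values are arbitrary examples. Note that `model-runner.docker.internal` is only resolvable from inside a container; from the host, use the TCP endpoint covered in the previous section.
+
+```python
+import os
+import requests
+
+# Values mirror vars.env; override them in the environment as needed.
+base_url = os.environ.get("BASE_URL", "http://model-runner.docker.internal/engines/v1/")
+model = os.environ.get("MODEL", "ai/gemma3")
+
+payload = {
+    "model": model,
+    "messages": [{"role": "user", "content": "Hello! What can you do?"}],
+    "temperature": 0.7,   # example value
+    "max_tokens": 256,    # example value
+}
+
+# POST to the OpenAI-compatible chat completions endpoint exposed by ai-runner.
+response = requests.post(base_url.rstrip("/") + "/chat/completions", json=payload, timeout=120)
+response.raise_for_status()
+print(response.json()["choices"][0]["message"]["content"])
+```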
diff --git a/content/learning-paths/laptops-and-desktops/docker-models/models-tab.png b/content/learning-paths/laptops-and-desktops/docker-models/models-tab.png new file mode 100644 index 0000000000..62850d0012 Binary files /dev/null and b/content/learning-paths/laptops-and-desktops/docker-models/models-tab.png differ diff --git a/content/learning-paths/laptops-and-desktops/docker-models/models.md b/content/learning-paths/laptops-and-desktops/docker-models/models.md new file mode 100644 index 0000000000..3b8a2897cf --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/docker-models/models.md @@ -0,0 +1,197 @@ +--- +title: "Run AI models using Docker Model Runner" +weight: 2 +layout: "learningpathall" +--- + +Docker Model Runner is an official Docker extension that allows you to run Large Language Models (LLMs) on your local computer. It provides a convenient way to deploy and use AI models across different environments, including Arm-based systems, without complex setup or cloud dependencies. + +Docker uses [llama.cpp](https://github.com/ggml-org/llama.cpp), an open source C/C++ project developed by Georgi Gerganov that enables efficient LLM inference on a variety of hardware, but you do not need to download, build, or install any LLM frameworks. + +Docker Model Runner provides a easy to use CLI that is familiar to Docker users. + +## Before you begin + +Verify Docker is running with: + +```console +docker version +``` + +You should see output showing your Docker version. + +Confirm the Docker Desktop version is 4.40 or above, for example: + +```output +Server: Docker Desktop 4.41.2 (191736) +``` + +Make sure the Docker Model Runner is enabled. + +```console +docker model --help +``` + +You should see the usage message: + +```output +Usage: docker model COMMAND + +Docker Model Runner + +Commands: + inspect Display detailed information on one model + list List the available models that can be run with the Docker Model Runner + logs Fetch the Docker Model Runner logs + pull Download a model + push Upload a model + rm Remove models downloaded from Docker Hub + run Run a model with the Docker Model Runner + status Check if the Docker Model Runner is running + tag Tag a model + version Show the Docker Model Runner version +``` + +If Docker Model Runner is not enabled, enable it using the [Docker Model Runner documentation](https://docs.docker.com/model-runner/). + +You should also see the Models icon in your Docker Desktop sidebar. + +![Models #center](models-tab.png) + +## Running your first AI model with Docker Model Runner + +Docker Model Runner is an extension for Docker Desktop that simplifies running AI models locally. + +Docker Model Runner automatically selects compatible model versions and optimizes performance for the Arm architecture. + +You can try Docker Model Runner by using an LLM from Docker Hub. + +The example below uses the [SmolLM2 model](https://hub.docker.com/r/ai/smollm2), a compact language model with 360 million parameters, designed to run efficiently on-device while performing a wide range of language tasks. You can explore additional [models in Docker Hub](https://hub.docker.com/u/ai). + +Download the model using: + +```console +docker model pull ai/smollm2 +``` + +For a simple chat interface, run the model: + +```console +docker model run ai/smollm2 +``` + +Enter a prompt at the CLI: + +```console +write a simple hello world program in C++ +``` + +You see the output from the SmolLM2 model: + +```output +#include + +int main() { + std::cout << "Hello, World!" 
<< std::endl; + return 0; +} +``` + +You can ask more questions and continue to chat. + +To exit the chat use the `/bye` command. + +You can print the list of models on your computer using: + +```console +docker model list +``` + +Your list will be different based on the models you have downloaded. + +```output +MODEL NAME PARAMETERS QUANTIZATION ARCHITECTURE MODEL ID CREATED SIZE +ai/gemma3 3.88 B IQ2_XXS/Q4_K_M gemma3 0b329b335467 2 months ago 2.31 GiB +ai/phi4 14.66 B IQ2_XXS/Q4_K_M phi3 03c0bc8e0f5a 2 months ago 8.43 GiB +ai/smollm2 361.82 M IQ2_XXS/Q4_K_M llama 354bf30d0aa3 2 months ago 256.35 MiB +ai/llama3.2 3.21 B IQ2_XXS/Q4_K_M llama 436bb282b419 2 months ago 1.87 GiB +``` + +## Use the OpenAI endpoint to call the model + +From your host computer you can access the model using the OpenAI endpoint and a TCP port. + +First, enable the TCP port to connect with the model: + +```console +docker desktop enable model-runner --tcp 12434 +``` + +Next, use a text editor to save the code below in a file named `curl-test.sh`: + +```bash +#!/bin/sh + +curl http://localhost:12434/engines/llama.cpp/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "ai/smollm2", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Please write a hello world program in Java." + } + ] + }' +``` + +Run the shell script: + +```console +bash ./curl-test.sh | jq +``` + +If you don't have `jq` installed, you eliminate piping the output. + +The output, including the performance information, is shown below: + +```output +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "role": "assistant", + "content": "Here's a simple \"Hello World\" program in Java:\n\n```java\npublic class HelloWorld {\n public static void main(String[] args) {\n System.out.println(\"Hello, World!\");\n }\n}\n```\n\nThis program declares a `HelloWorld` class, defines a `main` method that contains the program's execution, and then uses `System.out.println` to print \"Hello, World!\" to the console." + } + } + ], + "created": 1748622685, + "model": "ai/smollm2", + "system_fingerprint": "b1-a0f7016", + "object": "chat.completion", + "usage": { + "completion_tokens": 101, + "prompt_tokens": 28, + "total_tokens": 129 + }, + "id": "chatcmpl-uZGBuFoS2ERodT4KilStxDwhySLQBTN9", + "timings": { + "prompt_n": 28, + "prompt_ms": 32.349, + "prompt_per_token_ms": 1.1553214285714284, + "prompt_per_second": 865.5599863983431, + "predicted_n": 101, + "predicted_ms": 469.524, + "predicted_per_token_ms": 4.648752475247525, + "predicted_per_second": 215.11147459980745 + } +} +``` + +In this section you learned how to run AI models using Docker Model Runner. Continue to see how to use Docker Compose to build an application with a built-in AI model. 
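+
+If you prefer Python to `curl`, the same request can be sent with the `openai` client library pointed at the local endpoint. This is a minimal sketch: it assumes the `openai` package is installed (for example, with `pip install openai`) and reuses the endpoint and model from the curl example above. The API key is a placeholder because Docker Model Runner does not require authentication.
+
+```python
+from openai import OpenAI
+
+# Point the client at the TCP endpoint enabled above.
+client = OpenAI(
+    base_url="http://localhost:12434/engines/llama.cpp/v1",
+    api_key="docker",  # placeholder; the local endpoint ignores it
+)
+
+response = client.chat.completions.create(
+    model="ai/smollm2",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Please write a hello world program in Java."},
+    ],
+)
+
+print(response.choices[0].message.content)
+```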
diff --git a/content/learning-paths/mobile-graphics-and-gaming/_index.md b/content/learning-paths/mobile-graphics-and-gaming/_index.md index 2047161d94..59b948e122 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/_index.md @@ -9,7 +9,7 @@ key_ip: - Mali maintopic: true operatingsystems_filter: -- Android: 30 +- Android: 31 - Linux: 27 - macOS: 12 - Windows: 11 @@ -17,7 +17,7 @@ subjects_filter: - Gaming: 6 - Graphics: 6 - ML: 10 -- Performance and Architecture: 30 +- Performance and Architecture: 31 subtitle: Optimize Android apps and build faster games using cutting-edge Arm tech title: Mobile, Graphics, and Gaming tools_software_languages_filter: @@ -32,9 +32,9 @@ tools_software_languages_filter: - Arm Performance Studio: 2 - assembly: 1 - Bazel: 1 -- C: 1 +- C: 2 - C#: 3 -- C++: 8 +- C++: 9 - C/C++: 1 - CCA: 1 - Clang: 10 @@ -45,6 +45,7 @@ tools_software_languages_filter: - GCC: 10 - GenAI: 2 - GoogleTest: 1 +- Hugging Face: 5 - Java: 6 - KleidiAI: 1 - Kotlin: 7 diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md b/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md index cdcdd640e1..b8ac59ad38 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-android-chat-app-using-onnxruntime/_index.md @@ -26,6 +26,8 @@ tools_software_languages: - ONNX Runtime - Android - Mobile + - Hugging Face + operatingsystems: - Windows - Android diff --git a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md index 8be35d1194..878514a0c7 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/build-llama3-chat-android-app-using-executorch-and-xnnpack/_index.md @@ -35,6 +35,8 @@ tools_software_languages: - Java - C++ - Python + - Hugging Face + operatingsystems: - macOS - Android diff --git a/content/learning-paths/mobile-graphics-and-gaming/kleidiai-on-android-with-mediapipe-and-xnnpack/_index.md b/content/learning-paths/mobile-graphics-and-gaming/kleidiai-on-android-with-mediapipe-and-xnnpack/_index.md index f0c8835231..59f5b162c7 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/kleidiai-on-android-with-mediapipe-and-xnnpack/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/kleidiai-on-android-with-mediapipe-and-xnnpack/_index.md @@ -31,6 +31,8 @@ tools_software_languages: - Android NDK - Bazel - XNNPACK + - Hugging Face + operatingsystems: - Linux diff --git a/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/_index.md b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/_index.md new file mode 100644 index 0000000000..e94238425a --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/_index.md @@ -0,0 +1,61 @@ +--- +title: Optimizing graphics vertex efficiency for Arm GPUs + +draft: true +cascade: + draft: true + +minutes_to_complete: 10 + +who_is_this_for: This is an advanced topic for Android graphics application developers. 
+ +learning_objectives: + - Optimize vertex representations on Arm GPUs + - How to interpret Vertex Memory Efficiency in Arm Frame Advisor + +prerequisites: + - An understanding of vertex attributes + - Familiarity with Arm Frame Advisor, part of Arm Performance Studio + +author: + - Andrew Kilroy + - Peter Harris + +### Tags +skilllevels: Advanced +subjects: Performance and Architecture +armips: + - Immortalis + - Mali +tools_software_languages: + - C + - C++ +operatingsystems: + - Android + +further_reading: + - resource: + title: Arm GPU Best Practices Developer Guide + link: https://developer.arm.com/documentation/101897/0304/Vertex-shading/Attribute-layout + type: documentation + - resource: + title: Frame Advisor User Guide + link: https://developer.arm.com/documentation/102693/latest/ + type: documentation + - resource: + title: Analyse a Frame with Frame Advisor + link: https://learn.arm.com/learning-paths/mobile-graphics-and-gaming/analyze_a_frame_with_frame_advisor/ + type: blog + - resource: + title: Arm Performance Studio + link: https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio%20for%20Mobile + type: website + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/fa-found-bad-vme-in-content-metrics.png b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/fa-found-bad-vme-in-content-metrics.png new file mode 100644 index 0000000000..2739ecc7ce Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/fa-found-bad-vme-in-content-metrics.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/fa-navigate-to-call.png b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/fa-navigate-to-call.png new file mode 100644 index 0000000000..fa3287e35a Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/fa-navigate-to-call.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/initial-memory-layout.png b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/initial-memory-layout.png new file mode 100644 index 0000000000..a4cdf726a0 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/initial-memory-layout.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/initial-memory-layout.svg b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/initial-memory-layout.svg new file mode 100644 index 0000000000..66c333a834 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/initial-memory-layout.svg @@ -0,0 +1,318 @@ + + + + + + + + + + + + + + + + + + mesh[0].position + + + + mesh[0].color + + + + mesh[0].normal + + + + mesh[2].position + + + + mesh[2].color + + + + mesh[2].normal + + ... + + + + mesh[1].position + + + + mesh[1].color + + + + mesh[1].normal + + + Increasingmemoryaddress + + + diff --git a/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/vme-learning-path.md b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/vme-learning-path.md new file mode 100644 index 0000000000..7df536257d --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/optimizing-vertex-efficiency/vme-learning-path.md @@ -0,0 +1,161 @@ +--- +title: Optimizing graphics vertex efficiency for Arm GPUs +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +# Optimizing graphics vertex efficiency for Arm GPUs + +You are writing a graphics application targeting an Arm Immortalis +GPU, and not hitting your desired performance. When running the Arm +Frame Advisor tool, you spot that the draw calls in your shadow map +creation pass have poor Vertex Memory Efficiency (VME) scores. How +should you go about improving this? + +![Frame Advisor screenshot](fa-found-bad-vme-in-content-metrics.png) + +In this Learning Path, you will learn about a common source of rendering +inefficiency, how to spot the issue using Arm Frame Advisor, and how +to rectify it. + + +## Shadow mapping + +In this scenario, draw calls in the shadow map render pass are the +source of our poor VME scores. Let's start by reviewing exactly what +these draws are doing. + +Shadow mapping is the mechanism that decides, for every visible pixel, +whether it is lit or in shadow. A shadow map is a texture that is +created as the first part of this process. 
It is rendered from the +point of view of the light source, and stores the distance to all of +the objects that light can see. Parts of a surface that are visible +to the light are lit, and any part that is occluded must be in shadow. + +## Mesh layout + +The primary input into shadow map creation is the object geometry for +all of the objects that cast shadows. In this scenario, let's +assume that the vertex data for each object is stored in memory as an +array structure, which is a commonly used layout in many applications: + +``` C++ +struct Vertex { + float position[3]; + float color[3]. + float normal[3]; +}; + +std::vector mesh { + // Model data ... +}; + +``` + +This would give the mesh the following layout in memory: + +![Initial memory layout](initial-memory-layout.png) + +## Why is this sub-optimal? + +This looks like a standard way of passing mesh data into a GPU, +so where is the inefficiency coming from? + +The vertex data that is defined contains all of the attributes that +you need for your object, including those that are needed to compute +color in the main lighting pass. When generating the shadow map, +you only need to compute the position of the object, so most +of your vertex attributes will be unused by the shadow map generation +draw calls. + +The inefficiency comes from how hardware gets the data it needs from +main memory so that computation can proceed. Processors do not fetch +single values from DRAM, but instead fetch a small neighborhood of +data, because this is the most efficient way to read from DRAM. For Arm +GPUs, the hardware will read an entire 64 byte cache line at a time. + +In this example, an attempt to fetch a vertex position during shadow +map creation would also load the nearby color and normal values, +even though you do not need them. + + +## Detecting a sub-optimal layout + +Arm Frame Advisor analyzes the attribute memory layout for each draw +call the application makes, and provides the Vertex Memory Efficiency +(VME) metric to show how efficiently that attribute layout is working. + +![Location of vertex memory efficiency in FA](fa-navigate-to-call.png) + +A VME of 1.0 would indicate that the draw call is making an optimal +use of the memory bandwidth, with no unnecessary data fetches. + +A VME of less than one indicates that unnecessary data is being loaded +from memory, wasting bandwidth on data that is not being used in the +computation on the GPU. + +In this mesh layout you are only using 12 bytes for the `position` +field, out of a total vertex size of 36 bytes, so your VME score would +be only 0.33. + + +## Fixing a sub-optimal layout + +Shadow mapping only needs to load position, so to fix this issue you +need to use a memory layout that allows position to be fetched in +isolation from the other data. It is still preferable to leave the +other attributes interleaved. On the CPU, this would look like the following: + +``` C++ +struct VertexPart1 { + float position[3]; +}; + +struct VertexPart2 { + float color[3]. + float normal[3]; +}; + +std::vector mesh { + // Model data ... +}; + +std::vector mesh { + // Model data ... +}; +``` + +This allows the shadow map creation pass to read only useful position +data, without any waste. The main lighting pass that renders the full +object will then read from both memory regions. + +The good news is that this technique is actually a useful one to apply +all of the time, even for the main lighting pass! Many mobile GPUs, +including Arm GPUs, process geometry in two passes. 
The first pass +computes only the primitive position, and second pass will process +the remainder of the vertex shader only for the primitives that are +visible after primitive culling has been performed. By splitting +the position attributes into a separate stream, you avoid wasting +memory bandwidth fetching non-position data for primitives that are +ultimately discarded by primitive culling tests. + + +# Conclusion + +Arm Frame Advisor can give you actionable metrics that can identify +specific inefficiencies in your application to optimize. + +The VME metric shows how efficiently you are using your input +vertex memory bandwidth, indicating what proportion of the input +data is actually used by the shader program. VME can be improved by +changing vertex memory layout to separate the different streams of +data such that only the data needed for type of computation is packed +together. Try not to mix data in that a computation would not use. + +# Other links + +Arm's advice on [attribute layouts][2] + +[2]: https://developer.arm.com/documentation/101897/0304/Vertex-shading/Attribute-layout diff --git a/content/learning-paths/mobile-graphics-and-gaming/profiling-ml-on-arm/_index.md b/content/learning-paths/mobile-graphics-and-gaming/profiling-ml-on-arm/_index.md index 0854d2f8d7..a6766f609c 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/profiling-ml-on-arm/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/profiling-ml-on-arm/_index.md @@ -29,6 +29,8 @@ armips: tools_software_languages: - Android Studio - LiteRT + - Hugging Face + operatingsystems: - Android - Linux diff --git a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/1-prerequisites.md b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/1-prerequisites.md index a5435413d0..16c4e3573e 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/1-prerequisites.md +++ b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/1-prerequisites.md @@ -15,7 +15,6 @@ Your first task is to prepare a development environment with the required softwa - Android NDK: version r25b or newer. - Python: version 3.10 or newer (tested with 3.10). - CMake: version 3.16.0 or newer (tested with 3.28.1). -- [Arm GNU Toolchain](/install-guides/gcc/arm-gnu). ### Create workspace directory @@ -79,14 +78,26 @@ Bazel is an open-source build tool which you will use to build LiteRT libraries. 
{{< tabpane code=true >}} {{< tab header="Linux">}} cd $WORKSPACE -wget https://github.com/bazelbuild/bazel/releases/download/7.4.1/bazel-7.4.1-installer-linux-x86_64.sh +export BAZEL_VERSION=7.4.1 +wget https://github.com/bazelbuild/bazel/releases/download/{$BAZEL_VERSION}/bazel-{$BAZEL_VERSION}-installer-linux-x86_64.sh sudo bash bazel-7.4.1-installer-linux-x86_64.sh +export PATH="/usr/local/bin:$PATH" {{< /tab >}} {{< tab header="MacOS">}} -brew install bazel@7 +cd $WORKSPACE +export BAZEL_VERSION=7.4.1 +curl -fLO "https://github.com/bazelbuild/bazel/releases/download/{$BAZEL_VERSION}/bazel-{$BAZEL_VERSION}-installer-darwin-arm64.sh" +sudo bash bazel-7.4.1-installer-darwin-arm64.sh +export PATH="/usr/local/bin:$PATH" {{< /tab >}} {{< /tabpane >}} +You can verify the installation and check the version with: + +```console +bazel --version +``` + ### Install Android NDK To run the model on Android, install Android Native Development Kit (Android NDK): @@ -98,9 +109,9 @@ wget https://dl.google.com/android/repository/android-ndk-r25b-linux.zip unzip android-ndk-r25b-linux.zip {{< /tab >}} {{< tab header="MacOS">}} +cd $WORKSPACE wget https://dl.google.com/android/repository/android-ndk-r25b-darwin.zip -unzip android-ndk-r25b-darwin -mv android-ndk-r25b-darwin ~/Library/Android/android-ndk-r25b +unzip android-ndk-r25b-darwin.zip {{< /tab >}} {{< /tabpane >}} @@ -109,12 +120,13 @@ For easier access and execution of Android NDK tools, add these to the `PATH` an {{< tabpane code=true >}} {{< tab header="Linux">}} export NDK_PATH=$WORKSPACE/android-ndk-r25b/ +export ANDROID_NDK_HOME=$NDK_PATH export PATH=$NDK_PATH/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH {{< /tab >}} {{< tab header="MacOS">}} -export NDK_PATH=~/Library/Android/android-ndk-r25b -export PATH=$PATH:$NDK_PATH/toolchains/llvm/prebuilt/darwin-x86_64/bin -export PATH=$PATH:~/Library/Android/sdk/cmdline-tools/latest/bin +export NDK_PATH=$WORKSPACE/android-ndk-r25b/ +export ANDROID_NDK_HOME=$NDK_PATH +export PATH=$NDK_PATH/toolchains/llvm/prebuilt/darwin-x86_64/bin/:$PATH {{< /tab >}} {{< /tabpane >}} diff --git a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/3-converting-model.md b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/3-converting-model.md index 9961483097..af2aa9f56d 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/3-converting-model.md +++ b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/3-converting-model.md @@ -24,22 +24,20 @@ In this section, you will explore two different conversion routes, to convert th 1. **ONNX to LiteRT**: using the `onnx2tf` tool. This is the traditional two-step approach (PyTorch -> ONNX -> LiteRT). You will use it to convert the Conditioners submodule. -2. **PyTorch to LiteRT**: using the Google AI Edge Torch tool. You will use this tool to convert the DiT and AutoEncoder submodules. +2. **PyTorch to LiteRT**: using the [Google AI Edge Torch](https://developers.googleblog.com/en/ai-edge-torch-high-performance-inference-of-pytorch-models-on-mobile-devices/) tool. You will use this tool to convert the DiT and AutoEncoder submodules. -## Download the sample code - -The Conditioners submodule is made of the T5Encoder model. You will use the ONNX to TFLite conversion for this submodule. +## Create a virtual environment To avoid dependency issues, create a virtual environment. 
For example, you can use the following command: ```bash cd $WORKSPACE -python3.10 -m venv env -source env/bin/activate +python3.10 -m venv .venv +source .venv/bin/activate ``` -Clone the examples repository: +## Clone the examples repository ```bash cd $WORKSPACE @@ -47,7 +45,7 @@ git clone https://github.com/ARM-software/ML-examples.git cd ML-examples/kleidiai-examples/audiogen/ ``` -Install the required Python packages for this, including *onnx2tf* and *ai_edge_litert* +## Install the required dependencies ```bash bash install_requirements.sh @@ -58,13 +56,13 @@ bash install_requirements.sh If you are using GPU on your machine, you may notice the following error: ```text Traceback (most recent call last): - File "$WORKSPACE/env/lib/python3.10/site-packages/torch/_inductor/runtime/hints.py", + File "$WORKSPACE/.venv/lib/python3.10/site-packages/torch/_inductor/runtime/hints.py", line 46, in from triton.backends.compiler import AttrsDescriptor ImportError: cannot import name 'AttrsDescriptor' from 'triton.backends.compiler' -($WORKSPACE/env/lib/python3.10/site-packages/triton/backends/compiler.py) +($WORKSPACE/.venv/lib/python3.10/site-packages/triton/backends/compiler.py) . ImportError: cannot import name 'AttrsDescriptor' from 'triton.compiler.compiler' -($WORKSPACE/env/lib/python3.10/site-packages/triton/compiler/compiler.py) +($WORKSPACE/.venv/lib/python3.10/site-packages/triton/compiler/compiler.py) ``` Reinstall the following dependency: @@ -89,13 +87,14 @@ You can use the provided script to convert the Conditioners submodule: python3 ./scripts/export_conditioners.py --model_config "$WORKSPACE/model_config.json" --ckpt_path "$WORKSPACE/model.ckpt" ``` -After successful conversion, you now have a `tflite_conditioners` directory containing models with different precision (e.g., float16, float32). + +After successful conversion, you now have a `conditioners_tflite` directory containing models with different precision (e.g., float16, float32). You will be using the float32.tflite model for on-device inference. -### Convert DiT and AutoEncoder +### Convert DiT and AutoEncoder Submodules -To convert the DiT and AutoEncoder submodules, use the [Generative API](https://github.com/google-ai-edge/ai-edge-torch/tree/main/ai_edge_torch/generative/) provided by the ai-edge-torch tools. This enables you to export a generative PyTorch model directly to `.tflite` using three main steps: +To convert the DiT and AutoEncoder submodules, use the [Generative API](https://github.com/google-ai-edge/ai-edge-torch/tree/main/ai_edge_torch/generative/) provided by the `ai-edge-torch` tools. This enables you to export a generative PyTorch model directly to `.tflite` using three main steps: 1. Model re-authoring. 2. Quantization. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/4-building-litert.md b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/4-building-litert.md index f61a11611b..1d8531458e 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/4-building-litert.md +++ b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/4-building-litert.md @@ -33,11 +33,13 @@ Ensure the `NDK_PATH` variable is set to your previously installed Android NDK: {{< tabpane code=true >}} {{< tab header="Linux">}} export NDK_PATH=$WORKSPACE/android-ndk-r25b/ +export ANDROID_NDK_HOME=$NDK_PATH export PATH=$NDK_PATH/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH {{< /tab >}} {{< tab header="MacOS">}} -export NDK_PATH=~/Library/Android/android-ndk-r25b -export PATH=$PATH:$NDK_PATH/toolchains/llvm/prebuilt/darwin-x86_64/bin +export NDK_PATH=$WORKSPACE/android-ndk-r25b/ +export ANDROID_NDK_HOME=$NDK_PATH +export PATH=$NDK_PATH/toolchains/llvm/prebuilt/darwin-x86_64/bin/:$PATH {{< /tab >}} {{< /tabpane >}} {{% /notice %}} @@ -54,6 +56,7 @@ python3 ./configure.py |Please input the desired Python library path to use[$WORKSPACE/lib/python3.10/site-packages] | Enter | |Do you wish to build TensorFlow with ROCm support? [y/N]|N (No)| |Do you wish to build TensorFlow with CUDA support?|N| +|Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -Wno-sign-compare]:| Enter | |Do you want to use Clang to build TensorFlow? [Y/n]|N| |Would you like to interactively configure ./WORKSPACE for Android builds? [y/N]|y (Yes) | |Please specify the home path of the Android NDK to use. [Default is /home/user/Android/Sdk/ndk-bundle]| Enter | @@ -63,15 +66,24 @@ python3 ./configure.py |Please specify an Android build tools version to use. [Default is 35.0.0]| Enter | |Do you wish to build TensorFlow with iOS support? [y/N]:| n | -Once the Bazel configuration is complete, you can build TFLite as follows: +Once the Bazel configuration is complete, you can build LiteRT for your target platform as follows: -```console +{{< tabpane code=true >}} + {{< tab header="Android">}} bazel build -c opt --config android_arm64 //tensorflow/lite:libtensorflowlite.so \ --define tflite_with_xnnpack=true \ --define=xnn_enable_arm_i8mm=true \ --define tflite_with_xnnpack_qs8=true \ --define tflite_with_xnnpack_qu8=true -``` + {{< /tab >}} + {{< tab header="MacOS">}} +bazel build -c opt --config macos //tensorflow/lite:libtensorflowlite.so \ + --define tflite_with_xnnpack=true \ + --define xnn_enable_arm_i8mm=true \ + --define tflite_with_xnnpack_qs8=true \ + --define tflite_with_xnnpack_qu8=true + {{< /tab >}} +{{< /tabpane >}} The final step is to build flatbuffers used by the application: ``` @@ -81,7 +93,7 @@ cmake ../tensorflow/lite/tools/cmake/native_tools/flatbuffers cmake --build . ``` -Now that LiteRT and FlatBuffers are built, you're ready to compile and deploy the Stable Audio Open Small inference application on your Android device. +Now that LiteRT and FlatBuffers are built, you're ready to compile and deploy the Stable Audio Open Small inference application on your Android or macOS device. 
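+
+As a quick sanity check before building the application, you can confirm that the shared library was produced and targets the architecture you expect. Run these commands from the TensorFlow source directory; the exact `file` output varies by host and target, so treat the descriptions below as indicative:
+
+```bash
+ls -lh bazel-bin/tensorflow/lite/libtensorflowlite.so
+file bazel-bin/tensorflow/lite/libtensorflowlite.so
+```
+
+For the Android build, `file` should report a 64-bit ARM aarch64 ELF shared object; for the macOS build, an arm64 Mach-O shared library.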
diff --git a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/5-creating-simple-program.md b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/5-creating-simple-program-for-android.md similarity index 82% rename from content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/5-creating-simple-program.md rename to content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/5-creating-simple-program-for-android.md index 2696b39b8a..38b2c4850d 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/5-creating-simple-program.md +++ b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/5-creating-simple-program-for-android.md @@ -1,5 +1,5 @@ --- -title: Create a simple program +title: Create a simple program for Android target weight: 6 ### FIXED, DO NOT MODIFY @@ -36,7 +36,7 @@ A SentencePiece model is a type of subword tokenizer which is used by the audiog ```bash cd $WORKSPACE -wget https://huggingface.co/google-t5/t5-base/tree/main +wget https://huggingface.co/google-t5/t5-base/resolve/main/spiece.model ``` Verify this model was downloaded to your `WORKSPACE`. @@ -76,7 +76,13 @@ Start a new shell to access the device's system from your development machine: adb shell ``` -Finally, run the program on your Android device. Play around with the advice from [Download the model](../2-testing-model) section. +From there, you can then run the audiogen application, which requires just three input arguments: + +* **Model Path:** The directory containing your LiteRT models and spiece.model files +* **Prompt:** A text description of the desired audio (e.g., warm arpeggios on house beats 120BPM with drums effect) +* **CPU Threads:** The number of CPU threads to use (e.g., 4) + +Play around with the advice from [Download and test the model](../2-testing-model) section. ```bash cd /data/local/tmp/app @@ -90,4 +96,4 @@ You can now pull the generated `output.wav` back to your host machine and listen adb pull /data/local/tmp/app/output.wav ``` -You should now have gained hands-on experience running the Stable Audio Open Small model with LiteRT on Arm-based devices. This includes setting up the environment, optimizing the model for on-device inference, and understanding how efficient runtimes like LiteRT make low-latency generative AI possible at the edge. You’re now better equipped to explore and deploy AI-powered audio applications on mobile and embedded platforms. \ No newline at end of file +You should now have gained hands-on experience running the Stable Audio Open Small model with LiteRT on Arm-based devices. This includes setting up the environment, optimizing the model for on-device inference, and understanding how efficient runtimes like LiteRT make low-latency generative AI possible at the edge. You’re now better equipped to explore and deploy AI-powered audio applications on mobile and embedded platforms. 
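+
+If you want to listen to the generated audio directly on your host machine, any audio player will work. For example, depending on your host operating system:
+
+```bash
+# macOS
+afplay output.wav
+
+# Linux (requires the ALSA utilities)
+aplay output.wav
+```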
diff --git a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/6-creating-simple-program-for-macos.md b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/6-creating-simple-program-for-macos.md new file mode 100644 index 0000000000..635c2f2e95 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/6-creating-simple-program-for-macos.md @@ -0,0 +1,66 @@ +--- +title: Create a simple program for macOS target +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Create and build a simple program + +As a final step, you’ll build a simple program that runs inference on all three submodules directly on a macOS device. + +The program takes a text prompt as input and generates an audio file as output. + +```bash +cd $WORKSPACE/ML-examples/kleidiai-examples/audiogen/app +mkdir build && cd build +``` + +Ensure the NDK path is set correctly and build with `cmake`: + +```bash +cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ + -DTF_INCLUDE_PATH=$TF_SRC_PATH \ + -DTF_LIB_PATH=$TF_SRC_PATH/bazel-bin/tensorflow/lite \ + -DFLATBUFFER_INCLUDE_PATH=$TF_SRC_PATH/flatc-native-build/flatbuffers/include \ + .. + +make -j +``` +After the example application builds successfully, a binary file named `audiogen` is created. + +A SentencePiece model is a type of subword tokenizer which is used by the audiogen application, you’ll need to download the *spiece.model* file from: + +```bash +cd $LITERT_MODELS_PATH +wget https://huggingface.co/google-t5/t5-base/resolve/main/spiece.model +``` + +Verify this model was downloaded to your `WORKSPACE`. + +```text +ls $LITERT_MODELS_PATH/spiece.model +``` + +Copy the shared LiteRT dynamic library to the $LITERT_MODELS_PATH. +```bash +cp $TF_SRC_PATH/bazel-bin/tensorflow/lite/libtensorflowlite.so $LITERT_MODELS_PATH/ +``` + +From there, you can then run the audiogen application, which requires just three input arguments: + +* **Model Path:** The directory containing your LiteRT models and spiece.model files +* **Prompt:** A text description of the desired audio (e.g., warm arpeggios on house beats 120BPM with drums effect) +* **CPU Threads:** The number of CPU threads to use (e.g., 4) + +Play around with the advice from [Download and test the model](../2-testing-model) section. + +```bash +cd $WORKSPACE/ML-examples/kleidiai-examples/audiogen/app/ +./build/audiogen $LITERT_MODELS_PATH "warm arpeggios on house beats 120BPM with drums effect" 4 +``` + +You can now check the generated `output.wav` and listen to the result. + +You should now have gained hands-on experience running the Stable Audio Open Small model with LiteRT on Arm-based devices. This includes setting up the environment, optimizing the model for on-device inference, and understanding how efficient runtimes like LiteRT make low-latency generative AI possible at the edge. You’re now better equipped to explore and deploy AI-powered audio applications on mobile and embedded platforms. 
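+
+If the `audiogen` binary fails to start because it cannot find `libtensorflowlite.so` at run time, you can point the macOS dynamic loader at the directory you copied the library into. This step is only needed if you see a library-loading error:
+
+```bash
+export DYLD_LIBRARY_PATH=$LITERT_MODELS_PATH:$DYLD_LIBRARY_PATH
+cd $WORKSPACE/ML-examples/kleidiai-examples/audiogen/app/
+./build/audiogen $LITERT_MODELS_PATH "warm arpeggios on house beats 120BPM with drums effect" 4
+```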
diff --git a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/_index.md b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/_index.md index f20be9c201..87385c6769 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/run-stable-audio-open-small-with-lite-rt/_index.md @@ -3,7 +3,7 @@ title: Generate audio with Stable Audio Open Small on LiteRT minutes_to_complete: 30 -who_is_this_for: This is an introductory topic for developers looking to deploy the Stable Audio Open Small text-to-audio model using LiteRT on an Android device. +who_is_this_for: This is an introductory topic for developers looking to deploy the Stable Audio Open Small text-to-audio model using LiteRT on an Android™ device or on a reasonably modern platform with macOS®. learning_objectives: - Download and test the Stable Audio Open Small model. @@ -19,6 +19,9 @@ prerequisites: author: - Nina Drozd + - Gian Marco Iodice + - Adnan AlSinan + - Aude Vuilliomenet - Annie Tallund ### Tags @@ -31,6 +34,7 @@ armips: tools_software_languages: - C++ - Python + - Hugging Face operatingsystems: - Linux @@ -42,8 +46,8 @@ further_reading: link: https://stability.ai/news/stability-ai-and-arm-release-stable-audio-open-small-enabling-real-world-deployment-for-on-device-audio-control type: blog - resource: - title: Stability AI optimized its audio generation model to run on Arm chips - link: https://techcrunch.com/2025/03/03/stability-ai-optimized-its-audio-generation-model-to-run-on-arm-chips/ + title: "Unlocking audio generation on Arm CPUs to all: Running Stable Audio Open Small with KleidiAI" + link: https://community.arm.com/arm-community-blogs/b/ai-blog/posts/audio-generation-arm-cpus-stable-audio-open-small-kleidiai type: blog - resource: title: Fast Text-to-Audio Generation with Adversarial Post-Training diff --git a/content/learning-paths/servers-and-cloud-computing/_index.md b/content/learning-paths/servers-and-cloud-computing/_index.md index 1ba6abdd1f..35e65e6093 100644 --- a/content/learning-paths/servers-and-cloud-computing/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/_index.md @@ -95,6 +95,7 @@ tools_software_languages_filter: - GoogleTest: 1 - HammerDB: 1 - Herd7: 1 +- Hugging Face: 9 - InnoDB: 1 - Intrinsics: 1 - Java: 3 diff --git a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/_index.md b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/_index.md index b3280ab9ed..823d400381 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/_index.md @@ -1,22 +1,20 @@ --- -title: Explore Performance Gains by Increasing the Linux Kernel Page Size on Arm - -draft: true -cascade: - draft: true +title: Explore performance gains by increasing the Linux kernel page size on Arm minutes_to_complete: 30 who_is_this_for: This is an introductory topic for developers who want to modify the Linux kernel page size on Arm-based systems to improve performance for memory-intensive workloads. learning_objectives: - - Verify the current page size on your system. - - Install the 64K page size kernel specific to your OS. - - Verify the new 64K page size is active. - - Revert to the default 4K page size kernel (optional). 
+ - Explain the differences in page size configuration between Arm64 and x86 architectures. + - Understand how page size affects memory efficiency and system performance. + - Check the current memory page size on an Arm-based Linux system. + - Install and boot into a Linux kernel configured with 64K page size support. + - Confirm that the 64K page size is active. + - Optionally revert to the default 4K page size kernel. prerequisites: - - An Arm-based Linux system running Ubuntu, Debian, or CentOS. + - Access to an Arm-based Linux system running Ubuntu, Debian, or CentOS. author: Geremy Cohen @@ -38,11 +36,11 @@ further_reading: link: https://amperecomputing.com/tuning-guides/understanding-memory-page-sizes-on-arm64 type: documentation - resource: - title: Page (computer memory) – Wikipedia + title: Computer Memory, Wikipedia page link: https://en.wikipedia.org/wiki/Page_(computer_memory) type: documentation - resource: - title: Debian Kernel Source Guide + title: Network setup, Debian Kernel Source Guide link: https://www.debian.org/doc/manuals/debian-reference/ch05.en.html#_kernel_source type: documentation - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/centos.md b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/centos.md index 2906c9f9ce..8191301d2c 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/centos.md +++ b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/centos.md @@ -5,11 +5,11 @@ weight: 5 layout: learningpathall --- -Follow the steps below to install a 64K page size kernel on [CentOS 9 or newer](https://www.centos.org/download/). +Follow the steps below to install a 64K page size kernel on [CentOS 9 or later](https://www.centos.org/download/). ## Verify the current page size -Verify you’re using a 4 KB pagesize kernel by entering the following commands: +Verify you’re using a 4KB pagesize kernel by entering the following commands: ```bash getconf PAGESIZE @@ -25,9 +25,9 @@ The output should be similar to below. The kernel flavor (the string after the v The 4096 indicates the current page size is 4KB. If you see a value that is different, you are already using a page size other than 4096 (4K). On Arm systems, the valid options are 4K, 16K, and 64K. 
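+
+Optionally, before installing the new kernel, you can confirm that the 64K kernel variant is available from your configured repositories:
+
+```bash
+dnf info kernel-64k
+```
+
+If the package is listed, continue with the installation below.
+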
-## Install the 64k kernel package: +## Install the 64K kernel package: -Enter the command below to install the 64k kernel: +Enter the command below to install the 64K kernel: ```bash sudo dnf -y install kernel-64k @@ -68,7 +68,7 @@ The output shows the 64k kernel is running: 5.14.0-583.el9.aarch64+64k ``` -## Revert back to the 4K kernel +## Revert to the 4K kernel To revert to the original 4K kernel, enter the following: @@ -100,7 +100,7 @@ sudo grubby --set-default "$k4" sudo reboot ``` -Upon reboot, verify you’re on a 4 KB pagesize kernel by entering the following commands: +Upon reboot, verify you’re on a 4KB pagesize kernel by entering the following commands: ```bash getconf PAGESIZE diff --git a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/debian.md b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/debian.md index 496c381307..487bb9ef7b 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/debian.md +++ b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/debian.md @@ -5,7 +5,7 @@ weight: 4 layout: learningpathall --- -Follow the steps below to install a 64K page size kernel on [Debian 11 “Bullseye” or newer](https://www.debian.org/releases/bullseye/). +Follow the steps below to install a 64K page size kernel on [Debian 11 “Bullseye” or later](https://www.debian.org/releases/bullseye/). Debian does not provide a 64K kernel package, so you will need to compile it from source. @@ -17,7 +17,7 @@ The instructions below use the Debian source package. ## Verify the current page size -Verify you’re using a 4 KB pagesize kernel by entering the following commands: +Verify you’re using a 4KB pagesize kernel by entering the following commands: ```bash getconf PAGESIZE @@ -100,11 +100,11 @@ The output shows the 64k kernel is running: 6.12.22-64k ``` -This indicates the current page size is 64K, and you are using the new custom made 64k kernel. +This indicates the current page size is 64K, and you are using the new custom-built 64k kernel. -## Revert back to the 4K kernel +## Revert to the 4K kernel -To revert back to the kernel we started with, enter: +To revert to the kernel we started with, enter: ```bash dpkg-query -W -f='${Package}\n' 'linux-image-*-64k*' 'linux-headers-*-64k*' \ @@ -127,4 +127,4 @@ The output should be similar to below -- the full kernel name may vary, but the 6.1.0-34-cloud-arm64 ``` -The 4096 indicates the current page size has been reverted to 4KB. \ No newline at end of file +The 4096 indicates the current page size has been reverted to 4 KB. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/overview.md b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/overview.md index 4fd6d00088..8eeced1079 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/overview.md +++ b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/overview.md @@ -1,52 +1,72 @@ --- -title: Page Size Overview +title: Overview weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## How does the CPU locate data in memory? +## Page size fundamentals -When your program asks for a memory address, the CPU doesn’t directly reach into RAM or swap space for it; that would be slow, unsafe, and inefficient. - -Instead, it goes through the virtual memory system, where it asks for a specific chunk of memory called a page. 
Pages map virtual memory locations to physical memory locations in RAM or swap space. +Before you modify the Linux kernel page size on an Arm system, you need to know what a page is, why size matters, and how size affects performance. ## What’s a memory page? -Think of your computer’s memory like a big sheet of graph paper. Each page is one square on that sheet. The page table is the legend that identifies which square (virtual address) maps to which spot in physical RAM. On x86, 4K is the only page size option, but Arm-based systems allow you to use 4K, 16K, or 64K page sizes to fine tune the performance of your applications. +Think of your computer’s memory like a big sheet of graph paper. Each page is one square on that sheet. + +The page table acts like a legend on the map, showing which virtual address square corresponds to a specific location in physical RAM. This mapping is managed by the operating system and the CPU’s Memory Management Unit (MMU). + +To keep track of these mappings efficiently, CPUs use a fast lookup cache called the Translation Lookaside Buffer (TLB). Every access first attempts a TLB hit; a miss forces a page table lookup. If the page isn't already in the TLB, the CPU must fetch the mapping from memory—a process that adds latency and stalls execution. + +On x86 systems, 4K pages are the standard, while Arm-based systems support multiple page sizes - typically 4K, 16K, or 64K. This flexibility allows developers to fine-tune performance for specific workloads. This Learning Path explains how to switch between 4K and 64K pages on different Linux distributions. -## How should I select the memory page size? +## How does the CPU locate data in memory? -Points to consider when thinking about page size: +When your program accesses a memory address, the CPU doesn’t directly fetch data from RAM or swap space for it. That would be slow, unsafe, and inefficient. Direct physical access would bypass isolation and invalidate caches. + +Instead, it goes through the virtual memory system, where it asks for a specific chunk of memory called a page. Pages map virtual memory locations to physical memory locations in RAM or swap space. -- **4K pages** are the safe, default choice. They let you use memory in small slices and keep waste low. Since they are smaller, you need more of them when handling larger memory footprint applications. This creates more overhead for the operating system to manage, but it may be worth it for the flexibility. They are great for applications that need to access small bits of data frequently, like web servers or databases with lots of small transactions. +## How does page size affect performance? -- **64K pages** shine when you work with large, continuous data such as video frames or large database caches because they cut down on management overhead. They will use more memory if you don’t use the whole page, but they can also speed up access times for large data sets. +Changing the page size has a cascading effect on system performance: -When selecting your page size, it's important to try both options under real-world conditions, as it will depend on the data size and retrieval patterns of the data you are working with. +**Memory Fragmentation**: Smaller pages reduce internal fragmentation, which is the wasted memory per allocation. Larger pages can increase waste if your workloads don’t use the full page. -In addition, the page size may need to be reviewed over time as the application, memory usage patterns, and data sizes may change. 
+**TLB Pressure**: With smaller pages such as 4K, more entries are needed to map the same amount of memory. This increases TLB misses and page-table-walk overhead. Larger pages, such as 64K, reduce the number of entries and can lower TLB pressure. +**I/O Efficiency**: Disk I/O and DMA operations often perform better with larger pages, because fewer page boundaries are crossed during transfers (fewer interrupts, larger DMA bursts). -### Summary of page size differences +### Trade-offs to consider | Aspect | 4K Pages | 64K Pages | |-----------------|--------------------------------------|----------------------------------------| | **Size** | Small “bricks” (4 KB each) | Big “bricks” (64 KB each) | -| **Flexibility** | Very flexible—good for lots of tiny bits of data | Less flexible—best when data comes in large chunks | +| **Flexibility** | Best for flexibility and compatibility | Best for large, contiguous memory workloads | | **Efficiency** | Needs more entries (more bookkeeping) | Needs fewer entries (less bookkeeping) | -| **Waste** | At most 4 KB unused per page | Up to 64 KB unused if not fully used | +| **Waste** | At most 4 KB unused per page | Up to ~63 KB unused if not fully used | +| **TLB reach** | Lower, more misses | Higher, fewer misses | + +This Learning Path covers switching between 4K and 64K page sizes because these are supported by most Arm Linux distributions. In some cases, you may find that 16K page size is a sweet spot for your application, but Linux kernel, hardware, and software support is limited. One example of 16K page size is [Asahi Linux](https://asahilinux.org/). + +## How do I select the memory page size? + +Points to consider when thinking about page size: + +- **4K pages** are the safe, default choice. They let you use memory in small slices and keep waste low. Since they are smaller, you need more of them when handling larger memory footprint applications. This creates more overhead for the operating system to manage, but it may be worth it for the flexibility. They are great for applications that need to access small bits of data frequently, like web servers or databases with lots of small transactions. + +- **64K pages** shine when you work with large, contiguous data such as video frames or large database caches because they cut down on management overhead. They will use more memory if you don’t use the whole page, but they can also speed up access times for large data sets. + +Choosing the right page size depends on how your application uses memory, as both the data size and retrieval patterns of the data you are working with are influencing factors. Benchmark different options under real-world workloads to determine which delivers better performance. -This Learning Path covers switching between 4K and 64K page sizes because these are supported by most Arm Linux distributions. In some cases, you may find that 16K page size is a sweet spot for your application, but Linux kernel, hardware, and software support is limited. One example of 16k page size is [Asahi Linux](https://asahilinux.org/). +In addition, the page size might need to be reviewed over time as the application, memory usage patterns, and data sizes might change. -## Experiment to see which works best for your workload +## Try out a page size for your workload The best way to determine the impact of page size on application performance is to experiment with both options. -{{% notice Do not test on Production%}} -Modifying the Linux kernel page size can lead to system instability or failure. 
Perform testing in a non-production environment before applying to production systems. +{{% notice Warning%}} +Do not modify the Linux kernel page size in a production environment. It can lead to system instability or failure. Perform testing in a non-production environment before applying to production systems. {{% /notice %}} Select the Arm Linux distribution you are using to find out how to install the 64K page size kernel. diff --git a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/ubuntu.md b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/ubuntu.md index 07cf938a72..5b55c62b93 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/ubuntu.md +++ b/content/learning-paths/servers-and-cloud-computing/arm_linux_page_size/ubuntu.md @@ -5,11 +5,11 @@ weight: 3 layout: learningpathall --- -Follow the steps below to install a 64K page size kernel on [Ubuntu 22.04 LTS or newer](https://releases.ubuntu.com/22.04/). +To install a 64 K page size kernel on [Ubuntu 22.04 LTS or later](https://releases.ubuntu.com/22.04/), follow the steps below. ## Verify the current page size -Verify you’re using a 4 KB pagesize kernel by entering the following commands: +Verify you’re using a 4KB base-page-size kernel by entering the following commands: ```bash getconf PAGESIZE @@ -23,7 +23,7 @@ The output should be similar to below. The kernel flavor (the string after the v 6.1.0-34-cloud-arm64 ``` -The 4096 indicates the current page size is 4KB. If you see a value that is different, you are already using a page size other than 4096 (4K). On Arm systems, the valid options are 4K, 16K, and 64K. +The 4096 indicates the current page size is 4 KB. If you see a value that is different, you are already using a page size other than 4096 (4K). On Arm systems, the valid options are 4K, 16K, and 64K. ## Install the required dependencies and the 64K kernel @@ -34,15 +34,15 @@ sudo apt-get -y update sudo apt-get -y install git build-essential autoconf automake libtool gdb wget linux-generic-64k ``` -Next, run the following command to configure grub to load the 64K kernel by default: +Next, run the following command to configure GRUB to load the 64K kernel by default: ```bash echo "GRUB_FLAVOUR_ORDER=generic-64k" | sudo tee /etc/default/grub.d/local-order.cfg ``` -## Update grub and reboot +## Update GRUB and reboot -Commit your changes to grub and reboot by entering the following: +Commit your changes to GRUB and reboot by entering the following: ```bash sudo update-grub @@ -56,32 +56,32 @@ getconf PAGESIZE uname -r ``` -The output shows the 64k kernel is running: +The output shows the 64K kernel is running: ```output 65536 6.8.0-59-generic-64k ``` -This indicates the current page size is 64K and you are running the new 64K kernel. +This indicates that the current page size is 64K and that you are running the new 64K kernel. 
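As an optional extra check, you can inspect the page size the kernel uses for individual memory mappings by reading `/proc/<pid>/smaps`. For example, look at the first few mappings of the `grep` process itself:

```bash
grep -m 3 KernelPageSize /proc/self/smaps
```

With the 64K kernel active, the `KernelPageSize` entries should read `64 kB`; on the default 4K kernel they read `4 kB`.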
-## Revert back to the 4K kernel +## Revert to the 4K kernel -To revert back to the original 4K kernel, run the following commands: +To revert to the original 4K kernel, run the following commands: ```bash echo "GRUB_FLAVOUR_ORDER=generic" | sudo tee /etc/default/grub.d/local-order.cfg sudo update-grub sudo reboot ``` -Upon reboot, verify you’re on a 4 KB pagesize kernel by entering the following commands: +Upon reboot, verify you’re on a 4KB pagesize kernel by entering the following commands: ```bash getconf PAGESIZE uname -r ``` -The output shows the 4k kernel is running: +The output shows the 4K kernel is running: ```output 4096 diff --git a/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md b/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md index a833648990..7f91f0ce13 100644 --- a/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md @@ -25,6 +25,7 @@ operatingsystems: tools_software_languages: - Python - PyTorch + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md index 6f9e60bff5..f56a19089e 100644 --- a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md @@ -1,16 +1,12 @@ --- title: Analyze cache behavior with Perf C2C on Arm -draft: true -cascade: - draft: true - minutes_to_complete: 15 -who_is_this_for: This topic is for developers who want to optimize cache access patterns on Arm servers using Perf C2C. +who_is_this_for: This topic is for performance-oriented developers working on Arm-based cloud or server systems who want to optimize memory access patterns and investigate cache inefficiencies using Perf C2C and Arm SPE. learning_objectives: - - Avoid false sharing in C++ using memory alignment. + - Identify and fix false sharing issues using Perf C2C, a cache line analysis tool. - Enable and use the Arm Statistical Profiling Extension (SPE) on Linux systems. - Investigate cache line performance with Perf C2C. diff --git a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-1.md b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-1.md index d99dac7e85..7cea8d6ff9 100644 --- a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-1.md +++ b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-1.md @@ -1,36 +1,61 @@ --- -title: Introduction to Arm SPE and false sharing +title: Arm Statistical Profiling Extension and false sharing weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Introduction to the Arm Statistical Profiling Extension (SPE) +## What is the Arm Statistical Profiling Extension (SPE), and what does it do? -Standard performance tracing relies on counting completed instructions, capturing only architectural instructions without revealing the actual memory addresses, pipeline latencies, or considering micro-operations in flight. Moreover, the “skid” phenomenon where events are falsely attributed to later instructions can mislead developers. +{{% notice Learning goal%}} +In this section, you’ll learn how to use SPE to gain low-level insight into how your applications interact with the CPU. 
You’ll explore how to detect and resolve false sharing. By combining cache line alignment techniques with Perf C2C, you can identify inefficient memory access patterns and significantly boost CPU performance on Arm-based systems. +{{% /notice %}} -SPE integrates sampling directly into the CPU pipeline, triggering on individual micro-operations rather than retired instructions, thereby eliminating skid and blind spots. Each SPE sample record includes relevant metadata, such as data addresses, per-µop pipeline latency, triggered PMU event masks, and the memory hierarchy source, enabling fine-grained and precise cache analysis. +Arm’s Statistical Profiling Extension (SPE) gives you a powerful way to understand what’s really happening inside your applications at the microarchitecture level. -This enables software developers to tune user-space software for characteristics such as memory latency and cache accesses. Importantly, cache statistics are enabled with the Linux Perf cache-to-cache (C2C) utility. +Introduced in Armv8.2, SPE captures a statistical view of how instructions move through the CPU, which allows you to dig into issues like memory access latency, cache misses, and pipeline behavior. -Please refer to the [Arm SPE white paper](https://developer.arm.com/documentation/109429/latest/) for more details. +Most Linux profiling tools focus on retired instruction counts, which means they miss key details like memory addresses, cache latency, and micro-operation behavior. This can lead to misleading results, especially due to a phenomenon called “skid,” where events are falsely attributed to later instructions. + +SPE integrates sampling directly into the CPU pipeline, triggering on individual micro-operations instead of retired instructions. This approach eliminates skid and blind spots. Each SPE sample record includes relevant metadata, such as: + +* Data addresses +* Per-µop pipeline latency +* Triggered PMU event masks +* Memory hierarchy source + +This enables fine-grained, precise cache analysis. + +SPE helps developers optimize user-space applications by showing where cache latency or memory access delays are happening. Importantly, cache statistics are enabled with the Linux Perf Cache-to-Cache (C2C) utility. + +For more information, see the [*Arm Statistical Profiling Extension: Performance Analysis Methodology White Paper*](https://developer.arm.com/documentation/109429/latest/). In this Learning Path, you will use SPE and Perf C2C to diagnose a cache issue for an application running on a Neoverse server. -## False sharing within the cache +## What is false sharing and why should I care about it? + +In large-scale, multithreaded applications, false sharing can degrade performance by introducing hundreds of unnecessary cache line invalidations per second - often with no visible red flags in the source code. + +Even when two threads touch entirely separate variables, modern processors move data in fixed-size cache lines, which is typically 64 bytes. If those distinct variables happen to occupy bytes within the same line, every time one thread writes its variable the core’s cache must gain exclusive ownership of the whole line, forcing the other core’s copy to be invalidated. -Even when two threads touch entirely separate variables, modern processors move data in fixed-size cache lines (nominally 64-bytes). 
If those distinct variables happen to occupy bytes within the same line, every time one thread writes its variable the core’s cache must gain exclusive ownership of the whole line, forcing the other core’s copy to be invalidated. The second thread, still working on its own variable, then triggers a coherence miss to fetch the line back, and the ping-pong pattern repeats. Please see the illustration below, taken from the Arm SPE white paper, for a visual explanation. +The second thread, still working on its own variable, then triggers a coherence miss to fetch the line back, and the ping-pong pattern repeats. -![false_sharing_diagram](./false_sharing_diagram.png) +The diagram below, taken from the Arm SPE white paper, provides a visual representation of two threads on separate cores alternately gaining exclusive access to the same cache line. -Because false sharing hides behind ordinary writes, the easiest time to eliminate it is while reading or refactoring the source code by padding or realigning the offending variables before compilation. In large, highly concurrent codebases, however, data structures are often accessed through several layers of abstraction, and many threads touch memory via indirection, so the subtle cache-line overlap may not surface until profiling or performance counters reveal unexpected coherence misses. +![false_sharing_diagram alt-text#center](./false_sharing_diagram.png "Two threads on separate cores alternately gain exclusive access to the same cache line.") + +## Why false sharing is hard to spot and fix + +False sharing often hides behind seemingly ordinary writes, making it tricky to catch without tooling. The best time to eliminate it is early, while reading or refactoring code, by padding or realigning variables before compilation. But in large, highly concurrent C++ codebases, memory is frequently accessed through multiple layers of abstraction. Threads may interact with shared data indirectly, causing subtle cache line overlaps that don’t become obvious until performance profiling reveals unexpected coherence misses. Tools like Perf C2C can help uncover these issues by tracing cache-to-cache transfers and identifying hot memory locations affected by false sharing. From a source-code perspective nothing is “shared,” but at the hardware level both variables are implicitly coupled by their physical location. ## Alignment to cache lines -In C++11, you can manually specify the alignment of an object with the `alignas` specifier. For example, the C++11 source code below manually aligns the the `struct` every 64 bytes (typical cache line size on a modern processor). This ensures that each instance of `AlignedType` is on a separate cache line. +In C++11, you can manually specify the alignment of an object with the `alignas` specifier. + +For example, the C++11 source code below manually aligns the `struct` every 64 bytes (typical cache line size on a modern processor). This ensures that each instance of `AlignedType` is on a separate cache line. ```cpp #include <atomic> @@ -43,7 +68,7 @@ struct alignas(64) AlignedType { int main() { - // If we create four atomic integers like this, there's a high probability + // If you create four atomic integers like this, there's a high probability // they'll wind up next to each other in memory std::atomic<int> a; std::atomic<int> b; @@ -74,9 +99,9 @@ int main() { } ``` -The example output below shows the variables e, f, g and h occur at least 64-bytes apart in the byte-addressable architecture.
Whereas variables a, b, c and d occur 8 bytes apart, occupying the same cache line. +The output below shows that the variables e, f, g and h occur at least 64 bytes apart in the byte-addressable architecture. Whereas variables a, b, c, and d occur 8 bytes apart, occupying the same cache line. -Although this is a contrived example, in a production workload there may be several layers of indirection that unintentionally result in false sharing. For these complex cases, to understand the root cause you will use Perf C2C. +Although this is a simplified example, in a production workload there might be several layers of indirection that unintentionally result in false sharing. For these complex cases, use Perf C2C to trace cache line interactions and pinpoint the root cause of performance issues. ```output Without Alignment can occupy same cache line @@ -96,4 +121,10 @@ Address of AlignedType g - 0xffffeb6c60c0 Address of AlignedType h - 0xffffeb6c6080 ``` -Continue to the next section to learn how to set up a system to run Perf C2C. \ No newline at end of file +## Summary + +In this section, you explored what Arm SPE is and why it offers a deeper, more accurate view of application performance. You also examined how a subtle issue like false sharing can impact multithreaded code, and how to mitigate it using data alignment techniques in C++. + +Next, you'll set up your environment and use Perf C2C to capture and analyze real-world cache behavior on an Arm Neoverse system. + + diff --git a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-2.md b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-2.md index a4348ad7f9..d4dbd95c6f 100644 --- a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-2.md +++ b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-2.md @@ -1,20 +1,23 @@ --- -title: Configure your environment for Arm SPE profiling +title: Set up your environment for Arm SPE and Perf C2C profiling weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall --- - ## Select a system with SPE support -SPE requires both hardware and operating system support. Many cloud instances running Linux do not enable SPE-based profiling. +{{% notice Learning goal%}} +Before you can start profiling cache behavior with Arm SPE and Perf C2C, your system needs to meet a few requirements. In this section, you’ll learn how to check whether your hardware and kernel support Arm SPE, install the necessary tools, and validate that Linux Perf can access the correct performance monitoring events. By the end, your environment will be ready to record and analyze memory access patterns using `perf c2c` on an Arm Neoverse system. +{{% /notice %}} + +SPE requires support from both your hardware and the operating system. Many cloud instances running Linux do not enable SPE-based profiling. You need to identify a system that supports SPE using the information below. If you are looking for an AWS system, you can use a `c6g.metal` instance running Amazon Linux 2023 (AL2023). -Check the underlying Neoverse processor and operating system kernel version with the following commands. 
+Check the underlying Neoverse processor and operating system kernel version with the following commands: ```bash lscpu | grep -i "model name" @@ -23,7 +26,7 @@ uname -r The output includes the CPU type and kernel release version: -```ouput +```output Model name: Neoverse-N1 6.1.134-152.225.amzn2023.aarch64 ``` @@ -43,7 +46,7 @@ Run the following command to confirm if the SPE kernel module is loaded: sudo modprobe arm_spe_pmu ``` -If the module is not loaded (blank output), SPE may still be available. +If the module is not loaded (and there is blank output), SPE might still be available. Run this command to check if SPE is included in the kernel: @@ -51,7 +54,7 @@ Run this command to check if SPE is included in the kernel: ls /sys/bus/event_source/devices/ | grep arm_spe ``` -If SPE is available, the output is: +If SPE is available, the output you will see is: ```output arm_spe_0 @@ -63,11 +66,11 @@ If the output is blank then SPE is not available. You can install and run a Python script named Sysreport to summarize your system's performance profiling capabilities. -Refer to [Get ready for performance analysis with Sysreport](https://learn.arm.com/learning-paths/servers-and-cloud-computing/sysreport/) to learn how to install and run it. +See the Learning Path [Get ready for performance analysis with Sysreport](https://learn.arm.com/learning-paths/servers-and-cloud-computing/sysreport/) to learn how to install and run it. Look at the Sysreport output and confirm SPE is available by checking the `perf sampling` field. -If the printed value is SPE then SPE is available. +If the printed value is SPE, then SPE is available. ```output ... @@ -85,7 +88,7 @@ Performance features: ## Confirm Arm SPE is available to Perf -Run the following command to confirm SPE is available to Perf: +Run the following command to confirm SPE is available to `perf`: ```bash sudo perf list "arm_spe*" @@ -99,32 +102,34 @@ List of pre-defined events (to be used in -e or -M): arm_spe_0// [Kernel PMU event] ``` -Assign capabilities to Perf by running: +Assign capabilities to `perf` by running: ```bash sudo setcap cap_perfmon,cap_sys_ptrace,cap_sys_admin+ep $(which perf) ``` -If `arm_spe` is not available because of your system configuration or if you don't have PMU permission, the `perf c2c` command will fail. +If `arm_spe` isn’t available due to your system configuration or limited PMU access, the `perf c2c` command will fail. -To confirm Perf can access SPE run: +To confirm `perf` can access SPE, run: ```bash perf c2c record ``` -The output showing the failure is: +If SPE access is blocked, you’ll see output like this: ```output failed: memory events not supported ``` {{% notice Note %}} -If you are unable to use SPE it may be a restriction based on your cloud instance size or operating system. +If you are unable to use SPE it might be a restriction based on your cloud instance size or operating system. -Generally, access to a full server (also known as metal instances) with a relatively new kernel is needed for Arm SPE support. +Generally, access to a full server (also known as metal instances) with a relatively new kernel is required for Arm SPE support. For more information about enabling SPE, see the [perf-arm-spe manual page](https://man7.org/linux/man-pages/man1/perf-arm-spe.1.html) {{% /notice %}} -Continue to learn how to use Perf C2C on an example application. +## Summary + +You've confirmed that your system supports Arm SPE, installed the necessary tools, and verified that Perf C2C can access SPE events. 
You're now ready to start collecting detailed performance data using Perf C2C. In the next section, you’ll run a real application and use Perf C2C to capture cache sharing behavior and uncover memory performance issues. diff --git a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-3.md b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-3.md index 69b9bfd2ba..0e587a2990 100644 --- a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-3.md +++ b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-3.md @@ -1,5 +1,5 @@ --- -title: False Sharing Example +title: False sharing example weight: 4 ### FIXED, DO NOT MODIFY @@ -8,6 +8,10 @@ layout: learningpathall ## Example code +{{% notice Learning Goal%}} +The example code in this section demonstrates how false sharing affects performance by comparing two multithreaded programs; one with cache-aligned data structures, and one without. You’ll compile and run both versions, observe the runtime difference, and learn how memory layout affects cache behavior. This sets the stage for analyzing performance with Perf C2C in the next section. +{{% /notice %}} + Use a text editor to copy and paste the C example code below into a file named `false_sharing_example.c` The code is adapted from [Joe Mario](https://github.com/joemario/perf-c2c-usage-files) and is discussed thoroughly in the Arm Statistical Profiling Extension Whitepaper. @@ -285,7 +289,7 @@ int main ( int argc, char *argv[] ) ### Code explanation -The key data structure that occupies the cache is `struct Buf`. With a 64-byte cache line size, each line can hold 8, 8-byte `long` integers. +The key data structure that occupies the cache is `struct _buf`. With a 64-byte cache line size, each line can hold 8, 8-byte `long` integers. If you do not pass in the `NO_FALSE_SHARING` macro during compilation the `Buf` data structure will contain the elements below. Each structure neatly occupies the entire 64-byte cache line. @@ -306,7 +310,7 @@ typedef struct _buf { Alternatively if you pass in the `NO_FALSE_SHARING` macro during compilation, the `Buf` structure has a different shape. -The 40 bytes of padding pushes the reader variables onto a different cache line. However, notice that this is with the tradeoff the new `Buf` structures occupies multiple cache lines (12 long integers). Therefore it leaves unused cache space of 25% per `Buf` structure. +The 40 bytes of padding pushes the reader variables onto a different cache line. However, notice that this is with the tradeoff the new `Buf` structures occupies multiple cache lines (12 long integers). Therefore it leaves unused cache space of 25% per `Buf` structure. This trade-off uses more memory but eliminates false sharing, improving performance by reducing cache line contention. ```output typedef struct _buf { @@ -345,5 +349,6 @@ user 0m8.869s sys 0m0.000s ``` -Continue to the next section to learn how to use Perf C2C to analyze the example code. +## Summary +In this section, you ran a hands-on C example to see how false sharing can significantly degrade performance in multithreaded applications. By comparing two versions of the same program, one with aligned memory access and one without, you saw how something as subtle as cache line layout can result in a 2x difference in runtime. This practical example sets the foundation for using Perf C2C to capture and analyze real cache line sharing behavior in the next section. 
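One further note before moving on: the example relies on hand-counted padding to push the reader variables onto their own cache line. If you prefer to let the compiler do that bookkeeping, C11 provides `_Alignas` (exposed as `alignas` through `<stdalign.h>`), the C counterpart of the C++ `alignas` specifier used in the earlier alignment example. The sketch below is an illustration only and assumes a 64-byte cache line; it is not part of `false_sharing_example.c`.

```c
// aligned_counter.c - give each element its own 64-byte cache line
#include <stdalign.h>
#include <stdio.h>

struct padded_counter {
    _Alignas(64) long value;  // aligning the member forces 64-byte alignment and size for the struct
};

int main(void) {
    struct padded_counter counters[4];

    printf("sizeof  = %zu\n", sizeof(struct padded_counter));
    printf("alignof = %zu\n", alignof(struct padded_counter));

    for (int i = 0; i < 4; i++) {
        printf("counters[%d] at %p\n", i, (void *)&counters[i]);
    }
    return 0;
}
```

Because each `struct padded_counter` is 64 bytes in size and 64-byte aligned, consecutive array elements never share a cache line, removing the false sharing without manual padding fields.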
diff --git a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-4.md b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-4.md index 1ac72d6d95..61fdaebbfc 100644 --- a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-4.md +++ b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/how-to-4.md @@ -6,9 +6,13 @@ weight: 5 layout: learningpathall --- -## Measuring Performance +## Compare Performance with perf stat -A straight forward method to observe the performance characteristics of both binaries would be to use the `perf stat` command. +{{% notice Learning goal %}} +In this section, you’ll learn how to use Linux Perf tools and Arm SPE to identify performance bottlenecks in multithreaded applications. You’ll compare aligned and unaligned workloads, detect cache-related slowdowns such as false sharing, and trace memory contention down to the source code using Perf C2C. +{{% /notice %}} + +A simple way to observe the performance difference of both binaries is to use the `perf stat` command. For example, run the false sharing version using: @@ -66,13 +70,13 @@ The output is similar to: Comparing the results you can see the run time is significantly different (13.01 s vs. 6.49 s). -Additionally, the instructions per cycle (IPC) is notably different, (0.74 vs. 1.70) and looks to be commensurate to run time. +The instructions per cycle (IPC) are also notably different, (0.74 vs. 1.70) and look to be commensurate to run time. -## Understanding the root cause +## Pinpoint pipeline bottlenecks with top-down analysis There are many root causes of variations in IPC. -To identify an area to focus on we will start off using the [top-down methodology](https://developer.arm.com/documentation/109542/0100/Arm-Topdown-methodology). Install the python script using the [install guide](https://learn.arm.com/install-guides/topdown-tool/). +To identify where the bottleneck occurs, we’ll start by using the [Arm Topdown methodology](https://developer.arm.com/documentation/109542/0100/Arm-Topdown-methodology). Install the python script using the [Telemetry Solution Install Guide](https://learn.arm.com/install-guides/topdown-tool/). Run the following command to observe the ratio of frontend to backend stall cycles. These indicate which section of the CPU pipeline is waiting on resources and causing slower performance. @@ -92,7 +96,7 @@ Backend Stalled Cycles. 75.24% cycles The output shows there are disproportionately more backend stall cycles. This indicates the CPU is waiting for data. You could follow the top-down methodology further looking at the stage 2 microarchitecture analysis, but for sake of brevity you can jump to recording events with SPE. -## Skid when using Perf Record +## Skid: When perf record misleads The naive approach would be to record the events using the `perf record` subcommand. Running the following commands can be used to demonstrate skid, inaccuracy or "slippage" in the instruction location recorded by the Performance Monitoring Unit (PMU) when a performance event is sampled. @@ -110,7 +114,7 @@ sudo perf c2c record -g ./false_sharing 1 sudo perf annotate ``` -The left screenshot shows the canonical `perf record` command, here the `adrp` instruction falsely reports 52% of the time. However, using `perf c2c` that leverages `arm_spe`, we observe 99% of time associated with the `ldr`, load register command. The standard `perf record` data can be quite misleading! 
+The left screenshot shows the canonical `perf record` command, here the `adrp` instruction falsely reports 52% of the time. However, using `perf c2c` that leverages `arm_spe`, you can see 99% of time associated with the `ldr`, load register command. The standard `perf record` data can be quite misleading! ![perf-record-annotate](./perf-record-error-skid.png) ![perf-c2c-record-annotate](./perf-c2c-record.png) @@ -144,7 +148,7 @@ Next, press `d` character to display the cache line details. The last `Source:Li ![perf-c2c-gif](./perf-c2c.gif) -Looking at the corresponding source code, we observe the following. +Looking at the corresponding source code, you can see the following: ```output ... @@ -155,4 +159,8 @@ Looking at the corresponding source code, we observe the following. The output from SPE-based profiling with Perf C2C shows that attempting to access and increment the `lock0` and `reader1` variable, is causing the bottleneck. -The insight generated from Perf C2C indicates to reorganize the layout of the data structure. \ No newline at end of file +The insight generated from Perf C2C indicates to reorganize the layout of the data structure. + +## Summary + +In this section, you used multiple tools to analyze and diagnose a real performance issue caused by false sharing. You compared performance between aligned and unaligned code using perf stat, investigated backend stalls with topdown-tool, and saw how standard perf record can mislead due to instruction skid. Finally, you used Perf C2C with Arm SPE to pinpoint the exact variables and code lines causing contention, giving you actionable insight into how to reorganize your data layout for better performance. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md index 686f36cb46..75904fd24f 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md @@ -30,7 +30,7 @@ tools_software_languages: - GenAI - Python - Demo - + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md b/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md index af5a970ed4..fc2f68b7d8 100644 --- a/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md @@ -25,6 +25,8 @@ tools_software_languages: - Python - GenAI - RAG + - Hugging Face + operatingsystems: - Linux diff --git a/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md b/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md index 5481736d3c..efddeacee2 100644 --- a/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md @@ -24,6 +24,7 @@ operatingsystems: tools_software_languages: - Python - PyTorch + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md b/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md index 1e79d82bcc..59ac9e3129 100644 --- a/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md @@ -32,7 +32,7 @@ tools_software_languages: 
- GenAI - Python - PyTorch - + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/rag/_index.md b/content/learning-paths/servers-and-cloud-computing/rag/_index.md index 46fefc235b..5687564dc8 100644 --- a/content/learning-paths/servers-and-cloud-computing/rag/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/rag/_index.md @@ -34,6 +34,7 @@ tools_software_languages: - Streamlit - Google Axion - Demo + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md b/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md index cb0bb86cc5..ebd2ade135 100644 --- a/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md @@ -27,7 +27,7 @@ tools_software_languages: - LLM - GenAI - Python - + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/vllm/_index.md b/content/learning-paths/servers-and-cloud-computing/vllm/_index.md index 13ee6081d4..4ab33e47ca 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm/_index.md @@ -28,7 +28,7 @@ tools_software_languages: - LLM - GenAI - Python - + - Hugging Face further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/whisper/_index.md b/content/learning-paths/servers-and-cloud-computing/whisper/_index.md index 55d39188e0..8f05259578 100644 --- a/content/learning-paths/servers-and-cloud-computing/whisper/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/whisper/_index.md @@ -31,6 +31,8 @@ tools_software_languages: - Python - Whisper - Demo + - Hugging Face + cloud_service_providers: AWS diff --git a/data/stats_current_test_info.yml b/data/stats_current_test_info.yml index cad3fceeee..eabfc682a2 100644 --- a/data/stats_current_test_info.yml +++ b/data/stats_current_test_info.yml @@ -1,5 +1,5 @@ summary: - content_total: 369 + content_total: 371 content_with_all_tests_passing: 0 content_with_tests_enabled: 61 sw_categories: diff --git a/data/stats_weekly_data.yml b/data/stats_weekly_data.yml index 178781f649..5efe046e36 100644 --- a/data/stats_weekly_data.yml +++ b/data/stats_weekly_data.yml @@ -6009,3 +6009,111 @@ avg_close_time_hrs: 0 num_issues: 9 percent_closed_vs_total: 0.0 +- a_date: '2025-06-02' + content: + automotive: 2 + cross-platform: 33 + embedded-and-microcontrollers: 41 + install-guides: 101 + iot: 6 + laptops-and-desktops: 37 + mobile-graphics-and-gaming: 33 + servers-and-cloud-computing: 118 + total: 371 + contributions: + external: 95 + internal: 497 + github_engagement: + num_forks: 30 + num_prs: 7 + individual_authors: + adnan-alsinan: 2 + alaaeddine-chakroun: 2 + albin-bernhardsson: 1 + alex-su: 1 + alexandros-lamprineas: 1 + andrew-choi: 2 + annie-tallund: 4 + arm: 3 + arnaud-de-grandmaison: 4 + arnaud-de-grandmaison.: 1 + aude-vuilliomenet: 1 + avin-zarlez: 1 + barbara-corriero: 1 + basma-el-gaabouri: 1 + ben-clark: 1 + bolt-liu: 2 + brenda-strech: 1 + chaodong-gong: 1 + chen-zhang: 1 + christophe-favergeon: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 2 + david-spickett: 2 + dawid-borycki: 33 + diego-russo: 2 + dominica-abena-o.-amanfo: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 1 + 
georgios-mermigkis: 1 + geremy-cohen: 1 + gian-marco-iodice: 1 + graham-woodward: 1 + han-yin: 1 + iago-calvo-lista: 1 + james-whitaker: 1 + jason-andrews: 102 + joe-stech: 4 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez: 1 + julie-gaskin: 5 + julio-suarez: 6 + jun-he: 1 + kasper-mecklenburg: 1 + kieran-hejmadi: 10 + koki-mitsunami: 2 + konstantinos-margaritis: 8 + kristof-beyls: 1 + leandro-nunes: 1 + liliya-wu: 1 + mark-thurman: 1 + masoud-koleini: 1 + mathias-brossard: 1 + michael-hall: 5 + na-li: 1 + nader-zouaoui: 2 + nikhil-gupta: 1 + nina-drozd: 1 + nobel-chowdary-mandepudi: 6 + odin-shen: 7 + owen-wu: 2 + pareena-verma: 44 + paul-howard: 3 + pranay-bakre: 5 + preema-merlin-dsouza: 1 + przemyslaw-wirkus: 2 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 45 + shuheng-deng: 1 + thirdai: 1 + tianyu-li: 2 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari: 2 + visualsilicon: 1 + willen-yang: 1 + ying-yu: 2 + yiyang-fan: 1 + zach-lasiuk: 2 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 12 + percent_closed_vs_total: 0.0