ci: further improve perf benchmarks (#39)
Build everything first, on GitHub-hosted runners.
Then benchmark everything on our own hardware.
Make sure the run has a nice descriptive name.
Don't fail the build when JMH is not provided.
triceo authored Jan 5, 2025
1 parent 3f276e1 commit 5960a8b
Showing 8 changed files with 100 additions and 112 deletions.
23 changes: 0 additions & 23 deletions .github/scripts/prevent_stale_fork.sh

This file was deleted.

156 changes: 92 additions & 64 deletions .github/workflows/performance_score_director.yml
@@ -1,10 +1,16 @@
# - Runs entirely on a single machine, a self-hosted runner on Github Actions.
# - Both baseline and SUT (Software Under Test) are built from source first,
# as it is wasteful to run the baseline only to find out that the SUT does not build.
# - The baseline is established first, then the SUT is measured.
# - Each benchmark gives a 99.9 % confidence interval.
# - The confidence intervals are compared to determine if the branch under test is a regression or an improvement.
# - The error threshold is expected to be below +/- 2.0 %.
# Both baseline and SUT (Software Under Test) are built from source first,
# with their binaries uploaded as artifacts.
# This is done on GitHub infrastructure, to achieve maximum parallelization.
#
# The benchmark job downloads the binaries and runs them.
# The baseline is established first, then the SUT is measured.
# They both run in the same job,
# to guarantee they ran on the same machine with the same performance characteristics.
# This is done on a self-hosted runner which we completely control.
#
# Each benchmark gives a 99.9 % confidence interval.
# The confidence intervals are compared to determine if the branch under test is a regression or an improvement.
# The error threshold is expected to be below +/- 2.0 %.
name: Performance Regression Test - Score Director

on:
@@ -31,63 +37,56 @@ on:
default: '3.0'
required: true

jobs:
run-name: "Timefold Solver v${{ github.event.inputs.baseline }} vs. ${{ github.event.inputs.branch_owner }}/${{ github.event.inputs.branch }} on Java ${{ github.event.inputs.jdk }}"

benchmark:
runs-on: self-hosted
jobs:
build:
runs-on: ubuntu-latest # Leverage massive parallelization of Github-hosted runners.
strategy:
fail-fast: false # Jobs fail if the benchmark error is over predefined thresholds; other benchmarks continue.
fail-fast: true # If one compilation fails, abort everything.
matrix:
example: [cloud_balancing, conference_scheduling, curriculum_course, examination, machine_reassignment, meeting_scheduling, nurse_rostering, patient_admission_scheduling, task_assigning, traveling_tournament, tsp, vehicle_routing]
env:
MVN_USERNAME: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_USERNAME }}'
MVN_PASSWORD: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_TOKEN }}'
steps:
- name: Phase 0 - Checkout timefold-solver-benchmarks
- name: Checkout timefold-solver-benchmarks
uses: actions/checkout@v4
with:
repository: TimefoldAI/timefold-solver-benchmarks
path: ./timefold-solver-benchmarks

- name: Phase 0 - Setup JDK and Maven
- name: Setup JDK and Maven
uses: actions/setup-java@v4
with:
java-version: ${{ github.event.inputs.jdk }}
java-version: 17 # Always build with the least recent supported JDK.
distribution: 'temurin'
cache: 'maven'
server-id: 'timefold-solver-enterprise'
server-username: 'MVN_USERNAME'
server-password: 'MVN_PASSWORD'

- name: Phase 0 - Setup Async Profiler
working-directory: ./timefold-solver-benchmarks
run: |
export FILENAME=async-profiler-${{ github.event.inputs.async_profiler_version }}-linux-x64.tar.gz
wget https://github.com/async-profiler/async-profiler/releases/download/v${{ github.event.inputs.async_profiler_version }}/$FILENAME
tar -xzf $FILENAME
ls -l
- name: Phase 1 - (Baseline) Compile the benchmark
- name: (Baseline) Compile the benchmark
working-directory: ./timefold-solver-benchmarks
shell: bash
run: |
mvn clean install -B -Dquickly -Dversion.ai.timefold.solver=${{ github.event.inputs.baseline }} -Dversion.tools.provider="${{ github.event.inputs.async_profiler_version }}"
mv target/benchmarks.jar benchmarks-baseline.jar
- name: Phase 1 - (SUT) Checkout timefold-solver
- name: (SUT) Checkout timefold-solver
uses: actions/checkout@v4
with:
repository: ${{ github.event.inputs.branch_owner }}/timefold-solver
ref: ${{ github.event.inputs.branch }}
path: ./timefold-solver

- name: Phase 1 - (SUT) Quickly build timefold-solver
- name: (SUT) Quickly build timefold-solver
working-directory: ./timefold-solver
shell: bash
run: mvn -B -Dquickly clean install

# Clone timefold-solver-enterprise
- name: Phase 1 - (SUT) Checkout timefold-solver-enterprise (Specified)
- name: (SUT) Checkout timefold-solver-enterprise (Specified)
id: checkout-solver-enterprise
uses: actions/checkout@v4
continue-on-error: true
@@ -96,7 +95,7 @@ jobs:
ref: ${{ github.event.inputs.branch }}
token: ${{ secrets.BENCHMARK_PUBLISH_TOKEN }}
path: ./timefold-solver-enterprise
- name: Phase 1 - (SUT) Checkout timefold-solver-enterprise (Fallback)
- name: (SUT) Checkout timefold-solver-enterprise (Fallback)
if: steps.checkout-solver-enterprise.outcome != 'success'
uses: actions/checkout@v4
with:
@@ -105,20 +104,61 @@
token: ${{ secrets.BENCHMARK_PUBLISH_TOKEN }}
path: ./timefold-solver-enterprise

- name: Phase 1 - (SUT) Quickly build timefold-solver-enterprise
- name: (SUT) Quickly build timefold-solver-enterprise
working-directory: ./timefold-solver-enterprise
shell: bash
run: mvn -B -Dquickly clean install

- name: Phase 1 - (SUT) Compile the benchmarks
- name: (SUT) Compile the benchmarks
working-directory: ./timefold-solver-benchmarks
shell: bash
run: |
mvn clean install -B -Dquickly -Dversion.tools.provider="${{ github.event.inputs.async_profiler_version }}"
mv target/benchmarks.jar benchmarks-sut.jar
- name: Upload the binaries
uses: actions/upload-artifact@v4
with:
name: binaries-${{ matrix.example }}
path: |
./timefold-solver-benchmarks/benchmarks-baseline.jar
./timefold-solver-benchmarks/benchmarks-sut.jar
if-no-files-found: error

benchmark:
needs: build
runs-on: self-hosted # We need a stable machine to actually run the benchmarks.
strategy:
fail-fast: false # Jobs fail if the benchmark error is over predefined thresholds; other benchmarks continue.
matrix:
example: [cloud_balancing, conference_scheduling, curriculum_course, examination, machine_reassignment, meeting_scheduling, nurse_rostering, patient_admission_scheduling, task_assigning, traveling_tournament, tsp, vehicle_routing]
env:
MVN_USERNAME: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_USERNAME }}'
MVN_PASSWORD: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_TOKEN }}'
steps:
- name: Setup JDK and Maven
uses: actions/setup-java@v4
with:
java-version: ${{ github.event.inputs.jdk }} # Use the JDK version specified by the user.
distribution: 'temurin'
check-latest: true

- name: Download the benchmark binaries
uses: actions/download-artifact@v4
with:
name: binaries-${{ matrix.example }}
path: ./timefold-solver-benchmarks/

- name: Setup Async Profiler
working-directory: ./timefold-solver-benchmarks
run: |
export FILENAME=async-profiler-${{ github.event.inputs.async_profiler_version }}-linux-x64.tar.gz
wget https://github.com/async-profiler/async-profiler/releases/download/v${{ github.event.inputs.async_profiler_version }}/$FILENAME
tar -xzf $FILENAME
ls -l
# Fine-tuned for stability on GHA.
- name: Phase 2 - Configure the benchmark
- name: Configure the benchmark
working-directory: ./timefold-solver-benchmarks
shell: bash
run: |
@@ -131,7 +171,7 @@ jobs:
cat scoredirector-benchmark.properties
chmod +x run-scoredirector.sh
- name: Phase 2 - (Baseline) Run the benchmark
- name: (Baseline) Run the benchmark
working-directory: ./timefold-solver-benchmarks
id: benchmark_baseline
env:
@@ -144,8 +184,8 @@
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
echo "RANGE_MID=$(jq '.[0].primaryMetric.score|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
- name: Phase 2 - (SUT) Run the benchmark
id: benchmark_new
- name: (SUT) Run the benchmark
id: benchmark_sut
working-directory: ./timefold-solver-benchmarks
env:
RUN_ID: ${{ github.event.inputs.branch }}
@@ -158,58 +198,46 @@
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
echo "RANGE_MID=$(jq '.[0].primaryMetric.score|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
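The jq expressions above pull the mean score and the 99.9 % confidence bounds out of JMH's JSON report. As a standalone sketch, here is the shape of data being parsed — the `results.json` below is a hypothetical fragment containing only the fields the workflow reads, not real benchmark output:

```shell
# Hypothetical JMH results fragment; real JMH JSON output contains many
# more fields per benchmark entry.
cat > results.json <<'EOF'
[ { "primaryMetric": { "score": 12345.6,
                       "scoreConfidence": [ 12200.4, 12490.8 ] } } ]
EOF

# Same jq expressions as the workflow: round the confidence bounds and the
# mean score to integers so later shell integer comparisons work.
RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]|round' results.json)
RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]|round' results.json)
RANGE_MID=$(jq '.[0].primaryMetric.score|round' results.json)
echo "$RANGE_START $RANGE_MID $RANGE_END"
```

In the workflow these three values are written to `$GITHUB_OUTPUT` so the later report step can compare the two intervals.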
- name: Phase 3 - Archive benchmark data
- name: Archive benchmark data
uses: actions/upload-artifact@v4
with:
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }}
path: |
./timefold-solver-benchmarks/benchmarks-baseline.jar
./timefold-solver-benchmarks/benchmarks-sut.jar
./timefold-solver-benchmarks/scoredirector-benchmark.properties
./timefold-solver-benchmarks/results/scoredirector
- name: Phase 3 - Report results
- name: Report results
working-directory: ./timefold-solver-benchmarks
env:
OLD_RANGE_START: ${{ steps.benchmark_baseline.outputs.RANGE_START }}
OLD_RANGE_MID: ${{ steps.benchmark_baseline.outputs.RANGE_MID }}
OLD_RANGE_END: ${{ steps.benchmark_baseline.outputs.RANGE_END }}
NEW_RANGE_START: ${{ steps.benchmark_new.outputs.RANGE_START }}
NEW_RANGE_MID: ${{ steps.benchmark_new.outputs.RANGE_MID }}
NEW_RANGE_END: ${{ steps.benchmark_new.outputs.RANGE_END }}
BASELINE_RANGE_START: ${{ steps.benchmark_baseline.outputs.RANGE_START }}
BASELINE_RANGE_MID: ${{ steps.benchmark_baseline.outputs.RANGE_MID }}
BASELINE_RANGE_END: ${{ steps.benchmark_baseline.outputs.RANGE_END }}
SUT_RANGE_START: ${{ steps.benchmark_sut.outputs.RANGE_START }}
SUT_RANGE_MID: ${{ steps.benchmark_sut.outputs.RANGE_MID }}
SUT_RANGE_END: ${{ steps.benchmark_sut.outputs.RANGE_END }}
shell: bash
run: |
export OLD_DEV=$(echo "scale=2; ($OLD_RANGE_MID / $OLD_RANGE_START) * 100 - 100" | bc)
export NEW_DEV=$(echo "scale=2; ($NEW_RANGE_MID / $NEW_RANGE_START) * 100 - 100" | bc)
export DIFF_START=$(echo "scale=2; ($OLD_RANGE_START / $NEW_RANGE_START) * 100" | bc)
export DIFF_MID=$(echo "scale=2; ($OLD_RANGE_MID / $NEW_RANGE_MID) * 100" | bc)
export DIFF_END=$(echo "scale=2; ($OLD_RANGE_END / $NEW_RANGE_END) * 100" | bc)
export BASELINE_DEV=$(echo "scale=2; ($BASELINE_RANGE_MID / $BASELINE_RANGE_START) * 100 - 100" | bc)
export SUT_DEV=$(echo "scale=2; ($SUT_RANGE_MID / $SUT_RANGE_START) * 100 - 100" | bc)
export DIFF_MID=$(echo "scale=2; ($BASELINE_RANGE_MID / $SUT_RANGE_MID) * 100" | bc)
export FAIL=false
if (( $(echo "$DIFF_MID >= 98.00" | bc -l) && $(echo "$DIFF_MID <= 102.00"|bc -l) )); then
# Ignore differences of up to 2 %; we can't expect that level of precision anyway.
exit 0
elif [ "$SUT_RANGE_START" -gt "$BASELINE_RANGE_END" ]; then
echo "### 🚀🚀🚀 Statistically significant improvement 🚀🚀🚀" >> $GITHUB_STEP_SUMMARY
elif [ "$BASELINE_RANGE_START" -gt "$SUT_RANGE_END" ]; then
echo "### ‼️‼️‼️ Statistically significant regression ‼️‼️‼️" >> $GITHUB_STEP_SUMMARY
export FAIL=true
else
if [ "$NEW_RANGE_START" -le "$OLD_RANGE_END" ] && [ "$NEW_RANGE_END" -ge "$OLD_RANGE_START" ]; then
if [ "$NEW_RANGE_START" -ge "$OLD_RANGE_MID" ]; then
echo "### 🍀 Possible improvement 🍀" >> $GITHUB_STEP_SUMMARY
elif [ "$OLD_RANGE_END" -le "$NEW_RANGE_MID" ]; then
echo "### ⚠️ Possible regression ⚠️" >> $GITHUB_STEP_SUMMARY
else
exit 0
fi
elif [ "$NEW_RANGE_START" -gt "$OLD_RANGE_END" ]; then
echo "### 🚀🚀🚀 Statistically significant improvement 🚀🚀🚀" >> $GITHUB_STEP_SUMMARY
else
echo "### ‼️‼️‼️ Statistically significant regression ‼️‼️‼️" >> $GITHUB_STEP_SUMMARY
export FAIL=true
fi
exit 0
fi
echo "| | **Ref** | **Mean** |" >> $GITHUB_STEP_SUMMARY
echo "|:------:|:-----------:|:-----------------:|" >> $GITHUB_STEP_SUMMARY
echo "| _Old_ | [v${{ github.event.inputs.baseline }}](https://github.com/TimefoldAI/timefold-solver/releases/tag/v${{ github.event.inputs.baseline }}) | ${OLD_RANGE_MID} ± ${OLD_DEV} % |" >> $GITHUB_STEP_SUMMARY
echo "| _New_ | [${{ github.event.inputs.branch_owner }}'s ${{ github.event.inputs.branch }}](https://github.com/${{ github.event.inputs.branch_owner }}/timefold-solver/tree/${{ github.event.inputs.branch }}) | ${NEW_RANGE_MID} ± ${NEW_DEV} % |" >> $GITHUB_STEP_SUMMARY
echo "| _Old_ | [v${{ github.event.inputs.baseline }}](https://github.com/TimefoldAI/timefold-solver/releases/tag/v${{ github.event.inputs.baseline }}) | ${BASELINE_RANGE_MID} ± ${BASELINE_DEV} % |" >> $GITHUB_STEP_SUMMARY
echo "| _New_ | [${{ github.event.inputs.branch_owner }}'s ${{ github.event.inputs.branch }}](https://github.com/${{ github.event.inputs.branch_owner }}/timefold-solver/tree/${{ github.event.inputs.branch }}) | ${SUT_RANGE_MID} ± ${SUT_DEV} % |" >> $GITHUB_STEP_SUMMARY
echo "| _Diff_ | | ${DIFF_MID} % |" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
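The report step's decision logic can be distilled into a standalone function. This is a hedged sketch: it uses scaled integer arithmetic in place of the workflow's `bc` calls, the function name and sample numbers are illustrative, and the `inconclusive` branch is an addition for completeness rather than a copy of the workflow's behavior:

```shell
# Sketch of the interval-comparison logic: classify a SUT (Software Under
# Test) confidence interval against a baseline interval. Arguments are the
# integer ops/sec bounds: baseline start/mid/end, then SUT start/mid/end.
classify() {
  local b_start=$1 b_mid=$2 b_end=$3 s_start=$4 s_mid=$5 s_end=$6
  # Baseline/SUT midpoint ratio in whole percent (integer approximation
  # of the workflow's bc arithmetic).
  local diff_mid=$(( b_mid * 100 / s_mid ))
  if (( diff_mid >= 98 && diff_mid <= 102 )); then
    echo "within noise"    # differences of up to ~2 % are below measurement precision
  elif (( s_start > b_end )); then
    echo "improvement"     # intervals disjoint, SUT strictly higher
  elif (( b_start > s_end )); then
    echo "regression"      # intervals disjoint, SUT strictly lower
  else
    echo "inconclusive"    # intervals overlap but midpoints differ by > 2 %
  fi
}

classify 98000 100000 102000  104000 106000 108000
```

Comparing disjoint confidence intervals, rather than raw means, is what makes the verdict statistically significant: the SUT's worst plausible score must beat the baseline's best plausible score (or vice versa).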
2 changes: 1 addition & 1 deletion .github/workflows/pull_request.yml
@@ -125,4 +125,4 @@ jobs:
- name: Build and test timefold-solver-benchmarks
working-directory: ./timefold-solver-benchmarks
shell: bash
run: mvn -B -DskipJMH clean verify
run: mvn -B clean verify
2 changes: 1 addition & 1 deletion .github/workflows/turtle.yml
@@ -54,4 +54,4 @@ jobs:
- name: Run and test timefold-solver-benchmarks per example
working-directory: ./timefold-solver-benchmarks
shell: bash
run: mvn -B -DskipJMH -Dai.timefold.solver.benchmarks.examples.turtle=${{matrix.example}} -Dai.timefold.solver.benchmarks.examples.turtle.runTimeLimitMinutes=300 test
run: mvn -B -Dai.timefold.solver.benchmarks.examples.turtle=${{matrix.example}} -Dai.timefold.solver.benchmarks.examples.turtle.runTimeLimitMinutes=300 test
2 changes: 1 addition & 1 deletion README.adoc
@@ -50,7 +50,7 @@ Having acquired that, build and run any of the benchmarks:

[source,shell]
----
mvn clean install -DskipJMH
mvn clean install
./run-tsplib95.sh
./run-cvrplib.sh
----
21 changes: 2 additions & 19 deletions pom.xml
@@ -20,15 +20,15 @@
<version.org.junit>5.10.2</version.org.junit>
<version.org.mockito>5.11.0</version.org.mockito>
<version.spotless.plugin>2.43.0</version.spotless.plugin>
<version.tools.profiler>3.0</version.tools.profiler>
<spotless.skip>false</spotless.skip>
<spotless.goal>apply</spotless.goal>
<java.module.name>ai.timefold.solver.benchmarks</java.module.name>
<java.release>21</java.release>
<jmh.version>1.37</jmh.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<uberjar.name>benchmarks</uberjar.name>
<async.profiler.directory>async-profiler-${version.tools.profiler}-linux-x64</async.profiler.directory>
<!-- Used to set a property in buildtime.properties, that will later be read by the JMH benchmark. -->
<version.tools.profiler>3.0</version.tools.profiler>
</properties>

<dependencyManagement>
@@ -259,23 +259,6 @@
<skipTests>true</skipTests>
</properties>
</profile>
<profile>
<id>jmh</id>
<activation>
<property>
<name>!skipJMH</name>
</property>
</activation>
<dependencies>
<dependency>
<groupId>tools.profiler</groupId>
<artifactId>async-profiler-converter</artifactId>
<version>${version.tools.profiler}</version>
<scope>system</scope>
<systemPath>${project.basedir}/${async.profiler.directory}/lib/converter.jar</systemPath>
</dependency>
</dependencies>
</profile>
</profiles>

<repositories>
@@ -282,8 +282,8 @@ Constraint themeTrackConflict(ConstraintFactory factory) {

Constraint themeTrackRoomStability(ConstraintFactory factory) {
return factory.forEachUniquePair(Talk.class,
equal(talk -> talk.getTimeslot().getStartDateTime().toLocalDate()),
filtering((talk1, talk2) -> !talk1.getRoom().equals(talk2.getRoom())))
equal(talk -> talk.getTimeslot().getStartDateTime().toLocalDate()),
filtering((talk1, talk2) -> !talk1.getRoom().equals(talk2.getRoom())))
.expand((talk1, talk2) -> talk2.overlappingThemeTrackCount(talk1))
.penalize(HardSoftScore.ofSoft(10),
(talk1, talk2, overlappingTrackCount) -> overlappingTrackCount * talk1.combinedDurationInMinutes(talk2))
2 changes: 1 addition & 1 deletion src/main/resources/buildtime.properties
@@ -1 +1 @@
async.profiler.path = ${async.profiler.directory}
async.profiler.path = async-profiler-${version.tools.profiler}-linux-x64
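For `${version.tools.profiler}` to end up substituted into `buildtime.properties` at build time (as the pom comment "Used to set a property in buildtime.properties" suggests), the pom presumably enables Maven resource filtering for `src/main/resources`. A hedged sketch of what that configuration roughly looks like — the exact configuration in this repository may differ:

```xml
<!-- Illustrative fragment, not copied from this repository's pom.xml:
     filtering=true makes Maven interpolate ${...} property references
     in files under src/main/resources, including buildtime.properties. -->
<build>
  <resources>
    <resource>
      <directory>src/main/resources</directory>
      <filtering>true</filtering>
    </resource>
  </resources>
</build>
```

With filtering enabled, the built jar carries a `buildtime.properties` whose `async.profiler.path` already points at the concrete `async-profiler-3.0-linux-x64` directory, which the JMH benchmark can then read at runtime.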
