diff --git a/.github/skills/azure-typespec-author/evaluate/.vally.yaml b/.github/skills/azure-typespec-author/evaluate/.vally.yaml index e0308b86996..aac362f065f 100644 --- a/.github/skills/azure-typespec-author/evaluate/.vally.yaml +++ b/.github/skills/azure-typespec-author/evaluate/.vally.yaml @@ -14,7 +14,7 @@ environments: type: stdio command: dotnet args: ["run", "--project", "../../../../tools/azsdk-cli/Azure.Sdk.Tools.Cli", "--", "start"] - timeout: 300000 + timeout: "10m" env: AZSDKTOOLS_AGENT_TESTING: "false" AZSDKTOOLS_COLLECT_TELEMETRY: "false" @@ -25,7 +25,7 @@ environments: type: stdio command: dotnet args: ["run", "--project", "../../../../tools/azsdk-cli/Azure.Sdk.Tools.Cli", "--", "start"] - timeout: 300000 + timeout: "10m" env: AZSDKTOOLS_AGENT_TESTING: "false" AZSDKTOOLS_COLLECT_TELEMETRY: "false" @@ -44,4 +44,4 @@ suites: warning: evals: ["evals/005001.eval.yaml"] all: - evals: ["evals/*.eval.yaml"] \ No newline at end of file + evals: ["evals/*.eval.yaml"] diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001001.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001001.eval.yaml index b81d05f1586..00e66ba8f1b 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001001.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001001.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001002.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001002.eval.yaml index 
a7ab314a0b3..b2642759cc3 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001002.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001002.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001003.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001003.eval.yaml index 5a170070fc0..18c60e02666 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001003.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001003.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001004.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001004.eval.yaml index 0940cca560d..d2fb436654f 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001004.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001004.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: 
claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001005.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001005.eval.yaml index a07a68a97f4..afcaeacf921 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001005.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001005.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: @@ -23,57 +23,58 @@ stimuli: max_tokens: 5000 environment: files: - - src: ../fixtures/001005-version-add-preview-after-preview\employee.tsp + - src: ../fixtures/001005-version-add-preview-after-preview/employee.tsp dest: employee.tsp - - src: ../fixtures/001005-version-add-preview-after-preview\main.tsp + - src: ../fixtures/001005-version-add-preview-after-preview/main.tsp dest: main.tsp - - src: ../fixtures/001005-version-add-preview-after-preview\shared.tsp + - src: ../fixtures/001005-version-add-preview-after-preview/shared.tsp dest: shared.tsp - - src: ../fixtures/001005-version-add-preview-after-preview\tspconfig.yaml + - src: ../fixtures/001005-version-add-preview-after-preview/tspconfig.yaml dest: tspconfig.yaml - - src: ../fixtures/001005-version-add-preview-after-preview\package.json + - src: 
../fixtures/001005-version-add-preview-after-preview/package.json dest: package.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Employees_Delete_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Employees_Get_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Employees_Get_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_ListBySubscription_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Employees_Update_MaximumSet_Gen.json + - src: 
../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Employees_Update_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_Update_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Operations_List_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2024-10-01-preview\Operations_List_MinimumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2024-10-01-preview/Operations_List_MinimumSet_Gen.json dest: examples/2024-10-01-preview/Operations_List_MinimumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_Get_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_Get_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: 
../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_ListByResourceGroup_MinimumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json dest: examples/2021-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_ListBySubscription_MinimumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_ListBySubscription_MinimumSet_Gen.json dest: examples/2021-10-01/Employees_ListBySubscription_MinimumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Employees_Update_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Employees_Update_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Update_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Operations_List_MaximumSet_Gen.json dest: examples/2021-10-01/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001005-version-add-preview-after-preview\examples\2021-10-01\Operations_List_MinimumSet_Gen.json + - src: 
../fixtures/001005-version-add-preview-after-preview/examples/2021-10-01/Operations_List_MinimumSet_Gen.json dest: examples/2021-10-01/Operations_List_MinimumSet_Gen.json graders: - type: tool-calls config: required: - edit + - web_fetch - azure-sdk-mcp-azsdk_run_typespec_validation - type: skill-invocation config: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001006.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001006.eval.yaml index 13e796fc3ea..ce8fb183df9 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001006.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001006.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: @@ -23,57 +23,58 @@ stimuli: max_tokens: 5000 environment: files: - - src: ../fixtures/001006-version-add-preview-after-stable\employee.tsp + - src: ../fixtures/001006-version-add-preview-after-stable/employee.tsp dest: employee.tsp - - src: ../fixtures/001006-version-add-preview-after-stable\main.tsp + - src: ../fixtures/001006-version-add-preview-after-stable/main.tsp dest: main.tsp - - src: ../fixtures/001006-version-add-preview-after-stable\shared.tsp + - src: ../fixtures/001006-version-add-preview-after-stable/shared.tsp dest: shared.tsp - - src: ../fixtures/001006-version-add-preview-after-stable\tspconfig.yaml + - src: ../fixtures/001006-version-add-preview-after-stable/tspconfig.yaml dest: tspconfig.yaml - - src: ../fixtures/001006-version-add-preview-after-stable\package.json + - src: 
../fixtures/001006-version-add-preview-after-stable/package.json dest: package.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Employees_Get_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Employees_Get_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Employees_Update_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Employees_Update_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Update_MaximumSet_Gen.json - - src: 
../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Operations_List_MaximumSet_Gen.json dest: examples/2021-10-01/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2021-10-01\Operations_List_MinimumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2021-10-01/Operations_List_MinimumSet_Gen.json dest: examples/2021-10-01/Operations_List_MinimumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_Delete_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_Get_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_Get_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_ListByResourceGroup_MinimumSet_Gen.json + - src: 
../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json dest: examples/2024-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_ListBySubscription_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_ListBySubscription_MinimumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_ListBySubscription_MinimumSet_Gen.json dest: examples/2024-10-01/Employees_ListBySubscription_MinimumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Employees_Update_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Employees_Update_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_Update_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Operations_List_MaximumSet_Gen.json dest: examples/2024-10-01/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001006-version-add-preview-after-stable\examples\2024-10-01\Operations_List_MinimumSet_Gen.json + - src: ../fixtures/001006-version-add-preview-after-stable/examples/2024-10-01/Operations_List_MinimumSet_Gen.json dest: examples/2024-10-01/Operations_List_MinimumSet_Gen.json graders: - type: tool-calls config: required: - edit + - web_fetch - azure-sdk-mcp-azsdk_run_typespec_validation - type: skill-invocation config: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001007.eval.yaml 
b/.github/skills/azure-typespec-author/evaluate/evals/001007.eval.yaml index 87c42bc2b1e..b474e77add5 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001007.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001007.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: @@ -23,63 +23,64 @@ stimuli: max_tokens: 5000 environment: files: - - src: ../fixtures/001007-version-add-stable-after-preview\employee.tsp + - src: ../fixtures/001007-version-add-stable-after-preview/employee.tsp dest: employee.tsp - - src: ../fixtures/001007-version-add-stable-after-preview\main.tsp + - src: ../fixtures/001007-version-add-stable-after-preview/main.tsp dest: main.tsp - - src: ../fixtures/001007-version-add-stable-after-preview\shared.tsp + - src: ../fixtures/001007-version-add-stable-after-preview/shared.tsp dest: shared.tsp - - src: ../fixtures/001007-version-add-stable-after-preview\tspconfig.yaml + - src: ../fixtures/001007-version-add-stable-after-preview/tspconfig.yaml dest: tspconfig.yaml - - src: ../fixtures/001007-version-add-stable-after-preview\package.json + - src: ../fixtures/001007-version-add-stable-after-preview/package.json dest: package.json - - src: ../fixtures/001007-version-add-stable-after-preview\readme.md + - src: ../fixtures/001007-version-add-stable-after-preview/readme.md dest: readme.md - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: 
../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_Get_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_Get_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_ListByResourceGroup_MinimumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json dest: examples/2021-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_ListBySubscription_MinimumSet_Gen.json + - src: 
../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_ListBySubscription_MinimumSet_Gen.json dest: examples/2021-10-01/Employees_ListBySubscription_MinimumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Employees_Update_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Employees_Update_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Update_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Operations_List_MaximumSet_Gen.json dest: examples/2021-10-01/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2021-10-01\Operations_List_MinimumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2021-10-01/Operations_List_MinimumSet_Gen.json dest: examples/2021-10-01/Operations_List_MinimumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_Delete_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_Get_MaximumSet_Gen.json + - src: 
../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_Get_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_ListByResourceGroup_MinimumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_ListByResourceGroup_MinimumSet_Gen.json dest: examples/2024-10-01-preview/Employees_ListByResourceGroup_MinimumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_ListBySubscription_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_ListBySubscription_MinimumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_ListBySubscription_MinimumSet_Gen.json dest: examples/2024-10-01-preview/Employees_ListBySubscription_MinimumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Employees_Update_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Employees_Update_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Employees_Update_MaximumSet_Gen.json - - src: 
../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Operations_List_MaximumSet_Gen.json dest: examples/2024-10-01-preview/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001007-version-add-stable-after-preview\examples\2024-10-01-preview\Operations_List_MinimumSet_Gen.json + - src: ../fixtures/001007-version-add-stable-after-preview/examples/2024-10-01-preview/Operations_List_MinimumSet_Gen.json dest: examples/2024-10-01-preview/Operations_List_MinimumSet_Gen.json graders: - type: tool-calls config: required: - edit + - web_fetch - azure-sdk-mcp-azsdk_run_typespec_validation - type: skill-invocation config: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001008.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001008.eval.yaml index a7900d3f387..2a931ad453b 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001008.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001008.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: @@ -23,63 +23,64 @@ stimuli: max_tokens: 5000 environment: files: - - src: ../fixtures/001008-version-add-stable-after-stable\employee.tsp + - src: ../fixtures/001008-version-add-stable-after-stable/employee.tsp dest: employee.tsp - - src: ../fixtures/001008-version-add-stable-after-stable\main.tsp + - src: ../fixtures/001008-version-add-stable-after-stable/main.tsp dest: main.tsp - - 
src: ../fixtures/001008-version-add-stable-after-stable\shared.tsp + - src: ../fixtures/001008-version-add-stable-after-stable/shared.tsp dest: shared.tsp - - src: ../fixtures/001008-version-add-stable-after-stable\tspconfig.yaml + - src: ../fixtures/001008-version-add-stable-after-stable/tspconfig.yaml dest: tspconfig.yaml - - src: ../fixtures/001008-version-add-stable-after-stable\package.json + - src: ../fixtures/001008-version-add-stable-after-stable/package.json dest: package.json - - src: ../fixtures/001008-version-add-stable-after-stable\readme.md + - src: ../fixtures/001008-version-add-stable-after-stable/readme.md dest: readme.md - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_Get_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_Get_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: 
../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_ListByResourceGroup_MinimumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json dest: examples/2021-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_ListBySubscription_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_ListBySubscription_MinimumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_ListBySubscription_MinimumSet_Gen.json dest: examples/2021-10-01/Employees_ListBySubscription_MinimumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Employees_Update_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Employees_Update_MaximumSet_Gen.json dest: examples/2021-10-01/Employees_Update_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Operations_List_MaximumSet_Gen.json dest: examples/2021-10-01/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2021-10-01\Operations_List_MinimumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2021-10-01/Operations_List_MinimumSet_Gen.json dest: examples/2021-10-01/Operations_List_MinimumSet_Gen.json - - src: 
../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_CreateOrUpdate_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_CreateOrUpdate_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_Delete_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_Delete_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_Delete_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_Get_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_Get_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_Get_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_ListByResourceGroup_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_ListByResourceGroup_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_ListByResourceGroup_MinimumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json dest: examples/2024-10-01/Employees_ListByResourceGroup_MinimumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_ListBySubscription_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_ListBySubscription_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_ListBySubscription_MaximumSet_Gen.json - - src: 
../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_ListBySubscription_MinimumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_ListBySubscription_MinimumSet_Gen.json dest: examples/2024-10-01/Employees_ListBySubscription_MinimumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Employees_Update_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Employees_Update_MaximumSet_Gen.json dest: examples/2024-10-01/Employees_Update_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Operations_List_MaximumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Operations_List_MaximumSet_Gen.json dest: examples/2024-10-01/Operations_List_MaximumSet_Gen.json - - src: ../fixtures/001008-version-add-stable-after-stable\examples\2024-10-01\Operations_List_MinimumSet_Gen.json + - src: ../fixtures/001008-version-add-stable-after-stable/examples/2024-10-01/Operations_List_MinimumSet_Gen.json dest: examples/2024-10-01/Operations_List_MinimumSet_Gen.json graders: - type: tool-calls config: required: - edit + - web_fetch - azure-sdk-mcp-azsdk_run_typespec_validation - type: skill-invocation config: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001009.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001009.eval.yaml index a8d6b64e0ac..4f300d120bf 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001009.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001009.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus 
+ timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001010.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001010.eval.yaml index 078f85448d4..f07692373b6 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001010.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001010.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001011.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001011.eval.yaml index 05512728bd2..e658dbe3275 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001011.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001011.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001012.eval.yaml 
b/.github/skills/azure-typespec-author/evaluate/evals/001012.eval.yaml index 223f339b0c1..f1110c53445 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001012.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001012.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/001013.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/001013.eval.yaml index ccefd24795b..e7ff82bfda2 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/001013.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/001013.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002001.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002001.eval.yaml index 3104bd51d32..4e82c8460ad 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002001.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002001.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - 
runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002002.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002002.eval.yaml index bc2347f6392..a0ab2ea4910 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002002.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002002.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002003.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002003.eval.yaml index d5b2e7d6ba8..ae679640d74 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002003.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002003.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent 
execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002004.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002004.eval.yaml index 34d569946f8..654a6326b8c 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002004.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002004.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002005.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002005.eval.yaml index 0d62311c28b..4b99638ff29 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002005.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002005.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002006.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002006.eval.yaml index d5014ddbe0f..4f506cfdbf1 100644 --- 
a/.github/skills/azure-typespec-author/evaluate/evals/002006.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002006.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002007.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002007.eval.yaml index bb239bb5e4b..52ada2b010d 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002007.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002007.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002008.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002008.eval.yaml index 9f17bc7a107..17652dc75b8 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002008.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002008.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o 
#claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002009.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002009.eval.yaml index 6d92881c639..f934d9e717a 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002009.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002009.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002010.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002010.eval.yaml index ebed4a60f67..34cc8ec3d1d 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/002010.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/002010.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git 
a/.github/skills/azure-typespec-author/evaluate/evals/003001.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/003001.eval.yaml index 464d49b6d2a..b58615b1015 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/003001.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/003001.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/003002.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/003002.eval.yaml index f9c6e3db279..1a5553f9e7e 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/003002.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/003002.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/004001.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/004001.eval.yaml index a30a48400ce..3ebe320c61e 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/004001.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/004001.eval.yaml @@ 
-8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/004002.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/004002.eval.yaml index 38c8cfebecf..6766ddef802 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/004002.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/004002.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/004003.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/004003.eval.yaml index a1e94e44a3f..6475a92884f 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/004003.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/004003.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per 
trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/005001.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/005001.eval.yaml index a9c30d6b466..e3cb5fdcb17 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/005001.eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/005001.eval.yaml @@ -8,10 +8,10 @@ environment: azsdk-mcp # Execution configuration config: - runs: 1 # Trials per stimulus - timeout: 1800 # Seconds per trial - model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution - executor: copilot-sdk # Which executor to use + runs: 1 # Trials per stimulus + timeout: "660s" # Seconds per trial + model: claude-opus-4.6 #gpt-4o #claude-sonnet-4.6 # Model for agent execution + executor: copilot-sdk # Which executor to use # Test cases stimuli: diff --git a/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml index b02a0868e0a..3d9ba95824b 100644 --- a/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml +++ b/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml @@ -13,7 +13,7 @@ environment: azsdk-mcp # Execution configuration config: runs: 1 - timeout: 1800 + timeout: 1200 model: claude-opus-4.6 executor: copilot-sdk # Cases are intentionally sorted by case id. diff --git a/eng/pipelines/azure-typespec-author-benchmark.yml b/eng/pipelines/azure-typespec-author-benchmark.yml new file mode 100644 index 00000000000..f766bd53637 --- /dev/null +++ b/eng/pipelines/azure-typespec-author-benchmark.yml @@ -0,0 +1,227 @@ +# NOTE: Please refer to https://aka.ms/azsdk/engsys/ci-yaml before editing this file. 
+trigger: none +pr: none + +parameters: +- name: PythonVersion + type: string + default: '3.10' +- name: SkillBranch + displayName: Branch containing the azure-typespec-author skill source (SKILL.md + references/) to evaluate. Leave empty to use the branch this pipeline was triggered from. + type: string + default: '' + +extends: + template: /eng/pipelines/templates/stages/1es-redirect.yml + parameters: + Use1ESOfficial: true + stages: + - stage: EvalTypeSpecAuthor + displayName: Azure TypeSpec Author Skill Evaluation + variables: + - template: /eng/pipelines/templates/variables/globals.yml + - template: /eng/pipelines/templates/variables/image.yml + - group: Azure SDK QA Bot Dev Variables + - group: AzSDK_Eval_Variable_group + pool: + name: $(LINUXPOOL) + image: $(LINUXVMIMAGE) + os: linux + jobs: + - job: RunEvals + displayName: Run Vally Evaluations + timeoutInMinutes: 120 + steps: + - checkout: self + fetchDepth: 0 + + # Optionally overlay the azure-typespec-author skill source + # (SKILL.md + references/) from a different branch so we can + # evaluate skill changes that live on another branch while + # keeping the evaluate/ directory and pipeline config from + # the branch this pipeline was triggered from. + - script: | + set -e + BRANCH="${{ parameters.SkillBranch }}" + if [ -z "$BRANCH" ]; then + echo "SkillBranch parameter not provided; using the source branch checked out by the pipeline." 
+ git rev-parse --abbrev-ref HEAD + git rev-parse HEAD + exit 0 + fi + echo "Overlaying azure-typespec-author skill source from branch: $BRANCH" + git fetch origin "$BRANCH" + rm -rf .github/skills/azure-typespec-author/SKILL.md \ + .github/skills/azure-typespec-author/references + git checkout "origin/$BRANCH" -- \ + .github/skills/azure-typespec-author/SKILL.md \ + .github/skills/azure-typespec-author/references + echo "Skill source overlaid from origin/$BRANCH ($(git rev-parse origin/$BRANCH))" + ls -R .github/skills/azure-typespec-author + displayName: Checkout skill branch + + # Install Go and build/start the QA bot backend + - script: | + echo "Downloading Go 1.24.0..." + curl -LO https://go.dev/dl/go1.24.0.linux-amd64.tar.gz + echo "Removing existing Go installation..." + sudo rm -rf /usr/local/go + echo "Extracting Go 1.24.0..." + sudo tar -C /usr/local -xzf go1.24.0.linux-amd64.tar.gz + echo "Setting environment variables..." + echo "##vso[task.setvariable variable=GOROOT]/usr/local/go" + echo "##vso[task.prependpath]/usr/local/go/bin" + displayName: Install Go + + - script: go version + displayName: Check Go version + + - task: AzureCLI@2 + displayName: Start QA bot backend service + inputs: + azureSubscription: 'azuresdkqabot-dev' + scriptType: bash + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory)/tools/sdk-ai-bots/azure-sdk-qa-bot-backend + inlineScript: | + export GOPROXY=https://proxy.golang.org,direct + go build -o qa-bot-service + chmod +x qa-bot-service + nohup ./qa-bot-service > qa-bot-service.log 2>&1 & + SERVICE_PID=$! + echo "Service started with PID: $SERVICE_PID" + sleep 20 + echo "=== Service log ===" + cat qa-bot-service.log || true + echo "===================" + if ! kill -0 $SERVICE_PID 2>/dev/null; then + echo "ERROR: Service process has exited unexpectedly" + exit 1 + fi + if ! 
lsof -ti:8088 > /dev/null 2>&1; then + echo "ERROR: No process listening on port 8088" + exit 1 + fi + echo "Service is running on port 8088" + env: + SYSTEM_ACCESSTOKEN: $(System.AccessToken) + AZURE_APPCONFIG_ENDPOINT: $(AZURE_APPCONFIG_ENDPOINT) + + # Install .NET SDK for azsdk-cli MCP server. + # Use the version pinned in global.json so the build/restore works. + - task: UseDotNet@2 + displayName: Install .NET SDK (global.json) + inputs: + useGlobalJson: true + workingDirectory: $(Build.SourcesDirectory) + - task: UseDotNet@2 + displayName: Install .NET 8.0 SDK + inputs: + version: '8.x' + + # Authenticate to NuGet so the azsdk-cli MCP server can restore + # packages from the internal Azure Artifacts feed (the 1ES pool + # blocks direct egress to nuget.org). + - task: NuGetAuthenticate@1 + displayName: Authenticate NuGet + + # Pre-build the azsdk-cli (the MCP server). vally launches it + # via `dotnet run` (defaults to Debug, no -c flag), but doing + # the restore + build here surfaces any failure as a clear + # pipeline error instead of a silent MCP server startup + # failure during evaluations. + - script: | + set -e + dotnet restore tools/azsdk-cli/Azure.Sdk.Tools.Cli/Azure.Sdk.Tools.Cli.csproj + dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli/Azure.Sdk.Tools.Cli.csproj \ + --no-restore --nologo + displayName: Build azsdk-cli (MCP server) + + # Install Node.js, Vally CLI, and Copilot SDK. + # Use the internal Azure Artifacts npm mirror (with auth) because + # the 1ES managed pool blocks direct egress to registry.npmjs.org. 
+ - task: NodeTool@0 + displayName: Use Node.js 22 + inputs: + versionSpec: '22.x' + + - template: /eng/common/pipelines/templates/steps/create-authenticated-npmrc.yml + parameters: + npmrcPath: $(Build.SourcesDirectory)/.npmrc + registryUrl: https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-tools/npm/registry/ + + - script: npm install --global @microsoft/vally-cli@0.4.0 --userconfig $(Build.SourcesDirectory)/.npmrc + displayName: Install Vally CLI + + - script: npm install --global @github/copilot-sdk --userconfig $(Build.SourcesDirectory)/.npmrc + displayName: Install Copilot SDK + + # Pre-install TypeSpec compiler globally from the internal npm + # mirror so `azsdk_run_typespec_validation` (and any direct + # `npx tsp`/`npm install` the agent attempts) succeeds without + # network access to registry.npmjs.org. + - script: | + set -e + npm install --global @typespec/compiler --userconfig $(Build.SourcesDirectory)/.npmrc + echo "tsp version:" + tsp --version || true + displayName: Install TypeSpec compiler + + # Smoke test (informational only): try to start MCP server and + # capture any startup errors. We don't fail the pipeline here + # because reliably driving an MCP stdio JSON-RPC handshake from + # a shell is fragile; vally/copilot-sdk drives the protocol + # properly during the eval step. + - script: | + cd $(Build.SourcesDirectory)/tools/azsdk-cli/Azure.Sdk.Tools.Cli + echo "--- CLI commands ---" + dotnet run --no-build -- --help 2>&1 | head -40 || true + echo "--- list registered MCP tools ---" + dotnet run --no-build -- list 2>&1 | head -200 || true + displayName: MCP server smoke test (informational) + continueOnError: true + env: + AZSDKTOOLS_AGENT_TESTING: "true" + AZSDKTOOLS_COLLECT_TELEMETRY: "false" + AZURE_SDK_KB_ENDPOINT: "http://localhost:8088" + + # Run evaluations + # The vally config (.vally.yaml) lives in the skill's evaluate/ + # directory; cd there so vally discovers evals/*.eval.yaml. 
+ # Vally requires timeout to be a duration string (e.g. "30m"), + # but the source eval files use bare integer seconds. Rewrite + # at runtime so the source stays untouched. + - script: | + cd .github/skills/azure-typespec-author/evaluate + mkdir -p results + for f in evals/*.eval.yaml; do + sed -i -E 's/^([[:space:]]*timeout:[[:space:]]+)([0-9]+)([[:space:]]*(#.*)?)$/\1"\2s"\3/' "$f" + done + vally eval --suite all --output-dir results --verbose + displayName: Run evaluations + continueOnError: true + env: + GITHUB_TOKEN: $(azuresdk-copilot-github-pat) + + - script: | + cd .github/skills/azure-typespec-author/evaluate + mkdir -p results-2 + vally eval --eval-spec eval.yaml --output-dir results-2 --workers 3 --verbose + displayName: Run all evaluations + continueOnError: true + condition: succeededOrFailed() + env: + GITHUB_TOKEN: $(azuresdk-copilot-github-pat) + + templateContext: + outputs: + - output: pipelineArtifact + path: $(Build.SourcesDirectory)/.github/skills/azure-typespec-author/evaluate/results + artifact: eval-results-$(Build.BuildId) + displayName: Upload eval results + condition: always() + - output: pipelineArtifact + path: $(Build.SourcesDirectory)/.github/skills/azure-typespec-author/evaluate/results-2 + artifact: eval-results2-$(Build.BuildId) + displayName: Upload eval results 2 + condition: always()