Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
c19a507
WIP hub deployment
abatallas May 20, 2025
e3b0e7d
Further refinement of deploy hub script and creation of deploy spoke …
abatallas May 23, 2025
1dc0929
Pass along monitoring ingestion endpoint to bicep
abatallas May 23, 2025
bd26bdf
Accept command line input in spoke deployment script
abatallas May 23, 2025
71cc398
Add RA to MI for monitoring in Bicep
abatallas May 23, 2025
b971d5c
Ingest monitoring parameters in create_cc_param.py
abatallas May 23, 2025
5887a11
Incident: passwords stored in git
ryanhamel May 26, 2025
fefd0bf
WIP: tweak spoke to use outputs/ dir
ryanhamel May 26, 2025
d41a795
WIP add hub-mi bicep and creation script
ryanhamel May 27, 2025
0062bbf
WIP add custom slurm template
ryanhamel May 27, 2025
07ab3c7
ignore bicep/hub/build dir
ryanhamel May 27, 2025
5098145
make init.sh executable
ryanhamel May 28, 2025
4196f3d
Use params folder and add --what-if to create_hub.sh
ryanhamel May 28, 2025
d5037d7
add ENTER default values, until this is a template
ryanhamel May 28, 2025
a782b04
add what-if, monitoring params, minor fixes
ryanhamel May 28, 2025
214c34e
Add htc2 nodearray, custom template, monitoring params
ryanhamel May 28, 2025
28d48dc
Remove metrics role from jetpack MI
ryanhamel May 28, 2025
81c0696
Add hub-mi to hub and spoke
ryanhamel May 28, 2025
505c8b1
add basic README.md
ryanhamel May 28, 2025
625a6fc
Fix the slurm template to load projects from cyclecloud
aditigaur4 May 28, 2025
f433d70
Add htc2 to partitions output
ryanhamel May 30, 2025
9780987
add pyxis cluster-init
ryanhamel May 30, 2025
2f9e826
Copy custom slurm.txt instead of prod slurm template to user home dir
ryanhamel May 30, 2025
63defb3
Fix deploy_spoke.sh support for grafana inputs
ryanhamel May 30, 2025
17330ff
update slurm.txt
ryanhamel May 30, 2025
51a1b84
WIP README updates
ryanhamel May 30, 2025
c1821aa
Add 3 new nodearrays and synchronize to latest standard spoke settings
ryanhamel Jun 2, 2025
c9e351a
README: Add steps for installing cyclecloud8 via /opt/ccw/install.sh
ryanhamel Jun 2, 2025
6285122
Update hub readme with private endpoint instructions and create a def…
abatallas Jun 2, 2025
3bbf825
Larger scheduler, ANF, and use existing dns for storage
ryanhamel Jun 3, 2025
cd1bf2e
Blob: add build 3408
ryanhamel Jun 3, 2025
1160b6a
Use two gpu partitions
ryanhamel Jun 3, 2025
94766e9
Enable SPG on hpc/hpc2
ryanhamel Jun 3, 2025
1087e6a
Fix error in ccwBastion module scope that blocks deployment
abatallas Jun 3, 2025
a27aedb
Delete NVMe-related scripts for gb200 clusters (#279)
abatallas Jun 3, 2025
7f95375
Blobs: use build 3433
ryanhamel Jun 3, 2025
6844708
Merge branch 'abatallas/gb200_hub_spoke' of https://github.com/Azure/…
ryanhamel Jun 3, 2025
481835d
Blobs: use build 3438
bwatrous Jun 19, 2025
5f1ade7
Create vnet link to hub private DNS zone and correct README step 5 in…
abatallas Jul 10, 2025
2a9ebba
Add monitoring project version as parameter in mainTemplate.bicep
abatallas Jul 10, 2025
dcc8874
Fix file path in create_hub_mi.sh
abatallas Jul 10, 2025
97913b4
Remove accelerated networking auto-enable patch from install.sh (#295)
abatallas Jul 15, 2025
113adb7
updated preview CycleCloud 8.8.0 release to build 3455
bwatrous Jul 16, 2025
40ebba1
updated preview CycleCloud 8.8.0 release to build 3455
bwatrous Jul 16, 2025
c1ad11f
Update spoke deployment name for uniqueness across regions
abatallas Jul 17, 2025
fe80e0c
Update custom Slurm template
abatallas Jul 17, 2025
d21ddff
Update b64-encoded create_cc_param.py
abatallas Jul 17, 2025
0d78761
Raise monitoring project version to 1.0.1
abatallas Jul 17, 2025
e1763e9
Update monitoring project version to 1.0.1 in hub creation script
abatallas Jul 17, 2025
59d97ad
Assign Monitoring Metrics Publisher role to hub MI for DCR RG rather …
abatallas Jul 18, 2025
55cd920
Script for in-place update of CC to latest insiders fast build with C…
abatallas Aug 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -407,3 +407,5 @@ arm-ttk/

# delete_roles.sh files
util/.role_assignment_cleanup*
bicep/hub/build/
bicep/hub/params/*.json
4 changes: 2 additions & 2 deletions bicep/anf.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import {tags_t, availabilityZone_t} from './types.bicep'
param name string
param location string
param tags tags_t
param availabilityZone availabilityZone_t[]
param availabilityZone availabilityZone_t[] = []
param resourcePostfix string = uniqueString(resourceGroup().id)
param subnetId string
param serviceLevel string
param sizeTiB int
param defaultMountOptions string
param defaultMountOptions string = 'rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev,nconnect=8'
param infrastructureOnly bool = false
var capacity = sizeTiB * 1024 * 1024 * 1024 * 1024

Expand Down
61 changes: 42 additions & 19 deletions bicep/ccw.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ param insidersBuild bool

param branch string
param projectVersion string
param pyxisProjectVersion string
param monitoringProjectVersion string
param monitoringIngestionEndpoint string
param monitoringIdentityClientId string
param hubMI string

param adminUsername string
@secure()
Expand All @@ -25,7 +28,9 @@ param clusterInitSpecs types.cluster_init_param_t
param slurmSettings types.slurmSettings_t
param schedulerNode types.scheduler_t
param loginNodes types.login_t
param htc types.htc_t
param d64d types.htc_t
param d16d types.htc_t
param m64 types.htc_t
param hpc types.hpc_t
param gpu types.hpc_t
param tags types.resource_tags_t
Expand Down Expand Up @@ -103,7 +108,7 @@ output vnet types.networkOutput_t = union(
var deploy_bastion = network.?bastion ?? false
module ccwBastion './bastion.bicep' = if (deploy_bastion) {
name: 'ccwBastion'
scope: create_new_vnet ? az.resourceGroup() : az.resourceGroup(split(network.?existing_vnet_id, '/')[4])
scope: az.resourceGroup()
params: {
location: location
tags: getTags('Microsoft.Network/bastionHosts', tags)
Expand Down Expand Up @@ -202,7 +207,7 @@ module mySQLccw './mysql.bicep' = if (create_database) {
params: {
location: location
tags: getTags('Microsoft.DBforMySQL/flexibleServers', tags)
Name: db_name
// Name: db_name
adminUser: adminUsername
adminPassword: databaseAdminPassword
subnetId: subnets.database.id
Expand Down Expand Up @@ -315,25 +320,24 @@ output filerInfoFinal types.filerInfo_t = {
output cyclecloudPrincipalId string = infrastructureOnly ? '' : ccwVM.outputs.principalId

output managedIdentityId string = infrastructureOnly ? '' : ccwManagedIdentity.outputs.managedIdentityId

// Automatically inject the ccw and pyxis cluster init specs
// Automatically inject the ccw and monitoring cluster init specs

var ccwClusterInitSpec = {
type: 'gitHubReleaseURL'
gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-slurm-workspace/releases/tag/', projectVersion)
spec: 'default'
target: ['login', 'scheduler', 'htc', 'hpc', 'gpu', 'dynamic']
target: ['login', 'scheduler', 'd64d', 'd16d', 'm64', 'hpc', 'gpu', 'dynamic']
}

var pyxisClusterInitSpec = {
var monitoringClusterInitSpec = {
type: 'gitHubReleaseURL'
gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-pyxis/releases/tag/', pyxisProjectVersion)
gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-monitoring/releases/tag/', monitoringProjectVersion)
spec: 'default'
target: ['login', 'scheduler', 'htc', 'hpc', 'gpu', 'dynamic']
target: ['login', 'scheduler', 'd64d', 'd16d', 'm64', 'hpc', 'gpu', 'dynamic']
}

// Projects <= 2025.02.06 have the pyxis logic embedded in the ccw cluster init spec
var requiredClusterInitSpecs = [ccwClusterInitSpec, pyxisClusterInitSpec]
// Use of azslurm 4.0 does not require pyxis
var requiredClusterInitSpecs = [ccwClusterInitSpec, monitoringClusterInitSpec]

output clusterInitSpecs types.cluster_init_param_t = union(requiredClusterInitSpecs, clusterInitSpecs)

Expand All @@ -344,12 +348,24 @@ output schedulerNode types.scheduler_t = schedulerNode
output loginNodes types.login_t = loginNodes

output partitions types.partitions_t = {
htc: union({
sku: htc.sku
maxNodes: htc.maxNodes
osImage: htc.osImage
useSpot: htc.?useSpot ?? false
}, contains(htc,'availabilityZone') ? { availabilityZone: htc.?availabilityZone } : {})
d64d: union({
sku: d64d.sku
maxNodes: d64d.maxNodes
osImage: d64d.osImage
useSpot: d64d.?useSpot ?? false
}, contains(d64d,'availabilityZone') ? { availabilityZone: d64d.?availabilityZone } : {})
d16d: union({
sku: d16d.sku
maxNodes: d16d.maxNodes
osImage: d16d.osImage
useSpot: d16d.?useSpot ?? false
}, contains(d16d,'availabilityZone') ? { availabilityZone: d16d.?availabilityZone } : {})
m64: union({
sku: m64.sku
maxNodes: m64.maxNodes
osImage: m64.osImage
useSpot: m64.?useSpot ?? false
}, contains(m64,'availabilityZone') ? { availabilityZone: m64.?availabilityZone } : {})
hpc: hpc
gpu: gpu
}
Expand Down Expand Up @@ -389,7 +405,7 @@ output manualInstall bool = manualInstall
output acceptMarketplaceTerms bool = acceptMarketplaceTerms

output ood object = union(ood, {
version: '1.0.1'
version: '1.1.0'
nic: deployOOD ? oodNIC.outputs.NICId : ''
managedIdentity: deployOOD ? createOODMI ? oodNewManagedIdentity.id : ood.?appManagedIdentityId : ''
clientId: deployOOD ? registerOODApp ? oodApp.outputs.oodClientAppId : ood.?appId : ''
Expand All @@ -402,9 +418,16 @@ output oodManualRegistration object = {
fqdn: deployOOD ? oodNIC.outputs.privateIp : ''
}

output monitoring object = {
ingestionEndpoint: monitoringIngestionEndpoint
identityClientId: monitoringIdentityClientId
}
output hubMI string = hubMI

output files object = {
availability_zones_json: loadTextContent('./files-to-load/encoded/availability_zones.json.base64')
create_cc_param_py: loadTextContent('./files-to-load/encoded/create_cc_param.py.base64')
cyclecloud_install_py: loadTextContent('./files-to-load/encoded/cyclecloud_install.py.base64')
initial_params_json: loadTextContent('./files-to-load/encoded/initial_params.json.base64')
slurm_txt: loadTextContent('./files-to-load/encoded/slurm.txt.base64')
}
1 change: 1 addition & 0 deletions bicep/exports.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ var role_lookup = {
'Storage Account Contributor': resourceId('microsoft.authorization/roleDefinitions', '17d1049b-9a84-46fb-8f53-869881c3d3ab')
'Storage Blob Data Contributor': resourceId('microsoft.authorization/roleDefinitions', 'ba92f5b4-2d11-453d-a403-e96b0029c9fe')
'Storage Blob Data Reader': resourceId('microsoft.authorization/roleDefinitions', '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1')
'Monitoring Metrics Publisher': resourceId('microsoft.authorization/roleDefinitions', '3913510d-42f4-4e42-8a64-420c390055eb')
}
26 changes: 17 additions & 9 deletions bicep/files-to-load/create_cc_param.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,13 @@ def set_slurm_params(params, dbPassword, outputs):
params['SubnetId'] = '/'.join([outputs['vnet']['value']['rg'], outputs['vnet']['value']['name'], outputs['vnet']['value']['computeSubnetName']])

# Define Availability Zone
params['DefineNodesAvailabilityZone'] = any('availabilityZone' in zoneList for zoneList in [outputs['partitions']['value']['htc'], outputs['partitions']['value']['hpc'], outputs['partitions']['value']['gpu']])

#HTC
params['HTCMachineType'] = outputs['partitions']['value']['htc']['sku']
params['MaxHTCExecuteNodeCount'] = int(outputs['partitions']['value']['htc']['maxNodes'])
params['HTCImageName'] = outputs['partitions']['value']['htc']['osImage']
params['HTCUseLowPrio'] = outputs['partitions']['value']['htc']['useSpot']
params['HTCAvailabilityZone'] = outputs['partitions']['value']['htc']['availabilityZone'] if params['DefineNodesAvailabilityZone'] and 'availabilityZone' in outputs['partitions']['value']['htc'] else None
params['DefineNodesAvailabilityZone'] = any('availabilityZone' in zoneList for zoneList in [outputs['partitions']['value']['hpc'], outputs['partitions']['value']['gpu']])

for na in ['D64D', 'D16D', 'M64']:
params[f'{na}MachineType'] = outputs['partitions']['value'][na.lower()]['sku']
params[f'Max{na}NodeCount'] = int(outputs['partitions']['value'][na.lower()]['maxNodes'])
params[f'{na}ImageName'] = outputs['partitions']['value'][na.lower()]['osImage']

#HPC
params['HPCMachineType'] = outputs['partitions']['value']['hpc']['sku']
params['MaxHPCExecuteNodeCount'] = int(outputs['partitions']['value']['hpc']['maxNodes'])
Expand Down Expand Up @@ -96,6 +94,13 @@ def set_slurm_params(params, dbPassword, outputs):
params['AdditionalNFSMountOptions'] = outputs['filerInfoFinal']['value']['additional']['mountOptions']
params['AdditionalNFSAddress'] = outputs['filerInfoFinal']['value']['additional']['ipAddress']

# Monitoring
params['MonitoringEnabled'] = outputs['monitoring']["value"]['ingestionEndpoint'] != ''
params['MonitoringIngestionEndpoint'] = outputs['monitoring']['value']['ingestionEndpoint']
params['MonitoringIdentityClientId'] = outputs['monitoring']['value']['identityClientId']

params['ManagedIdentity'] = outputs['hubMI']['value']


def set_ood_params(params, outputs):
slurm_params = get_json_dict('initial_params.json')
Expand All @@ -119,6 +124,7 @@ def set_ood_params(params, outputs):
params['ood_entra_tenant_id'] = outputs['ood']['value'].get('tenantId')
params['ood_nic'] = outputs['ood']['value'].get('nic')


class ClusterInitSpec:
def __init__(self, project: str, version: str, spec: str, targets: typing.List[str]):
self.project = project
Expand Down Expand Up @@ -197,7 +203,9 @@ def main():
"login": "LoginClusterInitSpecs",
"gpu": "GPUClusterInitSpecs",
"hpc": "HPCClusterInitSpecs",
"htc": "HTCClusterInitSpecs",
"d64d": "D64DClusterInitSpecs",
"d16d": "D16DClusterInitSpecs",
"m64": "M64ClusterInitSpecs",
"scheduler": "SchedulerClusterInitSpecs",
"dynamic": "DynamicClusterInitSpecs",
"ood": "ClusterInitSpecs"
Expand Down
Loading