Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def initialize(store_rollout, title: "Rollout Status", show_monitoring: true)
:id,
:parent_release, to: :store_rollout
delegate :release, :platform, to: :release_platform_run
delegate :automatic_rollout?, to: :release_platform_run

def decorated_status
status_picker(STATUS, status)
Expand Down Expand Up @@ -122,7 +123,7 @@ def more_actions
confirm: "Are you sure you want to rollout to all users?"
},
{
text: "Pause rollout",
text: "Pause automatic rollout",
path: pause_store_rollout_path(id),
scheme: :danger,
disabled: !automatic_rollout?,
Expand Down
4 changes: 3 additions & 1 deletion app/controllers/config/release_platforms_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ def config_params
production_release_attributes: [
:id,
submissions_attributes: [
:id, :submission_type, :_destroy, :rollout_stages, :rollout_enabled, :finish_rollout_in_next_release
:id, :submission_type, :_destroy, :rollout_stages, :rollout_enabled, :finish_rollout_in_next_release,
:automatic_rollout
]
],
internal_workflow_attributes: [
Expand Down Expand Up @@ -155,6 +156,7 @@ def update_production_release_rollout_stages(production_release_attributes)
submission_attributes[:rollout_stages] = submission_attributes[:rollout_stages].safe_csv_parse(coerce_float: true)
else
submission_attributes[:finish_rollout_in_next_release] = false
submission_attributes[:automatic_rollout] = false
end
end

Expand Down
2 changes: 1 addition & 1 deletion app/controllers/store_rollouts_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def resume
end

def halt
if (res = Action.halt_the_store_rollout!(@store_rollout)).ok?
if (res = Action.halt_the_store_rollout!(@store_rollout, manually: true)).ok?
redirect_back fallback_location: root_path, notice: t(".halt.success")
else
redirect_back fallback_location: root_path, flash: {error: t(".halt.failure", errors: res.error.message)}
Expand Down
19 changes: 19 additions & 0 deletions app/jobs/halt_unhealthy_release_rollout_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
class HaltUnhealthyReleaseRolloutJob < ApplicationJob
queue_as :high

def perform(production_release_id, event_id)
production_release = ProductionRelease.find(production_release_id)
event = ReleaseHealthEvent.find(event_id)

if production_release.unhealthy?
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Find ProductionRelease model
fd -t f "production_release\.rb|production_releases\.rb" -type f

Repository: tramlinehq/tramline

Length of output: 233


🏁 Script executed:

#!/bin/bash
# Search for healthy? and unhealthy? method definitions in ProductionRelease
ast-grep --pattern 'def healthy?
  $$$
'

ast-grep --pattern 'def unhealthy?
  $$$
'

Repository: tramlinehq/tramline

Length of output: 1701


🏁 Script executed:

#!/bin/bash
# Also check for IncreaseHealthyReleaseRolloutJob
fd -t f "increase_healthy.*job" -type f

Repository: tramlinehq/tramline

Length of output: 233


🏁 Script executed:

#!/bin/bash
# Find all jobs to understand the context better
fd -t f ".*_job\.rb" app/jobs/ | head -20

Repository: tramlinehq/tramline

Length of output: 861


🏁 Script executed:

#!/bin/bash
# Read the halt_unhealthy_release_rollout_job.rb file
cat -n app/jobs/halt_unhealthy_release_rollout_job.rb

Repository: tramlinehq/tramline

Length of output: 832


🏁 Script executed:

#!/bin/bash
# Read the increase_healthy_release_rollout_job.rb file
cat -n app/jobs/increase_healthy_release_rollout_job.rb

Repository: tramlinehq/tramline

Length of output: 1054


🏁 Script executed:

#!/bin/bash
# Check Release model for healthy? method
rg -n "def healthy\?" app/models/release.rb -A 10

Repository: tramlinehq/tramline

Length of output: 45


🏁 Script executed:

#!/bin/bash
# Check the relationship between Release and ProductionRelease
rg -n "production_release" app/models/release.rb -B 2 -A 2 | head -40

Repository: tramlinehq/tramline

Length of output: 1057


🏁 Script executed:

#!/bin/bash
# Broader search for healthy? in the Release model file
cat -n app/models/release.rb | grep -C 5 "healthy"

Repository: tramlinehq/tramline

Length of output: 1102


🏁 Script executed:

#!/bin/bash
# Search for any healthy? method definitions in the entire codebase
rg -n "^\s*def healthy\?" app/models/ -B 2

Repository: tramlinehq/tramline

Length of output: 339


Define Release#healthy? method or use inverse predicate in IncreaseHealthyReleaseRolloutJob.

The Release model lacks a healthy? method, but IncreaseHealthyReleaseRolloutJob calls release.healthy? on line 11. The Release model only defines unhealthy? (which checks if any platform run is unhealthy). Add Release#healthy? as the inverse of unhealthy?, or refactor the job to use !release.unhealthy? for consistency with the predicate pattern used elsewhere.

🤖 Prompt for AI Agents
In app/jobs/halt_unhealthy_release_rollout_job.rb around line 7, the job and
IncreaseHealthyReleaseRolloutJob expect Release#healthy? to exist but the model
only implements unhealthy?; add a Release#healthy? instance method that returns
the inverse of unhealthy? (e.g., !unhealthy?) so both predicates are available
and consistent, or alternatively update the jobs to use !release.unhealthy?
everywhere; implement the method in the Release model to return !unhealthy? and
update tests if any rely on healthy?.

store_rollout = production_release.store_rollout

if store_rollout.is_a?(PlayStoreRollout) && store_rollout.automatic_rollout?
result = Action.halt_the_store_rollout!(store_rollout)

# Mark the event as action triggered if halt was successful
event.update(action_triggered: true) if result.ok?
end
end
end
end
29 changes: 29 additions & 0 deletions app/jobs/increase_healthy_release_rollout_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Advances an automatic Play Store rollout while its parent release is healthy,
# then re-enqueues itself to run again one AUTO_ROLLOUT_RUN_INTERVAL later.
class IncreaseHealthyReleaseRolloutJob < ApplicationJob
  queue_as :high

  # play_store_rollout_id - id of the PlayStoreRollout to advance.
  #
  # Exits without rescheduling when the rollout is completed, fully released,
  # or not flagged as an automatic rollout.
  def perform(play_store_rollout_id)
    rollout = PlayStoreRollout.find(play_store_rollout_id)
    return if rollout.completed? || rollout.fully_released? || !rollout.automatic_rollout?

    resume_and_increase(rollout) if rollout.parent_release.healthy?

    interval = PlayStoreRollout::AUTO_ROLLOUT_RUN_INTERVAL

    # Stamp this run and the next expected run so that the verify rollout job
    # does not pick this rollout up again.
    rollout.update!(
      automatic_rollout_updated_at: Time.current,
      automatic_rollout_next_update_at: Time.current + interval
    )

    IncreaseHealthyReleaseRolloutJob.perform_in(interval, play_store_rollout_id)
  end

  private

  # Resumes the rollout if it is halted, then increases it if it is started.
  def resume_and_increase(rollout)
    Action.resume_the_store_rollout!(rollout) if rollout.halted?
    Action.increase_the_store_rollout!(rollout) if rollout.started?
  end
end
13 changes: 13 additions & 0 deletions app/jobs/verify_automatic_rollout_job.rb
Copy link
Member

@kitallis kitallis Apr 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

problem This job can cause double/multi increments to the rollout. Scenario:

  1. cascading job from IncreaseHealthyReleaseRolloutJob scheduled for day2/11am
  2. the job gets delayed by 1 minute (because of queue buildup)
  3. Verify kicks in because automatic_rollout_updated_at was at day1/11:00am and automatic_rollout_next_update_at is at day2/11:00am (so diff is > 300 seconds)
  4. it triggers the increase rollout job
  5. the delayed job from point 2 also kicks in
  6. the increment ends up happening twice in 2 minutes

there are multiple problems here:

  1. IncreaseHealthyReleaseRolloutJob is not fully idempotent because it only looks at time, I think it also needs to look at the "stage" of the rollout
  2. the verification check of 300 seconds should not look at the diff between next and last; it should probably just do a "delta" verification, i.e. "I was supposed to run at t0, but I still haven't run by t0 + <large jitter like 1 hour>", because its purpose should be an integrity check rather than another way of updating the rollout

Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Integrity check for automatic Play Store rollouts.
#
# IncreaseHealthyReleaseRolloutJob reschedules itself and records when it is
# next expected to run in automatic_rollout_next_update_at. This job only
# re-triggers it for rollouts whose expected run is overdue by more than
# OVERDUE_GRACE_PERIOD, so that a self-rescheduled job which is merely a little
# late in the queue is not raced (which would increase the rollout twice).
class VerifyAutomaticRolloutJob < ApplicationJob
  queue_as :high

  # How far past automatic_rollout_next_update_at a rollout must be before it
  # is treated as missed rather than just delayed.
  OVERDUE_GRACE_PERIOD = 1.hour

  # Enqueues IncreaseHealthyReleaseRolloutJob for every automatic rollout whose
  # next scheduled update is overdue by more than the grace period.
  #
  # The previous "next_update_at - updated_at >= 300 second" predicate was
  # dropped: both columns are written together, one run interval apart, so it
  # did not distinguish a missed run from a run that is about to happen.
  def perform
    overdue_before = OVERDUE_GRACE_PERIOD.ago

    PlayStoreRollout.automatic_rollouts
      .where(automatic_rollout_next_update_at: ...overdue_before)
      .find_each { |rollout| IncreaseHealthyReleaseRolloutJob.perform_async(rollout.id) }
  end
end
7 changes: 6 additions & 1 deletion app/libs/coordinators.rb
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,15 @@ def self.resume_the_store_rollout!(rollout)
Res.new { true }
end

def self.halt_the_store_rollout!(rollout)
def self.halt_the_store_rollout!(rollout, manually: false)
return Res.new { raise "release is not actionable" } unless rollout.actionable?
return Res.new { raise "rollout is not started" } unless rollout.started?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

problem: our state machine allows moving from halted back to started (so we can increase rollout) but this guard clause will prevent it from actually happening and blow up

we need to remove this I think

rollout.halt_release!

if manually && rollout.automatic_rollout?
rollout.disable_automatic_rollout!
end

return Res.new { raise rollout.errors.full_messages.to_sentence } if rollout.errors?
Res.new { true }
end
Expand Down
29 changes: 18 additions & 11 deletions app/models/app_store_rollout.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,20 @@
#
# Table name: store_rollouts
#
# id :uuid not null, primary key
# completed_at :datetime
# config :decimal(8, 5) default([]), not null, is an Array
# current_stage :integer
# is_staged_rollout :boolean default(FALSE)
# status :string not null
# type :string not null
# created_at :datetime not null
# updated_at :datetime not null
# release_platform_run_id :uuid not null, indexed
# store_submission_id :uuid indexed
# id :uuid not null, primary key
# automatic_rollout :boolean default(FALSE), not null
# automatic_rollout_next_update_at :datetime
# automatic_rollout_updated_at :datetime
# completed_at :datetime
# config :decimal(8, 5) default([]), not null, is an Array
# current_stage :integer
# is_staged_rollout :boolean default(FALSE)
# status :string not null
# type :string not null
# created_at :datetime not null
# updated_at :datetime not null
# release_platform_run_id :uuid not null, indexed
# store_submission_id :uuid indexed
#
class AppStoreRollout < StoreRollout
include Passportable
Expand Down Expand Up @@ -124,6 +127,10 @@ def halt_release!
end
end

# Intentionally does nothing for App Store rollouts. Defined so the method can
# be called on a rollout without first checking which store it belongs to.
def disable_automatic_rollout!
# noop for app store
end

def release_fully!
with_lock do
return unless may_fully_release?
Expand Down
5 changes: 4 additions & 1 deletion app/models/config/submission.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#
# id :bigint not null, primary key
# auto_promote :boolean default(FALSE)
# automatic_rollout :boolean default(FALSE)
# finish_rollout_in_next_release :boolean default(FALSE), not null
# integrable_type :string
# number :integer indexed, indexed => [release_step_config_id]
Expand Down Expand Up @@ -49,7 +50,8 @@ def as_json(options = {})
finish_rollout_in_next_release: finish_rollout_in_next_release,
rollout_config: {
enabled: rollout_enabled,
stages: rollout_stages
stages: rollout_stages,
automatic: automatic_rollout
}
}
end
Expand Down Expand Up @@ -78,6 +80,7 @@ def self.from_json(json, read_only: false)
submission.submission_external = Config::SubmissionExternal.from_json(json["submission_config"])
submission.rollout_stages = json.dig("rollout_config", "stages")
submission.rollout_enabled = json.dig("rollout_config", "enabled")
submission.automatic_rollout = json.dig("rollout_config", "automatic")
submission
end

Expand Down
61 changes: 46 additions & 15 deletions app/models/play_store_rollout.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,20 @@
#
# Table name: store_rollouts
#
# id :uuid not null, primary key
# completed_at :datetime
# config :decimal(8, 5) default([]), not null, is an Array
# current_stage :integer
# is_staged_rollout :boolean default(FALSE)
# status :string not null
# type :string not null
# created_at :datetime not null
# updated_at :datetime not null
# release_platform_run_id :uuid not null, indexed
# store_submission_id :uuid indexed
# id :uuid not null, primary key
# automatic_rollout :boolean default(FALSE), not null
# automatic_rollout_next_update_at :datetime
# automatic_rollout_updated_at :datetime
# completed_at :datetime
# config :decimal(8, 5) default([]), not null, is an Array
# current_stage :integer
# is_staged_rollout :boolean default(FALSE)
# status :string not null
# type :string not null
# created_at :datetime not null
# updated_at :datetime not null
# release_platform_run_id :uuid not null, indexed
# store_submission_id :uuid indexed
#
class PlayStoreRollout < StoreRollout
include Passportable
Expand All @@ -23,20 +26,32 @@ class PlayStoreRollout < StoreRollout
STAMPABLE_REASONS = %w[
started
updated
paused
resumed
halted
completed
failed
fully_released
]

AUTO_ROLLOUT_RUN_INTERVAL = 24.hours

aasm safe_state_machine_params(with_lock: false) do
state :created, initial: true
state(*STATES.keys)

event :start, after_commit: :on_start! do
transitions from: :created, to: :started
transitions from: :halted, to: :started
transitions from: :paused, to: :started
end

event :pause do
transitions from: :started, to: :paused
end

event :resume do
transitions from: :paused, to: :started
end

event :halt, after_commit: :on_halt! do
Expand All @@ -56,8 +71,6 @@ class PlayStoreRollout < StoreRollout

# Always true for this rollout type.
def controllable_rollout?
  true
end

def automatic_rollout? = false

def start_release!(retry_on_review_fail: false)
if staged_rollout?
# return mock_start_play_store_rollout! if sandbox_mode?
Expand Down Expand Up @@ -128,13 +141,20 @@ def halt_release!
end
end

# Persists automatic_rollout = false on this rollout; raises if the update fails.
def disable_automatic_rollout! = update!(automatic_rollout: false)

def resume_release!
with_lock do
return unless may_start?

result = rollout(last_rollout_percentage, retry_on_review_fail: true)
if result.ok?
start!
if release_platform_run.automatic_rollout?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

problem: I think this is why you probably added the long dig to find the config in release_platform_run, but it's not necessary.

we can just do

play_store_submission.conf.automatic_rollout?

and then delegate can be chucked

delegate :automatic_rollout?, to: :release_platform_run

update!(automatic_rollout: true)
end
event_stamp!(reason: :resumed, kind: :notice, data: stamp_data)
notify!("Rollout was resumed", :production_rollout_resumed, notification_params)
else
Expand All @@ -143,8 +163,19 @@ def resume_release!
end
end

def rollout_active?
provider.build_active?(submission_channel_id, build_number, raise_on_lock_error: true)
# Pauses an automatic rollout.
#
# Runs inside with_lock. Returns early, doing nothing, unless may_pause? is
# true. Otherwise it switches automatic_rollout off, fires the pause! state
# event, records a :paused event stamp and sends the
# :production_rollout_paused notification.
def pause_release!
with_lock do
return unless may_pause?

# Turn the automatic flag off before transitioning to the paused state.
disable_automatic_rollout!
pause!
# NOTE(review): the stamp is recorded with kind :error — confirm that is intended for a pause.
event_stamp!(reason: :paused, kind: :error, data: stamp_data)
notify!("Automatic rollout has been paused", :production_rollout_paused, notification_params)
end
end

def rollout_in_progress?
provider.build_in_progress?(submission_channel_id, build_number, raise_on_lock_error: true)
rescue GooglePlayStoreIntegration::LockAcquisitionError => e
errors.add(:base, e.message)
false
Expand Down
2 changes: 1 addition & 1 deletion app/models/play_store_submission.rb
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def release_notes
def on_prepare!
event_stamp!(reason: :prepared, kind: :notice, data: stamp_data)
config = conf.rollout_stages.presence || []
create_play_store_rollout!(release_platform_run:, config:, is_staged_rollout: staged_rollout?)
create_play_store_rollout!(release_platform_run:, config:, is_staged_rollout: staged_rollout?, automatic_rollout: auto_rollout?)
play_store_rollout.start_release!(retry_on_review_fail: internal_channel?) if auto_rollout?
end

Expand Down
6 changes: 6 additions & 0 deletions app/models/release_health_event.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class ReleaseHealthEvent < ApplicationRecord
delegate :notify!, to: :production_release

after_create_commit :notify_health_rule_triggered
after_create_commit :trigger_halt_on_unhealthy

private

Expand Down Expand Up @@ -59,4 +60,9 @@ def rule_triggers
# Returns the last event (ordered by event_timestamp) recorded for the same
# health rule on the same production release, excluding this record itself.
def previous_event
  events_for_rule = production_release.release_health_events.for_rule(release_health_rule)
  events_for_rule.where.not(id:).reorder("event_timestamp").last
end

# Enqueues HaltUnhealthyReleaseRolloutJob for this event's production release,
# but only when the event is unhealthy.
def trigger_halt_on_unhealthy
  HaltUnhealthyReleaseRolloutJob.perform_async(production_release.id, id) if unhealthy?
end
end
2 changes: 2 additions & 0 deletions app/models/release_platform_run.rb
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,8 @@ def previously_completed_rollout_run

# Builds a Config::ReleasePlatform from this run's stored config.
def conf
  Config::ReleasePlatform.from_json(config)
end

# Reads automatic_rollout from the first submission of the production release
# config; returns false when any link in that chain is missing or the value is
# falsy.
def automatic_rollout?
  first_submission = conf.production_release&.submissions&.first
  first_submission&.automatic_rollout || false
end

private

def set_config
Expand Down
26 changes: 15 additions & 11 deletions app/models/store_rollout.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,20 @@
#
# Table name: store_rollouts
#
# id :uuid not null, primary key
# completed_at :datetime
# config :decimal(8, 5) default([]), not null, is an Array
# current_stage :integer
# is_staged_rollout :boolean default(FALSE)
# status :string not null
# type :string not null
# created_at :datetime not null
# updated_at :datetime not null
# release_platform_run_id :uuid not null, indexed
# store_submission_id :uuid indexed
# id :uuid not null, primary key
# automatic_rollout :boolean default(FALSE), not null
# automatic_rollout_next_update_at :datetime
# automatic_rollout_updated_at :datetime
# completed_at :datetime
# config :decimal(8, 5) default([]), not null, is an Array
# current_stage :integer
# is_staged_rollout :boolean default(FALSE)
# status :string not null
# type :string not null
# created_at :datetime not null
# updated_at :datetime not null
# release_platform_run_id :uuid not null, indexed
# store_submission_id :uuid indexed
#
class StoreRollout < ApplicationRecord
has_paper_trail
Expand Down Expand Up @@ -47,6 +50,7 @@ class StoreRollout < ApplicationRecord
delegate :stale?, :actionable?, to: :parent_release

scope :production, -> { joins(store_submission: :production_release) }
scope :automatic_rollouts, -> { where(status: [:started, :halted], is_staged_rollout: true, automatic_rollout: true) }

# Predicate form of the is_staged_rollout attribute.
def staged_rollout?
  is_staged_rollout
end

Expand Down
Loading
Loading