11use crate :: compute:: ComputeNode ;
22use anyhow:: { Context , Result , bail} ;
3- use compute_api:: {
4- responses :: { LfcPrewarmState , PromoteState , SafekeepersLsn } ,
5- spec :: ComputeMode ,
6- } ;
3+ use compute_api:: responses :: { LfcPrewarmState , PromoteConfig , PromoteState } ;
4+ use compute_api :: spec :: ComputeMode ;
5+ use itertools :: Itertools ;
6+ use std :: collections :: HashMap ;
77use std:: { sync:: Arc , time:: Duration } ;
88use tokio:: time:: sleep;
9+ use tracing:: info;
910use utils:: lsn:: Lsn ;
1011
1112impl ComputeNode {
1213 /// Returns only when promote fails or succeeds. If a network error occurs
1314 /// and http client disconnects, this does not stop promotion, and subsequent
1415 /// calls block until promote finishes.
1516 /// Called by control plane on secondary after primary endpoint is terminated
16- pub async fn promote ( self : & Arc < Self > , safekeepers_lsn : SafekeepersLsn ) -> PromoteState {
17+ /// Has a failpoint "compute-promotion"
18+ pub async fn promote ( self : & Arc < Self > , cfg : PromoteConfig ) -> PromoteState {
1719 let cloned = self . clone ( ) ;
20+ let promote_fn = async move || {
21+ let Err ( err) = cloned. promote_impl ( cfg) . await else {
22+ return PromoteState :: Completed ;
23+ } ;
24+ tracing:: error!( %err, "promoting" ) ;
25+ PromoteState :: Failed {
26+ error : format ! ( "{err:#}" ) ,
27+ }
28+ } ;
29+
1830 let start_promotion = || {
1931 let ( tx, rx) = tokio:: sync:: watch:: channel ( PromoteState :: NotPromoted ) ;
20- tokio:: spawn ( async move {
21- tx. send ( match cloned. promote_impl ( safekeepers_lsn) . await {
22- Ok ( _) => PromoteState :: Completed ,
23- Err ( err) => {
24- tracing:: error!( %err, "promoting" ) ;
25- PromoteState :: Failed {
26- error : err. to_string ( ) ,
27- }
28- }
29- } )
30- } ) ;
32+ tokio:: spawn ( async move { tx. send ( promote_fn ( ) . await ) } ) ;
3133 rx
3234 } ;
3335
@@ -47,9 +49,7 @@ impl ComputeNode {
4749 task. borrow ( ) . clone ( )
4850 }
4951
50- // Why do we have to supply safekeepers?
51- // For secondary we use primary_connection_conninfo so safekeepers field is empty
52- async fn promote_impl ( & self , safekeepers_lsn : SafekeepersLsn ) -> Result < ( ) > {
52+ async fn promote_impl ( & self , mut cfg : PromoteConfig ) -> Result < ( ) > {
5353 {
5454 let state = self . state . lock ( ) . unwrap ( ) ;
5555 let mode = & state. pspec . as_ref ( ) . unwrap ( ) . spec . mode ;
@@ -73,7 +73,7 @@ impl ComputeNode {
7373 . await
7474 . context ( "connecting to postgres" ) ?;
7575
76- let primary_lsn = safekeepers_lsn . wal_flush_lsn ;
76+ let primary_lsn = cfg . wal_flush_lsn ;
7777 let mut last_wal_replay_lsn: Lsn = Lsn :: INVALID ;
7878 const RETRIES : i32 = 20 ;
7979 for i in 0 ..=RETRIES {
@@ -86,7 +86,7 @@ impl ComputeNode {
8686 if last_wal_replay_lsn >= primary_lsn {
8787 break ;
8888 }
89- tracing :: info!( "Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}" ) ;
89+ info ! ( "Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}" ) ;
9090 sleep ( Duration :: from_secs ( 1 ) ) . await ;
9191 }
9292 if last_wal_replay_lsn < primary_lsn {
@@ -96,7 +96,7 @@ impl ComputeNode {
9696 // using $1 doesn't work with ALTER SYSTEM SET
9797 let safekeepers_sql = format ! (
9898 "ALTER SYSTEM SET neon.safekeepers='{}'" ,
99- safekeepers_lsn . safekeepers
99+ cfg . spec . safekeeper_connstrings . join ( "," )
100100 ) ;
101101 client
102102 . query ( & safekeepers_sql, & [ ] )
@@ -106,6 +106,12 @@ impl ComputeNode {
106106 . query ( "SELECT pg_reload_conf()" , & [ ] )
107107 . await
108108 . context ( "reloading postgres config" ) ?;
109+
110+ #[ cfg( feature = "testing" ) ]
111+ fail:: fail_point!( "compute-promotion" , |_| {
112+ bail!( "promotion configured to fail because of a failpoint" )
113+ } ) ;
114+
109115 let row = client
110116 . query_one ( "SELECT * FROM pg_promote()" , & [ ] )
111117 . await
@@ -125,8 +131,36 @@ impl ComputeNode {
125131 bail ! ( "replica in read only mode after promotion" ) ;
126132 }
127133
128- let mut state = self . state . lock ( ) . unwrap ( ) ;
129- state. pspec . as_mut ( ) . unwrap ( ) . spec . mode = ComputeMode :: Primary ;
130- Ok ( ( ) )
134+ {
135+ let mut state = self . state . lock ( ) . unwrap ( ) ;
136+ let spec = & mut state. pspec . as_mut ( ) . unwrap ( ) . spec ;
137+ spec. mode = ComputeMode :: Primary ;
138+ let new_conf = cfg. spec . cluster . postgresql_conf . as_mut ( ) . unwrap ( ) ;
139+ let existing_conf = spec. cluster . postgresql_conf . as_ref ( ) . unwrap ( ) ;
140+ Self :: merge_spec ( new_conf, existing_conf) ;
141+ }
142+ info ! ( "applied new spec, reconfiguring as primary" ) ;
143+ self . reconfigure ( )
144+ }
145+
146+ /// Merge old and new Postgres conf specs to apply on secondary.
147+ /// Change new spec's port and safekeepers since they are supplied
148+ /// differenly
149+ fn merge_spec ( new_conf : & mut String , existing_conf : & str ) {
150+ let mut new_conf_set: HashMap < & str , & str > = new_conf
151+ . split_terminator ( '\n' )
152+ . map ( |e| e. split_once ( "=" ) . expect ( "invalid item" ) )
153+ . collect ( ) ;
154+ new_conf_set. remove ( "neon.safekeepers" ) ;
155+
156+ let existing_conf_set: HashMap < & str , & str > = existing_conf
157+ . split_terminator ( '\n' )
158+ . map ( |e| e. split_once ( "=" ) . expect ( "invalid item" ) )
159+ . collect ( ) ;
160+ new_conf_set. insert ( "port" , existing_conf_set[ "port" ] ) ;
161+ * new_conf = new_conf_set
162+ . iter ( )
163+ . map ( |( k, v) | format ! ( "{k}={v}" ) )
164+ . join ( "\n " ) ;
131165 }
132166}
0 commit comments