Skip to content

Commit 6c44e28

Browse files
committed
OCPEDGE-2037: Improve etcd handling
1 parent b98ba18 commit 6c44e28

File tree

2 files changed

+85
-56
lines changed

2 files changed

+85
-56
lines changed

pkg/controllers/etcd.go

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -169,30 +169,59 @@ func stopMicroshiftEtcdScopeIfExists() error {
169169
}
170170

171171
func checkIfEtcdIsReady(ctx context.Context) error {
172-
client, err := getEtcdClient(ctx)
173-
if err != nil {
174-
return fmt.Errorf("failed to obtain etcd client: %v", err)
175-
}
176-
defer func() { _ = client.Close() }()
172+
var client *clientv3.Client
173+
defer func() {
174+
if client != nil {
175+
_ = client.Close()
176+
}
177+
}()
177178

178-
s, err := client.Status(ctx, "localhost:2379")
179-
if err != nil {
180-
return fmt.Errorf("failed to get etcd status: %v", err)
181-
}
182-
if s.IsLearner {
183-
return nil
184-
}
179+
for attempt := 0; attempt < HealthCheckRetries; attempt++ {
180+
if client == nil {
181+
var err error
182+
client, err = getEtcdClient(ctx)
183+
if err != nil {
184+
klog.Infof("failed to obtain etcd client: %v", err)
185+
if attempt < HealthCheckRetries-1 {
186+
time.Sleep(HealthCheckWait)
187+
continue
188+
}
189+
return fmt.Errorf("failed to obtain etcd client after %d attempts: %v", HealthCheckRetries, err)
190+
}
191+
}
192+
193+
s, err := client.Status(ctx, "localhost:2379")
194+
if err != nil {
195+
_ = client.Close()
196+
client = nil
197+
klog.Infof("failed to get etcd status: %v", err)
198+
if attempt < HealthCheckRetries-1 {
199+
time.Sleep(HealthCheckWait)
200+
continue
201+
}
202+
return fmt.Errorf("failed to get etcd status after %d attempts: %v", HealthCheckRetries, err)
203+
}
204+
205+
// If my own instance is a learner I dont need to check readiness because apiserver is going
206+
// to connect to other voting members in the cluster.
207+
if s.IsLearner {
208+
return nil
209+
}
185210

186-
for i := 0; i < HealthCheckRetries; i++ {
187-
time.Sleep(HealthCheckWait)
188211
if _, err = client.Get(ctx, "health"); err == nil {
189212
return nil
190213
} else {
214+
_ = client.Close()
215+
client = nil
191216
klog.Infof("etcd not ready yet: %v", err)
192217
if err == context.Canceled {
193218
return err
194219
}
195220
}
221+
222+
if attempt < HealthCheckRetries-1 {
223+
time.Sleep(HealthCheckWait)
224+
}
196225
}
197226
return fmt.Errorf("etcd still not healthy after checking %d times", HealthCheckRetries)
198227
}

pkg/controllers/kube-apiserver.go

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ func (s *KubeAPIServer) configure(ctx context.Context, cfg *config.Config) error
164164
}
165165
}
166166

167-
etcdServers, err := discoverEtcdServers(ctx, s.configuration.Node.HostnameOverride, s.configuration.BootstrapKubeConfigPath())
167+
etcdServers, err := discoverEtcdServers(ctx, s.configuration.BootstrapKubeConfigPath())
168168
if err != nil {
169169
return fmt.Errorf("failed to discover etcd servers: %w", err)
170170
}
@@ -412,7 +412,7 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
412412
}
413413
}
414414

415-
func discoverEtcdServers(ctx context.Context, hostname, kubeconfigPath string) ([]string, error) {
415+
func discoverEtcdServers(ctx context.Context, kubeconfigPath string) ([]string, error) {
416416
certsDir := cryptomaterial.CertsDirectory(config.DataDir)
417417
etcdPeerCertDir := cryptomaterial.EtcdPeerCertDir(certsDir)
418418

@@ -441,61 +441,61 @@ func discoverEtcdServers(ctx context.Context, hostname, kubeconfigPath string) (
441441
if err != nil {
442442
return nil, fmt.Errorf("failed to get etcd status: %w", err)
443443
}
444-
if st.IsLearner {
445-
//TODO if its a learner I need to take the server from the current non-learner members. Use the bootstrap for that.
446-
kubeconfig, err := clientcmd.LoadFromFile(kubeconfigPath)
447-
if err != nil {
448-
return nil, fmt.Errorf("failed to load bootstrap kubeconfig: %w", err)
449-
}
450444

451-
if kubeconfig == nil || kubeconfig.Clusters == nil || len(kubeconfig.Clusters) == 0 {
452-
return nil, fmt.Errorf("invalid bootstrap kubeconfig: no clusters found")
453-
}
445+
// If I am not a learner it means I am a voting member, so connecting to my own etcd instance
446+
// is fine because everything is synced.
447+
if !st.IsLearner {
448+
return []string{"https://localhost:2379"}, nil
449+
}
454450

455-
var etcdHost string
456-
for _, cluster := range kubeconfig.Clusters {
457-
etcdHost = cluster.Server
458-
break
459-
}
451+
// If I am a learner I need to connect to a member, retrieve the list of voting
452+
// members and connect to all of them.
453+
kubeconfig, err := clientcmd.LoadFromFile(kubeconfigPath)
454+
if err != nil {
455+
return nil, fmt.Errorf("failed to load bootstrap kubeconfig: %w", err)
456+
}
460457

461-
if etcdHost == "" {
462-
return nil, fmt.Errorf("failed to extract etcd hostname from bootstrap kubeconfig")
463-
}
458+
if kubeconfig == nil || kubeconfig.Clusters == nil || len(kubeconfig.Clusters) == 0 {
459+
return nil, fmt.Errorf("invalid bootstrap kubeconfig: no clusters found")
460+
}
464461

465-
etcdHost = strings.TrimPrefix(etcdHost, "https://")
466-
etcdHost, _, _ = net.SplitHostPort(etcdHost)
467-
etcdHost = fmt.Sprintf("https://%s", net.JoinHostPort(etcdHost, "2379"))
468-
client, err = clientv3.New(clientv3.Config{
469-
DialTimeout: 5 * time.Second,
470-
Endpoints: []string{etcdHost},
471-
TLS: tlsConfig,
472-
Context: ctx,
473-
})
474-
if err != nil {
475-
return nil, fmt.Errorf("failed to create etcd client: %w", err)
476-
}
462+
if len(kubeconfig.Clusters) > 1 {
463+
return nil, fmt.Errorf("invalid bootstrap kubeconfig: multiple clusters found")
464+
}
465+
466+
var etcdHost string
467+
for _, cluster := range kubeconfig.Clusters {
468+
etcdHost = cluster.Server
469+
break
470+
}
471+
472+
if etcdHost == "" {
473+
return nil, fmt.Errorf("failed to extract etcd hostname from bootstrap kubeconfig")
474+
}
475+
476+
etcdHost = strings.TrimPrefix(etcdHost, "https://")
477+
etcdHost, _, _ = net.SplitHostPort(etcdHost)
478+
etcdHost = fmt.Sprintf("https://%s", net.JoinHostPort(etcdHost, "2379"))
479+
client, err = clientv3.New(clientv3.Config{
480+
DialTimeout: 5 * time.Second,
481+
Endpoints: []string{etcdHost},
482+
TLS: tlsConfig,
483+
Context: ctx,
484+
})
485+
if err != nil {
486+
return nil, fmt.Errorf("failed to create etcd client: %w", err)
477487
}
478488

479489
resp, err := client.MemberList(ctx)
480490
if err != nil {
481491
return nil, fmt.Errorf("failed to retrieve etcd member list: %w", err)
482492
}
483493

484-
//TODO I already know if I am a learner, I had to do this before.
485-
iAmLearner := false
486494
var members []string
487495
for _, member := range resp.Members {
488-
if member.Name == hostname && member.IsLearner {
489-
iAmLearner = true
490-
continue
491-
}
492496
if !member.IsLearner {
493497
members = append(members, member.ClientURLs...)
494498
}
495499
}
496-
if iAmLearner {
497-
return members, nil
498-
}
499-
500-
return []string{"https://localhost:2379"}, nil
500+
return members, nil
501501
}

0 commit comments

Comments
 (0)