@@ -6,8 +6,10 @@ import (
 	"log"
 	"log/slog"
 	"os"
+	"sync"
 	"time"
 
+	"github.com/cenkalti/backoff/v4"
 	"github.com/skupperproject/skupper/api/types"
 	"github.com/skupperproject/skupper/internal/config"
 	"github.com/skupperproject/skupper/internal/flow"
@@ -47,20 +49,6 @@ func (s *StatusSyncClient) Update(ctx context.Context, latest *corev1.ConfigMap)
 	return err
 }
 
-func updateLockOwner(lockname, namespace string, owner *metav1.OwnerReference, cli *internalclient.KubeClient) error {
-	current, err := cli.Kube.CoordinationV1().Leases(namespace).Get(context.TODO(), lockname, metav1.GetOptions{})
-	if err != nil {
-		return err
-	}
-	if owner != nil {
-		current.ObjectMeta.OwnerReferences = []metav1.OwnerReference{
-			*owner,
-		}
-	}
-	_, err = cli.Kube.CoordinationV1().Leases(namespace).Update(context.TODO(), current, metav1.UpdateOptions{})
-	return err
-}
-
 func siteCollector(ctx context.Context, cli *internalclient.KubeClient) {
 	siteData := map[string]string{}
 	platform := config.GetPlatform()
@@ -84,11 +72,6 @@ func siteCollector(ctx context.Context, cli *internalclient.KubeClient) {
 		log.Fatal("Failed to create site status config map ", err.Error())
 	}
 
-	err = updateLockOwner(types.SiteLeaderLockName, cli.Namespace, &owner, cli)
-	if err != nil {
-		log.Println("Update lock error", err.Error())
-	}
-
 	factory := session.NewContainerFactory("amqp://localhost:5672", session.ContainerConfig{ContainerID: "kube-flow-collector"})
 	statusSyncClient := &StatusSyncClient{
 		client: cli.Kube.CoreV1().ConfigMaps(cli.Namespace),
@@ -136,37 +119,56 @@ func startFlowController(ctx context.Context, cli *internalclient.KubeClient) er
 }
 
 func runLeaderElection(lock *resourcelock.LeaseLock, id string, cli *internalclient.KubeClient) {
-	ctx := context.Background()
-	begin := time.Now()
-	podname, _ := os.Hostname()
-	leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
-		Lock:            lock,
-		ReleaseOnCancel: true,
-		LeaseDuration:   15 * time.Second,
-		RenewDeadline:   10 * time.Second,
-		RetryPeriod:     2 * time.Second,
-		Callbacks: leaderelection.LeaderCallbacks{
-			OnStartedLeading: func(c context.Context) {
-				log.Printf("COLLECTOR: Leader %s starting site collection after %s\n", podname, time.Since(begin))
-				siteCollector(ctx, cli)
-				if err := startFlowController(ctx, cli); err != nil {
-					log.Printf("COLLECTOR: Failed to start controller for emitting site events: %s", err)
-				}
-			},
-			OnStoppedLeading: func() {
-				// we held the lock but lost it. This indicates that something
-				// went wrong. Exit and restart.
-				log.Fatalf("COLLECTOR: Lost leader lock after %s", time.Since(begin))
-			},
-			OnNewLeader: func(current_id string) {
-				if current_id == id {
-					// Remain as the leader
-					return
-				}
-				log.Printf("COLLECTOR: New leader for site collection is %s\n", current_id)
+	var (
+		mu              sync.Mutex
+		leaderCtx       context.Context
+		leaderCtxCancel func()
+	)
+	// attempt to run leader election forever
+	strategy := backoff.NewExponentialBackOff(backoff.WithMaxElapsedTime(0))
+	backoff.RetryNotify(func() error {
+		leaderelection.RunOrDie(context.Background(), leaderelection.LeaderElectionConfig{
+			Lock:            lock,
+			ReleaseOnCancel: true,
+			LeaseDuration:   15 * time.Second,
+			RenewDeadline:   10 * time.Second,
+			RetryPeriod:     2 * time.Second,
+			Callbacks: leaderelection.LeaderCallbacks{
+				OnStartedLeading: func(ctx context.Context) {
+					mu.Lock()
+					defer mu.Unlock()
+					leaderCtx, leaderCtxCancel = context.WithCancel(ctx)
+					log.Printf("COLLECTOR: Became leader. Starting status sync and site controller after %s.", strategy.GetElapsedTime())
+					siteCollector(leaderCtx, cli)
+					if err := startFlowController(leaderCtx, cli); err != nil {
+						log.Printf("COLLECTOR: Failed to start controller for emitting site events: %s", err)
+					}
+				},
+				OnStoppedLeading: func() {
+					log.Printf("COLLECTOR: Lost leader lock after %s. Stopping status sync and site controller.", strategy.GetElapsedTime())
+					mu.Lock()
+					defer mu.Unlock()
+					if leaderCtxCancel == nil {
+						return
+					}
+					leaderCtxCancel()
+					leaderCtx, leaderCtxCancel = nil, nil
+				},
+				OnNewLeader: func(current_id string) {
+					if current_id == id {
+						// Remain as the leader
+						return
+					}
+					log.Printf("COLLECTOR: New leader for site collection is %s\n", current_id)
+				},
 			},
-		},
-	})
+		})
+		return fmt.Errorf("leader election died")
+	},
+		strategy,
+		func(_ error, d time.Duration) {
+			log.Printf("COLLECTOR: leader election failed. retrying after %s", d)
+		})
 }
 
 func StartCollector(cli *internalclient.KubeClient) {
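For context, a minimal, self-contained sketch of the retry pattern the new runLeaderElection relies on: backoff.RetryNotify driving an ExponentialBackOff whose MaxElapsedTime is zero, so the wrapped operation is retried indefinitely and the process re-enters the election whenever RunOrDie returns instead of exiting. This sketch assumes github.com/cenkalti/backoff/v4 at v4.3.0 or later (for the WithMaxElapsedTime option), and runElection is a hypothetical stand-in for leaderelection.RunOrDie.

// Minimal sketch of the retry loop; runElection is a hypothetical stand-in
// for leaderelection.RunOrDie.
package main

import (
	"errors"
	"log"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func runElection() {
	// Stand-in for leaderelection.RunOrDie: blocks while this process
	// participates in the election and returns when participation ends.
	time.Sleep(time.Second)
}

func main() {
	// A MaxElapsedTime of zero means the backoff never expires, so the
	// operation is retried forever with exponentially increasing delays.
	strategy := backoff.NewExponentialBackOff(backoff.WithMaxElapsedTime(0))
	_ = backoff.RetryNotify(
		func() error {
			runElection()
			// Returning a non-nil error tells RetryNotify to back off and retry.
			return errors.New("leader election returned")
		},
		strategy,
		func(_ error, d time.Duration) {
			log.Printf("election ended; re-entering after %s", d)
		},
	)
}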