@@ -2055,6 +2055,7 @@ static UCS_F_ALWAYS_INLINE void ucp_worker_keepalive_reset(ucp_worker_h worker)
20552055 worker -> keepalive .ep_count = 0 ;
20562056 worker -> keepalive .iter_count = 0 ;
20572057 worker -> keepalive .iter = & worker -> all_eps ;
2058+ worker -> keepalive .iter_begin = worker -> keepalive .iter ;
20582059 worker -> keepalive .round_count = 0 ;
20592060}
20602061
@@ -2908,42 +2909,50 @@ void ucp_worker_print_info(ucp_worker_h worker, FILE *stream)
29082909}
29092910
29102911static UCS_F_ALWAYS_INLINE ucp_ep_h
2911- ucp_worker_keepalive_current_ep (ucp_worker_h worker )
2912- {
2913- ucp_ep_ext_gen_t * ep_ext ;
2914-
2915- ucs_assert (worker -> keepalive .iter != & worker -> all_eps );
2916- ep_ext = ucs_container_of (worker -> keepalive .iter , ucp_ep_ext_gen_t ,
2917- ep_list );
2918- return ucp_ep_from_ext_gen (ep_ext );
2919- }
2920-
2921- static UCS_F_ALWAYS_INLINE void
29222912ucp_worker_keepalive_next_ep (ucp_worker_h worker )
29232913{
29242914 ucp_ep_h ep ;
29252915
2926- worker -> keepalive .iter = worker -> keepalive .iter -> next ;
2927- if (worker -> keepalive .iter == & worker -> all_eps ) {
2928- /* if next list item points to all_eps then step one more time */
2916+ if (worker -> keepalive .lane_map == 0 ) {
29292917 worker -> keepalive .iter = worker -> keepalive .iter -> next ;
2918+ if (worker -> keepalive .iter == & worker -> all_eps ) {
2919+ return NULL ;
2920+ }
2921+
2922+ worker -> keepalive .lane_map = UCS_MASK (UCP_MAX_LANES );
29302923 }
29312924
29322925 ucs_assert (worker -> keepalive .iter != & worker -> all_eps );
2933- ep = ucp_worker_keepalive_current_ep (worker );
2934- worker -> keepalive .lane_map = ((ep -> cfg_index != UCP_WORKER_CFG_INDEX_NULL ) &&
2935- !(ep -> flags & UCP_EP_FLAG_FAILED )) ?
2936- ucp_ep_config (ep )-> key .ep_check_map : 0 ;
2926+ ep = ucp_ep_from_ext_gen (ucs_container_of (worker -> keepalive .iter ,
2927+ ucp_ep_ext_gen_t , ep_list ));
2928+
2929+ if ((ep -> cfg_index == UCP_WORKER_CFG_INDEX_NULL ) ||
2930+ (ep -> flags & UCP_EP_FLAG_FAILED )) {
2931+ worker -> keepalive .lane_map = 0 ;
2932+ return NULL ;
2933+ }
2934+
2935+ /* Take updated ep_check_map, in case endpoint configuration has changed
2936+ * before continuing this round */
2937+ worker -> keepalive .lane_map &= ucp_ep_config (ep )-> key .ep_check_map ;
2938+ if (worker -> keepalive .lane_map == 0 ) {
2939+ return NULL ;
2940+ }
2941+
2942+ return ep ;
29372943}
29382944
29392945static UCS_F_NOINLINE unsigned
29402946ucp_worker_do_keepalive_progress (ucp_worker_h worker )
29412947{
29422948 unsigned progress_count = 0 ;
2943- unsigned max_ep_count ;
2949+ unsigned max_ep_count = worker -> context -> config . ext . keepalive_num_eps ;
29442950 ucs_time_t now ;
29452951 ucp_ep_h ep ;
29462952
2953+ ucs_assertv (worker -> keepalive .ep_count < max_ep_count ,
2954+ "worker %p: ep_count=%u max_ep_count=%u" ,
2955+ worker , worker -> keepalive .ep_count , max_ep_count );
29472956 ucs_assert (worker -> context -> config .ext .keepalive_num_eps != 0 );
29482957
29492958 now = ucs_get_time ();
@@ -2967,22 +2976,19 @@ ucp_worker_do_keepalive_progress(ucp_worker_h worker)
29672976 goto out_unblock ;
29682977 }
29692978
2970- if (ucs_unlikely (worker -> keepalive .iter == & worker -> all_eps )) {
2971- ucp_worker_keepalive_next_ep (worker );
2972- }
2973-
2974- max_ep_count = ucs_min (worker -> context -> config .ext .keepalive_num_eps ,
2975- worker -> num_all_eps );
2976-
29772979 /* Use own loop for elements because standard for_each skips
29782980 * head element */
29792981 /* TODO: use more optimal algo to enumerate EPs to keepalive
29802982 * (linked list) */
2981- while (worker -> keepalive .ep_count < max_ep_count ) {
2982- ep = ucp_worker_keepalive_current_ep (worker );
2983+ do {
2984+ ep = ucp_worker_keepalive_next_ep (worker );
2985+ if (ep == NULL ) {
2986+ continue ;
2987+ }
2988+
29832989 ucs_trace_func ("worker %p: do keepalive on ep %p lane_map 0x%x" , worker ,
29842990 ep , worker -> keepalive .lane_map );
2985- if (!ucp_ep_do_keepalive (ep )) {
2991+ if (!ucp_ep_do_keepalive (ep , now )) {
29862992 /* If the EP has no resources to send a keepalive message,
29872993 * return without updating the last_round timestamp; the next
29882994 * progress iteration will continue from this point */
@@ -2991,11 +2997,12 @@ ucp_worker_do_keepalive_progress(ucp_worker_h worker)
29912997
29922998 progress_count ++ ;
29932999 worker -> keepalive .ep_count ++ ;
2994- ucp_worker_keepalive_next_ep ( worker );
2995- }
3000+ } while (( worker -> keepalive . ep_count < max_ep_count ) &&
3001+ ( worker -> keepalive . iter != worker -> keepalive . iter_begin ));
29963002
29973003 ucs_trace ("worker %p: sent keepalive on %u endpoints" ,
29983004 worker , worker -> keepalive .ep_count );
3005+ worker -> keepalive .iter_begin = worker -> keepalive .iter ;
29993006 worker -> keepalive .last_round = now ;
30003007 worker -> keepalive .ep_count = 0 ;
30013008 worker -> keepalive .round_count ++ ;
@@ -3053,13 +3060,14 @@ void ucp_worker_keepalive_remove_ep(ucp_ep_h ep)
30533060 return ;
30543061 }
30553062
3056- if (ucs_list_is_only (& worker -> all_eps , & ucp_ep_ext_gen (ep )-> ep_list )) {
3057- /* this is the last EP in worker */
3058- worker -> keepalive .iter = & worker -> all_eps ;
3059- } else if (worker -> keepalive .iter == & ucp_ep_ext_gen (ep )-> ep_list ) {
3060- /* if iterator points into EP to be removed - then
3061- * step to next EP */
3063+ if (worker -> keepalive .iter == & ucp_ep_ext_gen (ep )-> ep_list ) {
3064+ /* Set lane_map=0 to make sure the endpoint won't be selected again */
3065+ worker -> keepalive .lane_map = 0 ;
3066+
30623067 ucp_worker_keepalive_next_ep (worker );
3068+ ucs_assert (worker -> keepalive .iter != & ucp_ep_ext_gen (ep )-> ep_list );
3069+
3070+ worker -> keepalive .iter_begin = worker -> keepalive .iter ;
30633071 }
30643072}
30653073
0 commit comments