@@ -19963,12 +19963,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
 #ifndef GGML_USE_OPENMP
 
+// check if thread is active
 static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
     int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
     return (state->ith < n_threads);
 }
 
+// check if thread is ready to proceed (exit from polling or sleeping)
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
@@ -19984,6 +19986,14 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
     return state->pending;
 }
 
+// sync thread state after polling
+static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
+    // so instead we just use a dummy read-modify-write
+    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+}
+
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
@@ -20008,6 +20018,7 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (ggml_graph_compute_poll_for_work(state)) {
+        ggml_graph_compute_thread_sync(state);
         return state->pending;
     }
 
@@ -20063,7 +20074,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
 // Start processing new graph
 static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
 {
-    // always take the mutex here because the worker threads are doing hybrid poll/wait
+    // Always take the mutex here because the worker threads are doing hybrid poll/wait
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -20072,7 +20083,9 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
     // Update the number of active threads
     atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
 
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
+    // Indicate the graph is ready to be processed
+    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
+    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
 
     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
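
For reference, below is a minimal standalone sketch (not part of this commit; names such as payload, worker, and the zero-initialized counter are hypothetical) of the pattern the change relies on: the kickoff side publishes data with plain stores and then bumps an atomic counter with a seq-cst read-modify-write, while a polling worker that notices the bump via relaxed loads issues a dummy seq-cst fetch_add on the same counter, standing in for atomic_thread_fence(memory_order_seq_cst), before touching the published data. The dummy RMW is used instead of a standalone fence because, as the commit's own comment notes, ThreadSanitizer does not model atomic_thread_fence.

    // build with: cc -std=c11 -pthread sketch.c
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int n_graph = 0;   // bumped once per new graph (kickoff side)
    static int        payload;       // plain data published before the bump

    static void * worker(void * arg) {
        (void) arg;

        // poll with relaxed loads, as in ggml_graph_compute_poll_for_work()
        while (atomic_load_explicit(&n_graph, memory_order_relaxed) == 0) {
            // spin
        }

        // dummy RMW stands in for atomic_thread_fence(memory_order_seq_cst),
        // mirroring ggml_graph_compute_thread_sync(); it synchronizes with the
        // producer's seq-cst increment on the same atomic object
        atomic_fetch_add_explicit(&n_graph, 0, memory_order_seq_cst);

        // the payload store now happens-before this read
        printf("payload = %d\n", payload);
        return NULL;
    }

    int main(void) {
        pthread_t tid;
        pthread_create(&tid, NULL, worker, NULL);

        payload = 42;                                                   // publish data
        atomic_fetch_add_explicit(&n_graph, 1, memory_order_seq_cst);  // then bump the counter

        pthread_join(tid, NULL);
        return 0;
    }

Because the consumer's seq-cst RMW reads from (or after) the producer's seq-cst increment on the same atomic, it acquires the producer's release, so the plain write to payload is guaranteed to be visible; a relaxed increment on the kickoff side would not give the polling worker that guarantee.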