@@ -19334,8 +19334,12 @@ typedef int ggml_lock_t;
 
 #endif
 
+#ifdef GGML_NO_OMP
+
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
+
 static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -19401,11 +19405,16 @@ static void clear_numa_thread_affinity(void) {
 
     CPU_FREE(cpus);
 }
+
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
 static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
+
+#endif
+
+
 #endif
 
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
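
For context on the hunks above: when OpenMP drives the threads, placement is left to the OpenMP runtime (e.g. the standard OMP_PROC_BIND / OMP_PLACES environment variables), so the pthread-based NUMA pinning only needs to be compiled in the GGML_NO_OMP build. A minimal standalone sketch, not part of the patch, of the OpenMP-managed model (build with -fopenmp):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        // The OpenMP runtime owns thread placement; omp_get_thread_num()
        // plays the role of the explicit thread index (state->ith) that
        // ggml's pthread path manages by hand.
        #pragma omp parallel
        {
            printf("omp thread %d of %d\n",
                   omp_get_thread_num(), omp_get_num_threads());
        }
        return 0;
    }
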
@@ -19713,7 +19722,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     const int n_threads = state->shared->n_threads;
 
+    #ifdef GGML_NO_OMP
     set_numa_thread_affinity(state->ith);
+    #endif
 
     int node_n = -1;
     int task_phase = GGML_TASK_TYPE_FINALIZE;
@@ -20086,44 +20097,50 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd = 0,
-                .ith = j,
-                .shared = &state_shared,
-                .ec = GGML_STATUS_SUCCESS,
-            };
+    const int64_t perf_start_cycles  = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();
 
+    /* The loop is reversed so that in the NO_OMP case the worker threads
+       start before the main thread (j == 0) begins computing */
+    #pragma omp parallel for shared(workers,state_shared)
+    for (int j = n_threads - 1; 0 <= j; j--) {
+        workers[j] = (struct ggml_compute_state) {
+            .ith = j,
+            .shared = &state_shared,
+            .ec = GGML_STATUS_SUCCESS,
+        };
+
+        #ifdef GGML_NO_OMP
+        if (j == 0)
+        {
+            /* No need to spawn a thread for main */
+            ggml_graph_compute_thread(&workers[j]);
+        }
+        else
+        {
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             GGML_ASSERT(rc == 0);
             UNUSED(rc);
         }
+        #else
+        ggml_graph_compute_thread(&workers[j]);
+        #endif
     }
 
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
+    #ifdef GGML_NO_OMP
+    clear_numa_thread_affinity();
+    #endif
 
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
     enum ggml_status compute_status = workers[0].ec;
 
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
     // join or kill thread pool
-    if (n_threads > 1 ) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 1; j < n_threads; j++) {
+        #ifdef GGML_NO_OMP
+        const int rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == 0);
+        #endif
+        if (workers[j].ec != GGML_STATUS_SUCCESS)
+            compute_status = workers[j].ec;
     }
 
     // performance stats (graph)
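
The hunk above folds the old two-path setup (main thread plus an optional pool) into a single loop that either runs under "#pragma omp parallel for" or, in the GGML_NO_OMP build, spawns pthreads explicitly. Below is a minimal standalone sketch of that dispatch pattern; the names worker_main and N_THREADS are illustrative, not from ggml. Build with -fopenmp, or with -DGGML_NO_OMP -pthread for the fallback:

    #include <pthread.h>
    #include <stdio.h>

    #define N_THREADS 4

    static void * worker_main(void * arg) {
        const int ith = *(const int *)arg;
        printf("worker %d running\n", ith);
        return NULL;
    }

    int main(void) {
        int       ith [N_THREADS];
        pthread_t thrd[N_THREADS];

        // Reversed loop: in the pthread build the workers (j > 0) are
        // spawned first and the main thread takes slot j == 0 last, as in
        // the patch. Without -fopenmp the pragma is ignored and the loop
        // simply runs sequentially.
        #pragma omp parallel for
        for (int j = N_THREADS - 1; 0 <= j; j--) {
            ith[j] = j;
    #ifdef GGML_NO_OMP
            if (j == 0) {
                worker_main(&ith[j]);   // the main thread is a worker too
            } else {
                pthread_create(&thrd[j], NULL, worker_main, &ith[j]);
            }
    #else
            worker_main(&ith[j]);       // each OpenMP thread runs one slot
    #endif
        }

    #ifdef GGML_NO_OMP
        // Only the pthread build has anything to join.
        for (int j = 1; j < N_THREADS; j++) {
            pthread_join(thrd[j], NULL);
        }
    #endif
        return 0;
    }

One assumption the OpenMP path makes: the runtime must actually hand the parallel for one thread per iteration, since each iteration runs a worker to completion. If the runtime provides fewer threads, slots execute sequentially on the same thread, which matters once the workers synchronize with one another as ggml's compute threads do.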