@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // a hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /*prefetch*/ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
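For context on the `/*prefetch*/ 0` argument above: a minimal POSIX sketch of the lazy-mapping strategy it selects, under the assumption that `llama_mmap` wraps `mmap` roughly like this (the helper `map_file_readonly` is hypothetical, not llama.cpp API). With prefetch disabled, pages are faulted in on demand as the quantize loop touches each tensor, rather than being read ahead up front.

```cpp
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Hypothetical sketch: map a file read-only. With prefetch == 0 we skip the
// readahead hint, so the kernel pages data in lazily on first access.
static void * map_file_readonly(const char * path, size_t & size, size_t prefetch) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) { return nullptr; }

    struct stat st;
    if (fstat(fd, &st) != 0) { close(fd); return nullptr; }
    size = (size_t) st.st_size;

    void * addr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd); // the mapping holds its own reference to the file
    if (addr == MAP_FAILED) { return nullptr; }

    if (prefetch > 0) {
        // only when prefetching is requested: hint the kernel to read ahead
        posix_madvise(addr, prefetch < size ? prefetch : size, POSIX_MADV_WILLNEED);
    }
    return addr;
}
```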
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
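The second hunk is where the speedup comes from: when the file is mapped, `tensor->data` can end up pointing directly into the mapping, so the per-tensor copy into `read_data` is skipped entirely. A self-contained sketch of the two paths (`mapped_view`, `tensor_info`, and `tensor_bytes` are illustrative names, not llama.cpp types):

```cpp
#include <cstddef>
#include <cstring>
#include <vector>

struct mapped_view { const unsigned char * base; };   // stand-in for an mmap'd file
struct tensor_info { size_t file_offset; size_t nbytes; };

// Zero-copy when a mapping exists; otherwise copy into a reusable scratch
// buffer that only grows, so the read path allocates at most once per size bump.
static const void * tensor_bytes(const mapped_view * map, const unsigned char * file_data,
                                 const tensor_info & t, std::vector<unsigned char> & scratch) {
    if (map) {
        // mmap path: borrow a pointer into the mapping, no allocation or copy
        return map->base + t.file_offset;
    }
    // read path: mirrors the read_data.resize() pattern from the diff
    if (scratch.size() < t.nbytes) {
        scratch.resize(t.nbytes);
    }
    std::memcpy(scratch.data(), file_data + t.file_offset, t.nbytes);
    return scratch.data();
}
```

Note that `ml.load_data_for(tensor)` still runs in both branches; presumably it points `tensor->data` into the mapping when mmap is active, and otherwise reads from the file into the buffer the caller just attached.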