diff --git a/common/common.cpp b/common/common.cpp
index d4e8c7405eb..138424d14ac 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1092,10 +1092,16 @@ common_init_result::common_init_result(common_params & params) :
     auto cparams = common_context_params_to_llama(params);
 
     if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        const char * msg = "%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n";
+        // hack: make sure this message is shown on CLI in case errors occur during fitting
+        if (params.verbosity == LOG_LEVEL_ERROR) {
+            LOG_WRN(msg, __func__);
+        } else {
+            LOG_INF(msg, __func__);
+        }
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
             params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
-            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+            params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
diff --git a/common/log.cpp b/common/log.cpp
index b17d2b62c35..6cf5daa8e50 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -125,6 +125,7 @@ struct common_log {
         file = nullptr;
         prefix = false;
         timestamps = false;
+        buffering = false;
         running = false;
         t_start = t_us();
 
@@ -156,6 +157,7 @@ struct common_log {
 
     bool prefix;
     bool timestamps;
+    bool buffering;
     bool running;
 
     int64_t t_start;
@@ -172,7 +174,7 @@ struct common_log {
     void add(enum ggml_log_level level, const char * fmt, va_list args) {
         std::lock_guard<std::mutex> lock(mtx);
 
-        if (!running) {
+        if (!running && !buffering) {
             // discard messages while the worker thread is paused
             return;
         }
@@ -250,7 +252,7 @@ struct common_log {
     void resume() {
         std::lock_guard<std::mutex> lock(mtx);
 
-        if (running) {
+        if (running || buffering) {
             return;
         }
 
@@ -353,6 +355,26 @@ struct common_log {
 
         this->timestamps = timestamps;
     }
+
+    void set_buffering(bool buffering) {
+        {
+            std::lock_guard<std::mutex> lock(mtx);
+
+            this->buffering = buffering;
+        }
+        if (buffering) {
+            pause();
+        } else {
+            resume();
+        }
+    }
+
+    void drop() {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        head = tail;
+        cv.notify_one();
+    }
 };
 
 //
@@ -412,6 +434,14 @@ void common_log_set_colors(struct common_log * log, log_colors colors) {
     log->set_colors(true);
 }
 
+void common_log_buffering(struct common_log * log, bool buffering) {
+    log->set_buffering(buffering);
+}
+
+void common_log_drop(struct common_log * log) {
+    log->drop();
+}
+
 void common_log_set_prefix(struct common_log * log, bool prefix) {
     log->set_prefix(prefix);
 }
diff --git a/common/log.h b/common/log.h
index f0f8471b5f4..d6681e4dabb 100644
--- a/common/log.h
+++ b/common/log.h
@@ -86,6 +86,22 @@ void common_log_set_prefix    (struct common_log * log, bool prefix);       // w
 void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
 void common_log_flush         (struct common_log * log);                    // flush all pending log messages
 
+// Buffering log messages allows to only write log if we encouter an error later on
+// This is useful for libraries where we want to avoid spamming the user with
+// debug/info messages unless something goes wrong.
+//
+// example:
+//  common_log_buffering(log, true);
+//  ... do stuff ...
+//  if (error) {
+//      common_log_buffering(log, false); // also flushes the log
+//  }
+//  common_log_drop(log);
+//  common_log_buffering(log, false);
+
+void common_log_buffering(struct common_log * log, bool buffering); // not thread-safe
+void common_log_drop     (struct common_log * log);
+
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
 //
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
index 128679d020c..76f71eab3c7 100644
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -161,7 +161,9 @@ struct cli_context {
 int main(int argc, char ** argv) {
     common_params params;
 
-    params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs
+    // by default, less verbose logs
+    auto default_log_lvl = LOG_LEVEL_ERROR;
+    params.verbosity = default_log_lvl;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) {
         return 1;
@@ -173,6 +175,9 @@ int main(int argc, char ** argv) {
         console::error("please use llama-completion instead\n");
     }
 
+    // TODO @ngxson: we need this to have colors in log, will it have any side effects?
+    common_log_set_prefix(common_log_main(), true);
+
     common_init();
 
     // struct that contains llama context and inference
@@ -201,21 +206,43 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-    console::log("\nLoading model... "); // followed by loading animation
-    console::spinner::start();
+    // prepare model loading
+    auto curr_log_level = params.verbosity;
+    bool use_default_log = curr_log_level == default_log_lvl;
+    if (use_default_log) {
+        common_log_buffering(common_log_main(), true);
+        common_log_set_verbosity_thold(LOG_LEVEL_WARN);
+        console::log("\nLoading model... "); // followed by loading animation
+        console::spinner::start();
+    }
+
     if (!ctx_cli.ctx_server.load_model(params)) {
-        console::spinner::stop();
-        console::error("\nFailed to load the model\n");
+        if (use_default_log) {
+            console::error("\n----- ERROR -----\n");
+            console::spinner::stop();
+            console::log("\n");
+            common_log_buffering(common_log_main(), false);
+        }
+        console::error("\nFailed to load the model, see logs above\n");
         return 1;
     }
 
-    console::spinner::stop();
-    console::log("\n");
+    if (use_default_log) {
+        console::spinner::stop();
+        console::log("\n");
+        common_log_set_verbosity_thold(curr_log_level);
+        common_log_drop(common_log_main());
+        common_log_buffering(common_log_main(), false);
+    }
 
+    // start server main loop in a separate thread
     std::thread inference_thread([&ctx_cli]() {
         ctx_cli.ctx_server.start_loop();
     });
 
+    // note: from this point onward, we're having 2 threads
+    // it is unsafe to call certain common_log functions that modify global state
+
     auto inf = ctx_cli.ctx_server.get_info();
     std::string modalities = "text";
     if (inf.has_inp_image) {