diff --git a/.gitignore b/.gitignore
index ef0a665..c4cc200 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,6 @@ build/*
 out/*
 
 # debugging files
-*.pdb
\ No newline at end of file
+*.pdb
+
+**/**/__pycache__/*
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index fa7941a..ee5f9a7 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,7 @@
 [submodule "external/imspinner"]
 	path = external/imspinner
 	url = https://github.com/dalerank/imspinner
+[submodule "kolosal-server"]
+	path = kolosal-server
+	url = https://github.com/genta-technology/kolosal-server
+	branch = dev
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2af203c..263118e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -124,6 +124,7 @@ target_include_directories(kolosal_lib PUBLIC
     ${EXTERNAL_DIR}/imspinner
     ${CMAKE_SOURCE_DIR}/include
     ${CMAKE_SOURCE_DIR}/assets
+    ${CMAKE_SOURCE_DIR}/kolosal-server/include
     ${CURL_INCLUDE_DIR}
 )
 
@@ -165,6 +166,9 @@ else()
     )
 endif()
 
+# ==== Kolosal Server Shared Library ====
+add_subdirectory(${CMAKE_SOURCE_DIR}/kolosal-server)
+
 # ==== Main Executable ====
 if (DEBUG)
     add_executable(KolosalDesktop
@@ -178,7 +182,11 @@ else()
     )
 endif()
 
-target_link_libraries(KolosalDesktop PRIVATE kolosal_lib)
+# Link both the engine (kolosal_lib) and the Kolosal server shared library.
+target_link_libraries(KolosalDesktop PRIVATE 
+    kolosal_lib
+    kolosal_server
+)
 
 # ==== Post-Build Commands ====
 # Copy fonts
@@ -219,6 +227,15 @@ add_custom_command(
     "${EXTERNAL_DIR}/curl/bin" "$<TARGET_FILE_DIR:KolosalDesktop>"
 )
 
+# Copy Kolosal Server DLL
+add_custom_command(
+    TARGET KolosalDesktop POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "$<TARGET_FILE:kolosal_server>"
+        "$<TARGET_FILE_DIR:KolosalDesktop>"
+    COMMENT "Copying Kolosal Server DLL to output directory"
+)
+
 # Copy Inference Engine DLLs
 add_custom_command(
     TARGET KolosalDesktop POST_BUILD
diff --git a/CMakeSettings.json b/CMakeSettings.json
index 7a17477..f5ff921 100644
--- a/CMakeSettings.json
+++ b/CMakeSettings.json
@@ -31,8 +31,7 @@
       "cmakeCommandArgs": "-DDEBUG=ON",
       "buildCommandArgs": "",
       "ctestCommandArgs": "",
-      "inheritEnvironments": [ "msvc_x64_x64" ],
-      "variables": []
+      "inheritEnvironments": [ "msvc_x64_x64" ]
     }
   ]
 }
\ No newline at end of file
diff --git a/external/genta-personal/bin/InferenceEngineLib.dll b/external/genta-personal/bin/InferenceEngineLib.dll
index c16b57e..56890bd 100644
Binary files a/external/genta-personal/bin/InferenceEngineLib.dll and b/external/genta-personal/bin/InferenceEngineLib.dll differ
diff --git a/external/genta-personal/bin/InferenceEngineLibVulkan.dll b/external/genta-personal/bin/InferenceEngineLibVulkan.dll
index 7a5ef37..07e8ca2 100644
Binary files a/external/genta-personal/bin/InferenceEngineLibVulkan.dll and b/external/genta-personal/bin/InferenceEngineLibVulkan.dll differ
diff --git a/external/genta-personal/include/inference.h b/external/genta-personal/include/inference.h
index e9b999d..2220251 100644
--- a/external/genta-personal/include/inference.h
+++ b/external/genta-personal/include/inference.h
@@ -34,7 +34,7 @@ class INFERENCE_API InferenceEngine : public IInferenceEngine
 public:
 	explicit InferenceEngine();
 
-	bool loadModel(const char* engineDir, const int mainGpuId = -1);
+	bool loadModel(const char* engineDir, const LoadingParameters lParams, const int mainGpuId = -1);
 
 	bool unloadModel();
 
diff --git a/external/genta-personal/include/inference_interface.h b/external/genta-personal/include/inference_interface.h
index ae5ec3e..541a40d 100644
--- a/external/genta-personal/include/inference_interface.h
+++ b/external/genta-personal/include/inference_interface.h
@@ -17,7 +17,7 @@ class IInferenceEngine {
 public:
     virtual ~IInferenceEngine() = default;
 
-    virtual bool loadModel(const char* engineDir, const int mainGpuId = -1) = 0;
+    virtual bool loadModel(const char* engineDir, const LoadingParameters lParams, const int mainGpuId = -1) = 0;
     virtual bool unloadModel() = 0;
     virtual int submitCompletionsJob(const CompletionParameters& params) = 0;
     virtual int submitChatCompletionsJob(const ChatCompletionParameters& params) = 0;
diff --git a/external/genta-personal/include/job.h b/external/genta-personal/include/job.h
index b22257d..2aeaace 100644
--- a/external/genta-personal/include/job.h
+++ b/external/genta-personal/include/job.h
@@ -9,6 +9,11 @@
 #include <memory>
 #include <exception>
 
+#include "types.h"
+#include "llama.h"
+#include "common.h"
+#include "sampling.h"
+
 struct Job {
     int jobId;
     std::mutex mtx;
@@ -20,6 +25,27 @@ struct Job {
     std::string errorMessage;
     float tps = 0;
     std::atomic<bool> cancelRequested{ false };
+    CompletionParameters params;
+
+    bool isDecodingPrompt = true;
+
+    int n_past;
+    int n_remain;
+    int i_prompt;
+    int n_prompt;
+    size_t n_matching_session_tokens;
+
+    std::vector<llama_token> session_tokens;
+    std::vector<llama_token> embd_inp;
+    std::string path_session;
+    struct common_sampler* smpl = nullptr;
+    int batch_pos = 0;
+
+    ~Job() { // Releases the sampler through common_sampler_free when one is set.
+        if (smpl) { // smpl defaults to nullptr, i.e. no sampler attached
+            common_sampler_free(smpl);
+        }
+    }
 };
 
 #endif // JOB_H
\ No newline at end of file
diff --git a/external/genta-personal/include/types.h b/external/genta-personal/include/types.h
index decd41e..49796ec 100644
--- a/external/genta-personal/include/types.h
+++ b/external/genta-personal/include/types.h
@@ -57,4 +57,16 @@ struct CompletionResult
 	float tps;
 };
 
+struct LoadingParameters // Model-loading options accepted by loadModel(); each field has a default.
+{ // NOTE(review): field names match llama.cpp's common_params; presumably forwarded unchanged — confirm.
+	int n_ctx = 4096;
+	int n_keep = 2048;
+	bool use_mlock = true;
+	bool use_mmap = false;
+	bool cont_batching = true;
+	bool warmup = false;
+	int n_parallel = 1;
+	int n_gpu_layers = 100;
+};
+
 #endif // TYPES_H
\ No newline at end of file
diff --git a/external/genta-personal/lib/InferenceEngineLib.lib b/external/genta-personal/lib/InferenceEngineLib.lib
index 6afa937..86c9cd4 100644
Binary files a/external/genta-personal/lib/InferenceEngineLib.lib and b/external/genta-personal/lib/InferenceEngineLib.lib differ
diff --git a/external/genta-personal/lib/InferenceEngineLibVulkan.lib b/external/genta-personal/lib/InferenceEngineLibVulkan.lib
index ee187ed..d477528 100644
Binary files a/external/genta-personal/lib/InferenceEngineLibVulkan.lib and b/external/genta-personal/lib/InferenceEngineLibVulkan.lib differ
diff --git a/include/config.hpp b/include/config.hpp
index bb882ae..27fef43 100644
--- a/include/config.hpp
+++ b/include/config.hpp
@@ -88,6 +88,13 @@ namespace Config
         constexpr float MAX_SIDEBAR_WIDTH = 400.0F;
     } // namespace ModelSettings
 
+	namespace DeploymentSettingsSidebar // Sidebar width constants; units presumably pixels — confirm.
+	{
+		constexpr float SIDEBAR_WIDTH = 200.0F; // Same value as MIN_SIDEBAR_WIDTH.
+		constexpr float MIN_SIDEBAR_WIDTH = 200.0F;
+		constexpr float MAX_SIDEBAR_WIDTH = 400.0F;
+	} // namespace DeploymentSettingsSidebar
+
     namespace Color
     {
         constexpr ImVec4 TRANSPARENT_COL = ImVec4(0.0F, 0.0F, 0.0F, 0.0F);
diff --git a/include/model/model_loader_config_manager.hpp b/include/model/model_loader_config_manager.hpp
new file mode 100644
index 0000000..b4c153c
--- /dev/null
+++ b/include/model/model_loader_config_manager.hpp
@@ -0,0 +1,113 @@
+#ifndef MODEL_LOADER_CONFIG_MANAGER_HPP
+#define MODEL_LOADER_CONFIG_MANAGER_HPP
+
+#include "model_loader_config_persistence.hpp"
+
+#include <string>
+#include <json.hpp>
+#include <types.h>
+#include <iostream>
+
+namespace Model
+{
+    /**
+     * @brief Class for managing LLM model loading configuration
+     */
+    class ModelLoaderConfigManager {
+    public:
+        /**
+         * @brief Access the shared configuration manager
+         *
+         * The instance is created on the first call, using "model_config.json"
+         * when no path is given. A different non-empty path passed on a later
+         * call is not applied; a warning is written to std::cerr instead.
+         *
+         * @param configFilePath Path to the configuration file (optional)
+         * @return Reference to the singleton instance
+         */
+        static ModelLoaderConfigManager& getInstance(const std::string& configFilePath = "")
+        {
+            static ModelLoaderConfigManager instance(configFilePath.empty() ? "model_config.json" : configFilePath);
+
+            const bool pathIgnored = !configFilePath.empty() && configFilePath != instance.configFilePath_;
+            if (pathIgnored) {
+                std::cerr << "Warning: Config file path '" << configFilePath
+                    << "' is ignored as the instance is already initialized with '"
+                    << instance.configFilePath_ << "'" << std::endl;
+            }
+
+            return instance;
+        }
+
+        // The singleton is neither copyable nor assignable
+        ModelLoaderConfigManager(const ModelLoaderConfigManager&) = delete;
+        ModelLoaderConfigManager& operator=(const ModelLoaderConfigManager&) = delete;
+
+        /**
+         * @brief Read-only view of the configuration held in memory
+         * @return Reference to the current configuration
+         */
+        const LoadingParameters& getConfig() const { return config_; }
+
+        /**
+         * @brief Replace the whole in-memory configuration (nothing is written to disk)
+         * @param config The new configuration
+         */
+        void setConfig(const LoadingParameters& config) { config_ = config; }
+
+        /**
+         * @brief Write the in-memory configuration to the configured file
+         * @return true if successful, false otherwise
+         */
+        bool saveConfig() { return persistence_.saveToFile(config_, configFilePath_); }
+
+        /**
+         * @brief Read the configured file into the in-memory configuration
+         * @return true if successful, false otherwise
+         */
+        bool loadConfig() { return persistence_.loadFromFile(configFilePath_, config_); }
+
+        // Field accessors, one getter/setter pair per LoadingParameters member
+        int getContextSize() const { return config_.n_ctx; }
+        void setContextSize(int size) { config_.n_ctx = size; }
+        int getKeepSize() const { return config_.n_keep; }
+        void setKeepSize(int size) { config_.n_keep = size; }
+        bool getUseMlock() const { return config_.use_mlock; }
+        void setUseMlock(bool use) { config_.use_mlock = use; }
+        bool getUseMmap() const { return config_.use_mmap; }
+        void setUseMmap(bool use) { config_.use_mmap = use; }
+        bool getContinuousBatching() const { return config_.cont_batching; }
+        void setContinuousBatching(bool enable) { config_.cont_batching = enable; }
+        bool getWarmup() const { return config_.warmup; }
+        void setWarmup(bool enable) { config_.warmup = enable; }
+        int getParallelCount() const { return config_.n_parallel; }
+        void setParallelCount(int count) { config_.n_parallel = count; }
+        int getGpuLayers() const { return config_.n_gpu_layers; }
+        void setGpuLayers(int layers) { config_.n_gpu_layers = layers; }
+
+    private:
+        /**
+         * @brief Bind the manager to a file and attempt an initial load
+         * @param configFilePath Path used by saveConfig() and loadConfig()
+         */
+        explicit ModelLoaderConfigManager(const std::string& configFilePath)
+            : configFilePath_(configFilePath) {
+            // A failed initial load is not an error; it is only reported on std::cout
+            const bool loaded = loadConfig();
+            if (!loaded) {
+                std::cout << "Using default configuration values" << std::endl;
+            }
+        }
+
+        LoadingParameters config_;
+        std::string configFilePath_;
+        ModelLoaderConfigPersistence persistence_;
+    };
+
+	inline void initializeModelLoaderConfigManager(const std::string& configFilePath = "") {
+		ModelLoaderConfigManager::getInstance(configFilePath); // called for its effect only; the returned reference is discarded
+	}
+
+} // namespace Model
+
+#endif // MODEL_LOADER_CONFIG_MANAGER_HPP
\ No newline at end of file
diff --git a/include/model/model_loader_config_persistence.hpp b/include/model/model_loader_config_persistence.hpp
new file mode 100644
index 0000000..2ed61ba
--- /dev/null
+++ b/include/model/model_loader_config_persistence.hpp
@@ -0,0 +1,129 @@
+#ifndef MODEL_LOADER_CONFIG_PERSISTENCE_HPP
+#define MODEL_LOADER_CONFIG_PERSISTENCE_HPP
+
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <json.hpp>
+#include <types.h>
+
+namespace Model
+{
+    /**
+     * @brief Saves and loads LoadingParameters as a JSON file.
+     *
+     * Failures are logged to std::cerr and reported through the return value;
+     * std::exception-derived errors are caught inside both public methods.
+     */
+    class ModelLoaderConfigPersistence {
+    public:
+        /**
+         * @brief Save configuration to a JSON file
+         * @param config The model loader configuration
+         * @param filePath Path to save the configuration
+         * @return true if the file was opened and written without a stream error, false otherwise
+         */
+        bool saveToFile(const LoadingParameters& config, const std::string& filePath) const {
+            try {
+                const nlohmann::json j = configToJson(config);
+
+                std::ofstream file(filePath);
+                if (!file.is_open()) {
+                    std::cerr << "Error: Could not open file for writing: " << filePath << std::endl;
+                    return false;
+                }
+
+                file << j.dump(4); // Pretty print with 4 spaces indentation
+                file.close();
+
+                // A failed write or flush is only visible in the stream state.
+                if (file.fail()) {
+                    std::cerr << "Error: Could not write configuration to file: " << filePath << std::endl;
+                    return false;
+                }
+
+                return true;
+            }
+            catch (const std::exception& e) {
+                std::cerr << "Error saving configuration: " << e.what() << std::endl;
+                return false;
+            }
+        }
+
+        /**
+         * @brief Load configuration from a JSON file
+         *
+         * Keys missing from the file keep the value already stored in config.
+         * On failure config is left unchanged.
+         *
+         * @param filePath Path to the configuration file
+         * @param config The configuration to populate
+         * @return true if successful, false otherwise
+         */
+        bool loadFromFile(const std::string& filePath, LoadingParameters& config) const {
+            try {
+                std::ifstream file(filePath);
+                if (!file.is_open()) {
+                    std::cerr << "Error: Could not open file for reading: " << filePath << std::endl;
+                    return false;
+                }
+
+                nlohmann::json j;
+                file >> j;
+                file.close();
+
+                // Fill a copy first: a value of the wrong type throws part-way through,
+                // and the caller's config must not end up half-updated.
+                LoadingParameters parsed = config;
+                jsonToConfig(j, parsed);
+                config = parsed;
+                return true;
+            }
+            catch (const std::exception& e) {
+                std::cerr << "Error loading configuration: " << e.what() << std::endl;
+                return false;
+            }
+        }
+
+    private:
+        /// Builds the JSON object written by saveToFile, one key per LoadingParameters field.
+        static nlohmann::json configToJson(const LoadingParameters& config) {
+            nlohmann::json j;
+
+            j["n_ctx"] = config.n_ctx;
+            j["n_keep"] = config.n_keep;
+            j["use_mlock"] = config.use_mlock;
+            j["use_mmap"] = config.use_mmap;
+            j["cont_batching"] = config.cont_batching;
+            j["warmup"] = config.warmup;
+            j["n_parallel"] = config.n_parallel;
+            j["n_gpu_layers"] = config.n_gpu_layers;
+
+            return j;
+        }
+
+        /// Stores json[key] in target when the key exists; otherwise target is left as is.
+        template <typename T>
+        static void readIfPresent(const nlohmann::json& json, const std::string& key, T& target) {
+            const auto it = json.find(key);
+            if (it != json.end()) {
+                it->get_to(target);
+            }
+        }
+
+        /// Copies every recognised key from json into config; may throw on a type mismatch.
+        static void jsonToConfig(const nlohmann::json& json, LoadingParameters& config) {
+            readIfPresent(json, "n_ctx", config.n_ctx);
+            readIfPresent(json, "n_keep", config.n_keep);
+            readIfPresent(json, "use_mlock", config.use_mlock);
+            readIfPresent(json, "use_mmap", config.use_mmap);
+            readIfPresent(json, "cont_batching", config.cont_batching);
+            readIfPresent(json, "warmup", config.warmup);
+            readIfPresent(json, "n_parallel", config.n_parallel);
+            readIfPresent(json, "n_gpu_layers", config.n_gpu_layers);
+        }
+    };
+} // namespace Model
+
+#endif // MODEL_LOADER_CONFIG_PERSISTENCE_HPP
\ No newline at end of file
diff --git a/include/model/model_manager.hpp b/include/model/model_manager.hpp
index 15bd630..aea172f 100644
--- a/include/model/model_manager.hpp
+++ b/include/model/model_manager.hpp
@@ -2,7 +2,9 @@
 
 #include "preset_manager.hpp"
 #include "model_persistence.hpp"
+#include "model_loader_config_manager.hpp"
 
+#include <kolosal_server.hpp>
 #include <types.h>
 #include <inference_interface.h>
 #include <string>
@@ -249,6 +251,35 @@ namespace Model
 		// Inference Engine
 		//--------------------------------------------------------------------------------------------
 
+        /**
+         * @brief Translate a server chat-completion request into engine parameters.
+         * @param request Chat completion request to convert
+         * @return Parameters with messages, sampling settings and streaming flag copied over
+         */
+        ChatCompletionParameters buildChatCompletionParameters(
+            const ChatCompletionRequest& request) {
+            ChatCompletionParameters result;
+
+            // Plain sampling settings are taken over as-is
+            result.temperature = request.temperature;
+            result.topP = request.top_p;
+            result.streaming = request.stream;
+
+            // The seed keeps its default unless the request carries one
+            if (request.seed.has_value()) {
+                result.randomSeed = request.seed.value();
+            }
+
+            // 1024 new tokens when the request sets no limit
+            result.maxNewTokens = request.max_tokens.has_value() ? request.max_tokens.value() : 1024;
+
+            for (const auto& message : request.messages) {
+                result.messages.push_back({ message.role, message.content });
+            }
+
+            return result;
+        }
+
         ChatCompletionParameters buildChatCompletionParameters(
             const Chat::ChatHistory& currentChat,
             const std::string& userInput
@@ -337,12 +368,6 @@ namespace Model
             return completionParams;
         }
 
-        void setStreamingCallback(std::function<void(const std::string&, const float, const int)> callback)
-        {
-            std::unique_lock<std::shared_mutex> lock(m_mutex);
-            m_streamingCallback = std::move(callback);
-        }
-
         bool stopJob(int jobId)
         {
             std::shared_lock<std::shared_mutex> lock(m_mutex);
@@ -355,7 +380,138 @@ namespace Model
             return true;
         }
 
-        int startCompletionJob(const CompletionParameters& params)
+        /**
+         * @brief Run a completion job and block until it has finished.
+         *
+         * A missing engine, a missing model or a rejected submission is logged to
+         * std::cerr and yields empty text with a tps of 0. A job error is logged
+         * too, but the engine's result for the job is still returned.
+         * @param params Completion parameters forwarded to the inference engine
+         * @return Final result reported by the engine for the job
+         */
+        CompletionResult completeSync(const CompletionParameters& params)
+        {
+            const auto failure = [](const char* reason) {
+                std::cerr << reason;
+                CompletionResult empty;
+                empty.text = "";
+                empty.tps = 0.0F;
+                return empty;
+            };
+
+            {
+                std::shared_lock<std::shared_mutex> lock(m_mutex);
+                if (!m_inferenceEngine) {
+                    return failure("[ModelManager] Inference engine is not initialized.\n");
+                }
+                if (!m_modelLoaded) {
+                    return failure("[ModelManager] No model is currently loaded.\n");
+                }
+            }
+
+            const int jobId = m_inferenceEngine->submitCompletionsJob(params);
+            if (jobId < 0) {
+                return failure("[ModelManager] Failed to submit completions job.\n");
+            }
+
+            // Track the job while it runs
+            {
+                std::unique_lock<std::shared_mutex> lock(m_mutex);
+                m_jobIds.push_back(jobId);
+            }
+
+            // Block until the engine is done, then fetch what it produced
+            m_inferenceEngine->waitForJob(jobId);
+            CompletionResult result = m_inferenceEngine->getJobResult(jobId);
+            if (m_inferenceEngine->hasJobError(jobId)) {
+                std::cerr << "[ModelManager] Error in completion job: "
+                    << m_inferenceEngine->getJobError(jobId) << std::endl;
+            }
+
+            // The job is over: stop tracking it
+            {
+                std::unique_lock<std::shared_mutex> lock(m_mutex);
+                const auto stale = std::remove(m_jobIds.begin(), m_jobIds.end(), jobId);
+                m_jobIds.erase(stale, m_jobIds.end());
+            }
+
+            return result;
+        }
+
+        /**
+         * @brief Run a chat completion job and block until it has finished.
+         *
+         * A missing engine, a missing model or a rejected submission is logged to
+         * std::cerr and yields empty text with a tps of 0. Once the job is done the
+         * chat associated with the job id is saved and the id is released in the
+         * ChatManager; failures of either step are logged only.
+         * @param params Chat completion parameters forwarded to the inference engine
+         * @return Final result reported by the engine for the job
+         */
+        CompletionResult chatCompleteSync(const ChatCompletionParameters& params)
+        {
+            const auto failure = [](const char* reason) {
+                std::cerr << reason;
+                CompletionResult empty;
+                empty.text = "";
+                empty.tps = 0.0F;
+                return empty;
+            };
+
+            {
+                std::shared_lock<std::shared_mutex> lock(m_mutex);
+                if (!m_inferenceEngine) {
+                    return failure("[ModelManager] Inference engine is not initialized.\n");
+                }
+                if (!m_modelLoaded) {
+                    return failure("[ModelManager] No model is currently loaded.\n");
+                }
+            }
+
+            const int jobId = m_inferenceEngine->submitChatCompletionsJob(params);
+            if (jobId < 0) {
+                return failure("[ModelManager] Failed to submit chat completions job.\n");
+            }
+
+            // Track the job while it runs
+            {
+                std::unique_lock<std::shared_mutex> lock(m_mutex);
+                m_jobIds.push_back(jobId);
+            }
+
+            auto& chats = Chat::ChatManager::getInstance();
+
+            // Block until the engine is done, then fetch what it produced
+            m_inferenceEngine->waitForJob(jobId);
+            CompletionResult result = m_inferenceEngine->getJobResult(jobId);
+
+            if (m_inferenceEngine->hasJobError(jobId)) {
+                std::cerr << "[ModelManager] Error in chat completion job: "
+                    << m_inferenceEngine->getJobError(jobId) << std::endl;
+            }
+
+            // The job is over: stop tracking it
+            {
+                std::unique_lock<std::shared_mutex> lock(m_mutex);
+                const auto stale = std::remove(m_jobIds.begin(), m_jobIds.end(), jobId);
+                m_jobIds.erase(stale, m_jobIds.end());
+            }
+
+            // Persist the chat this job belongs to
+            auto chatName = chats.getChatNameByJobId(jobId);
+            if (!chats.saveChat(chatName)) {
+                std::cerr << "[ModelManager] Failed to save chat: " << chatName << std::endl;
+            }
+
+            // Release the job id in the chat manager
+            if (!chats.removeJobId(jobId)) {
+                std::cerr << "[ModelManager] Failed to remove job id from chat manager.\n";
+            }
+
+            return result;
+        }
+
+        int startCompletionJob(const CompletionParameters& params, std::function<void(const std::string&, const float, const int, const bool)> streamingCallback)
         {
             {
                 std::shared_lock<std::shared_mutex> lock(m_mutex);
@@ -377,44 +533,45 @@ namespace Model
                 return -1;
             }
 
-            m_jobIds.push_back(jobId);
+            // Add job ID with proper synchronization
+            {
+                std::unique_lock<std::shared_mutex> lock(m_mutex);
+                m_jobIds.push_back(jobId);
+            }
 
-            std::thread([this, jobId]() {
+            std::thread([this, jobId, streamingCallback]() {
                 // Poll while job is running or until the engine says it's done
-				this->setModelGenerationInProgress(true);
                 while (true)
                 {
                     if (this->m_inferenceEngine->hasJobError(jobId)) break;
 
                     CompletionResult partial = this->m_inferenceEngine->getJobResult(jobId);
+                    bool isFinished = this->m_inferenceEngine->isJobFinished(jobId);
 
                     if (!partial.text.empty()) {
-                        // Call the user�s callback
-                        // (hold shared lock if needed to be thread-safe)
-                        std::shared_lock<std::shared_mutex> lock(m_mutex);
-                        if (m_streamingCallback) {
-                            m_streamingCallback(partial.text, partial.tps, jobId);
+                        // Call the user's callback (no need to lock for the callback)
+                        if (streamingCallback) {
+                            streamingCallback(partial.text, partial.tps, jobId, isFinished);
                         }
                     }
 
-                    if (this->m_inferenceEngine->isJobFinished(jobId)) break;
+                    if (isFinished) break;
 
                     // Sleep briefly to avoid busy-waiting
                     std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 }
 
-				this->setModelGenerationInProgress(false);
-
+                // Remove job ID with proper synchronization
                 {
-                    // remove job id from m_jobIds
-					m_jobIds.erase(std::remove(m_jobIds.begin(), m_jobIds.end(), jobId), m_jobIds.end());
+                    std::unique_lock<std::shared_mutex> lock(m_mutex);
+                    m_jobIds.erase(std::remove(m_jobIds.begin(), m_jobIds.end(), jobId), m_jobIds.end());
                 }
 
-                // Reset jobid tracking on chat manager to -1
+                // Reset jobid tracking on chat manager
                 {
                     if (!Chat::ChatManager::getInstance().removeJobId(jobId))
                     {
-						std::cerr << "[ModelManager] Failed to remove job id from chat manager.\n";
+                        std::cerr << "[ModelManager] Failed to remove job id from chat manager.\n";
                     }
                 }
                 }).detach();
@@ -422,7 +579,7 @@ namespace Model
             return jobId;
         }
 
-        int startChatCompletionJob(const ChatCompletionParameters& params)
+        int startChatCompletionJob(const ChatCompletionParameters& params, std::function<void(const std::string&, const float, const int, const bool)> streamingCallback)
         {
             {
                 std::shared_lock<std::shared_mutex> lock(m_mutex);
@@ -444,11 +601,14 @@ namespace Model
                 return -1;
             }
 
-            m_jobIds.push_back(jobId);
+            // Add job ID with proper synchronization
+            {
+                std::unique_lock<std::shared_mutex> lock(m_mutex);
+                m_jobIds.push_back(jobId);
+            }
 
-            std::thread([this, jobId]() {
+            std::thread([this, jobId, streamingCallback]() {
                 // Poll while job is running or until the engine says it's done
-				this->setModelGenerationInProgress(true);
                 auto& chatManager = Chat::ChatManager::getInstance();
 
                 while (true)
@@ -456,38 +616,37 @@ namespace Model
                     if (this->m_inferenceEngine->hasJobError(jobId)) break;
 
                     CompletionResult partial = this->m_inferenceEngine->getJobResult(jobId);
+                    bool isFinished = this->m_inferenceEngine->isJobFinished(jobId);
 
                     if (!partial.text.empty()) {
-                        // Call the user�s callback
-                        std::shared_lock<std::shared_mutex> lock(m_mutex);
-                        if (m_streamingCallback) {
-                            m_streamingCallback(partial.text, partial.tps, jobId);
+                        // Call the user's callback (no need to lock for the callback)
+                        if (streamingCallback) {
+                            streamingCallback(partial.text, partial.tps, jobId, isFinished);
                         }
                     }
 
-                    if (this->m_inferenceEngine->isJobFinished(jobId)) break;
+                    if (isFinished) break;
 
                     // Sleep briefly to avoid busy-waiting
                     std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 }
 
-				this->setModelGenerationInProgress(false);
-
-				{
-					// remove job id from m_jobIds
-					m_jobIds.erase(std::remove(m_jobIds.begin(), m_jobIds.end(), jobId), m_jobIds.end());
-				}
+                // Remove job ID with proper synchronization
+                {
+                    std::unique_lock<std::shared_mutex> lock(m_mutex);
+                    m_jobIds.erase(std::remove(m_jobIds.begin(), m_jobIds.end(), jobId), m_jobIds.end());
+                }
 
-				// save the chat history
-				{
+                // Save the chat history
+                {
                     auto chatName = chatManager.getChatNameByJobId(jobId);
                     if (!chatManager.saveChat(chatName))
                     {
                         std::cerr << "[ModelManager] Failed to save chat: " << chatName << std::endl;
                     }
-				}
+                }
 
-                // Reset jobid tracking on chat manager to -1
+                // Reset jobid tracking on chat manager
                 {
                     if (!chatManager.removeJobId(jobId))
                     {
@@ -501,6 +660,7 @@ namespace Model
 
         bool isJobFinished(int jobId)
         {
+            std::shared_lock<std::shared_mutex> lock(m_mutex);
             if (!m_inferenceEngine)
             {
                 std::cerr << "[ModelManager] Inference engine is not initialized.\n";
@@ -511,26 +671,29 @@ namespace Model
 
         CompletionResult getJobResult(int jobId)
         {
+            std::shared_lock<std::shared_mutex> lock(m_mutex);
             if (!m_inferenceEngine)
             {
                 std::cerr << "[ModelManager] Inference engine is not initialized.\n";
                 return { {}, "" };
             }
-			return m_inferenceEngine->getJobResult(jobId);
+            return m_inferenceEngine->getJobResult(jobId);
         }
 
         bool hasJobError(int jobId)
         {
+            std::shared_lock<std::shared_mutex> lock(m_mutex);
             if (!m_inferenceEngine)
             {
                 std::cerr << "[ModelManager] Inference engine is not initialized.\n";
                 return true;
             }
-			return m_inferenceEngine->hasJobError(jobId);
+            return m_inferenceEngine->hasJobError(jobId);
         }
 
         std::string getJobError(int jobId)
         {
+            std::shared_lock<std::shared_mutex> lock(m_mutex);
             if (!m_inferenceEngine)
             {
                 std::cerr << "[ModelManager] Inference engine is not initialized.\n";
@@ -539,6 +702,318 @@ namespace Model
             return m_inferenceEngine->getJobError(jobId);
         }
 
+		//--------------------------------------------------------------------------------------------
+        // Server management
+		//--------------------------------------------------------------------------------------------
+
+        bool startServer(const std::string& port) { // Starts the Kolosal server on `port`; returns false if init fails.
+            // Shut down the ServerAPI singleton first so a previously started server is stopped
+            kolosal::ServerAPI::instance().shutdown();
+
+            // Route logging to model_server.log at SERVER_INFO level and announce the start
+            Logger::instance().setLogFile("model_server.log");
+            Logger::instance().setLevel(LogLevel::SERVER_INFO);
+            Logger::logInfo("Starting model server on port %s", port.c_str());
+
+            // Register the non-streaming callback; it forwards to handleNonStreamingRequest
+            kolosal::ServerAPI::instance().setInferenceCallback(
+                [this](const ChatCompletionRequest& request) {
+                    return this->handleNonStreamingRequest(request);
+                }
+            );
+            // Register the per-chunk streaming callback. NOTE(review): both lambdas capture `this`; presumably this object must outlive the server - confirm.
+            kolosal::ServerAPI::instance().setStreamingInferenceCallback(
+                [this](const ChatCompletionRequest& request,
+                    const std::string& requestId,
+                    int chunkIndex,
+                    ChatCompletionChunk& outputChunk) {
+                        return this->handleStreamingRequest(request, requestId, chunkIndex, outputChunk);
+                }
+            );
+
+            // Initialize the server on the requested port; on failure log the error and report false
+            if (!kolosal::ServerAPI::instance().init(port)) {
+                Logger::logError("Failed to start model server");
+                return false;
+            }
+
+            Logger::logInfo("Model server started successfully");
+            return true;
+        }
+
+        void stopServer() { // Logs the stop request, then shuts down the ServerAPI singleton.
+            Logger::logInfo("Stopping model server");
+            kolosal::ServerAPI::instance().shutdown();
+        }
+
+        ChatCompletionResponse handleNonStreamingRequest(const ChatCompletionRequest& request) { // Answers one non-streaming request.
+            // Build the completion parameters from the incoming request.
+            ChatCompletionParameters params = buildChatCompletionParameters(request);
+            // Force non-streaming mode regardless of what the builder produced.
+            params.streaming = false;
+
+            // Run the completion through chatCompleteSync and keep its result.
+            CompletionResult result = chatCompleteSync(params);
+
+            // Map the engine's result to a ChatCompletionResponse.
+            ChatCompletionResponse response = convertToChatResponse(request, result);
+            return response;
+        }
+
+        // Serves chunk `chunkIndex` of a streamed completion: chunk 0 creates the context, submits the
+        // job, starts the poller and emits the role-only chunk; chunk N>0 waits up to 30 s for the N-th
+        // text delta. Returns true while more chunks are expected; false on the last chunk, error or timeout.
+        bool handleStreamingRequest(const ChatCompletionRequest& request, const std::string& requestId,
+            int chunkIndex, ChatCompletionChunk& outputChunk) {
+            // Look up (or create) the StreamingContext for this requestId.
+            std::shared_ptr<StreamingContext> ctx;
+            {
+                std::unique_lock<std::mutex> lock(m_streamContextsMutex);
+                auto it = m_streamingContexts.find(requestId);
+                if (it == m_streamingContexts.end()) {
+                    // For the very first chunk (chunkIndex==0) we create a new context.
+                    if (chunkIndex == 0) {
+                        ctx = std::make_shared<StreamingContext>();
+                        m_streamingContexts[requestId] = ctx;
+                    }
+                    else {
+                        // If no context and chunk index is not zero, something is wrong.
+                        Logger::logError("[ModelManager] Streaming context not found for requestId: %s",
+                            requestId.c_str());
+                        return false;
+                    }
+                }
+                else {
+                    ctx = it->second;
+                }
+            }
+
+            // If this is the first call (chunkIndex 0), start the asynchronous job.
+            if (chunkIndex == 0) {
+                // Build parameters with streaming enabled.
+                ChatCompletionParameters params = buildChatCompletionParameters(request);
+                params.streaming = true;
+
+                // Track the job ID and model name for this request
+                int jobId = -1;
+                // A missing engine is handled like a failed submission: jobId stays negative (cleanup below).
+                {
+                    std::lock_guard<std::mutex> lock(ctx->mtx);
+                    ctx->model = request.model;
+                    ctx->jobId = m_inferenceEngine ? m_inferenceEngine->submitChatCompletionsJob(params) : -1;
+                    jobId = ctx->jobId;
+                }
+
+                if (jobId < 0) {
+                    Logger::logError("[ModelManager] Failed to submit chat completions job for requestId: %s",
+                        requestId.c_str());
+                    {
+                        std::lock_guard<std::mutex> lock(ctx->mtx);
+                        ctx->error = true;
+                        ctx->errorMessage = "Failed to start completion job";
+                        ctx->finished = true;
+                    }
+                    {
+                        std::unique_lock<std::mutex> lock(m_streamContextsMutex);
+                        m_streamingContexts.erase(requestId);
+                    }
+                    return false;
+                }
+
+                // Add job ID with proper synchronization to the global tracking
+                {
+                    std::unique_lock<std::shared_mutex> lock(m_mutex);
+                    m_jobIds.push_back(jobId);
+                }
+
+                // Launch an asynchronous thread that polls the job and accumulates new text.
+                std::thread([this, jobId, requestId, ctx]() {
+                    std::string lastText;
+                    auto startTime = std::chrono::steady_clock::now();
+
+                    try {
+                        while (true) {
+                            // Check if the job has an error
+                            if (this->m_inferenceEngine->hasJobError(jobId)) {
+                                std::string errorMsg = this->m_inferenceEngine->getJobError(jobId);
+                                Logger::logError("[ModelManager] Streaming job error for jobId: %d - %s",
+                                    jobId, errorMsg.c_str());
+                                {
+                                    std::lock_guard<std::mutex> lock(ctx->mtx);
+                                    ctx->error = true;
+                                    ctx->errorMessage = errorMsg;
+                                    ctx->finished = true;
+                                }
+                                ctx->cv.notify_all();
+                                break;
+                            }
+                            std::this_thread::sleep_for(std::chrono::milliseconds(5)); // throttle polling instead of spinning a core
+                            // Read the finished flag BEFORE the text: a job completing between the two calls must not lose its last tokens.
+                            const bool isFinished = this->m_inferenceEngine->isJobFinished(jobId);
+                            CompletionResult partial = this->m_inferenceEngine->getJobResult(jobId);
+
+                            // Compute delta text (only new text since last poll).
+                            std::string newText;
+                            if (partial.text.size() > lastText.size()) {
+                                newText = partial.text.substr(lastText.size());
+                                lastText = partial.text;
+                            }
+
+                            // If we have new text, add it to the chunks
+                            if (!newText.empty()) {
+                                {
+                                    std::lock_guard<std::mutex> lock(ctx->mtx);
+                                    ctx->chunks.push_back(newText);
+                                }
+                                ctx->cv.notify_all();
+                            }
+
+                            // If the job is finished, set the finished flag and break
+                            if (isFinished) {
+                                auto endTime = std::chrono::steady_clock::now();
+                                const long long durationMs = std::chrono::duration_cast<std::chrono::milliseconds>(
+                                    endTime - startTime).count();
+
+                                Logger::logInfo("[ModelManager] Streaming job %d completed in %lld ms",
+                                    jobId, durationMs);
+
+                                {
+                                    std::lock_guard<std::mutex> lock(ctx->mtx);
+                                    ctx->finished = true;
+                                }
+                                ctx->cv.notify_all();
+                                break;
+                            }
+                        }
+                    }
+                    catch (const std::exception& e) {
+                        Logger::logError("[ModelManager] Exception in streaming thread: %s", e.what());
+                        {
+                            std::lock_guard<std::mutex> lock(ctx->mtx);
+                            ctx->error = true;
+                            ctx->errorMessage = e.what();
+                            ctx->finished = true;
+                        }
+                        ctx->cv.notify_all();
+                    }
+
+                    // Clean up job ID tracking
+                    {
+                        std::unique_lock<std::shared_mutex> lock(this->m_mutex);
+                        this->m_jobIds.erase(
+                            std::remove(this->m_jobIds.begin(), this->m_jobIds.end(), jobId),
+                            this->m_jobIds.end());
+                    }
+
+                    // We don't erase the streaming context here - that happens when the last chunk is requested
+                    }).detach();
+            }
+
+            if (chunkIndex == 0) {
+                // First chunk - just send the role (OpenAI format)
+                outputChunk.id = requestId;
+                outputChunk.model = request.model;
+
+                ChatCompletionChunkChoice choice;
+                choice.index = 0;
+                choice.delta.role = "assistant";  // Always "assistant" role for responses
+                choice.delta.content = "";        // Empty content in first chunk (just role)
+                choice.finish_reason = "";        // No finish reason yet
+
+                outputChunk.choices.clear();
+                outputChunk.choices.push_back(choice);
+
+                // More chunks will follow
+                return true;
+            }
+            else {
+                // For chunkIndex > 0, wait for the (chunkIndex-1)-th text chunk or completion
+                std::unique_lock<std::mutex> lock(ctx->mtx);
+
+                // Wait with a timeout for better responsiveness
+                bool result = ctx->cv.wait_for(lock, std::chrono::seconds(30), [ctx, chunkIndex]() {
+                    return (ctx->chunks.size() >= static_cast<size_t>(chunkIndex)) ||
+                        ctx->finished || ctx->error;
+                    });
+
+                if (!result) {
+                    // If we timed out
+                    Logger::logError("[ModelManager] Timeout waiting for chunk %d for requestId %s",
+                        chunkIndex, requestId.c_str());
+
+                    // Clean up and return error
+                    std::unique_lock<std::mutex> glock(m_streamContextsMutex);
+                    m_streamingContexts.erase(requestId);
+                    return false;
+                }
+
+                // If an error occurred, clean up the context and signal termination
+                if (ctx->error) {
+                    Logger::logError("[ModelManager] Error in streaming job for requestId %s: %s",
+                        requestId.c_str(), ctx->errorMessage.c_str());
+
+                    std::unique_lock<std::mutex> glock(m_streamContextsMutex);
+                    m_streamingContexts.erase(requestId);
+                    return false;
+                }
+
+                // If job is finished but we don't have this chunk, send a final chunk
+                if (ctx->chunks.size() < static_cast<size_t>(chunkIndex) && ctx->finished) {
+                    outputChunk.id = requestId;
+                    outputChunk.model = ctx->model;
+
+                    ChatCompletionChunkChoice choice;
+                    choice.index = 0;
+                    choice.delta.content = "";       // Empty content
+                    choice.finish_reason = "stop";   // Mark as final chunk
+
+                    outputChunk.choices.clear();
+                    outputChunk.choices.push_back(choice);
+
+                    // Clean up the context
+                    {
+                        std::unique_lock<std::mutex> glock(m_streamContextsMutex);
+                        m_streamingContexts.erase(requestId);
+                    }
+
+                    return false; // No more chunks to send
+                }
+
+                // Get the content for this chunk
+                std::string chunkContent = ctx->chunks[chunkIndex - 1];
+                outputChunk.id = requestId;
+                outputChunk.model = ctx->model;
+
+                ChatCompletionChunkChoice choice;
+                choice.index = 0;
+                choice.delta.content = chunkContent;
+                choice.finish_reason = "";
+
+                outputChunk.choices.clear();
+                outputChunk.choices.push_back(choice);
+
+                // Check if this is the last chunk
+                bool isLastChunk = ctx->finished && (ctx->chunks.size() == static_cast<size_t>(chunkIndex));
+
+                if (isLastChunk) {
+                    // Set finish reason for the last content chunk
+                    choice.finish_reason = "stop";
+                    outputChunk.choices[0] = choice;
+
+                    // Clean up the context
+                    {
+                        std::unique_lock<std::mutex> glock(m_streamContextsMutex);
+                        m_streamingContexts.erase(requestId);
+                    }
+
+                    return false; // No more chunks to send
+                }
+
+                // More chunks to come
+                return true;
+            }
+        }
+
         std::string getCurrentVariantForModel(const std::string& modelName) const 
         {
             auto it = m_modelVariantMap.find(modelName);
@@ -600,7 +1075,6 @@ namespace Model
 
 		bool setModelGenerationInProgress(bool inProgress)
 		{
-			std::unique_lock<std::shared_mutex> lock(m_mutex);
 			m_modelGenerationInProgress = inProgress;
 			return true;
 		}
@@ -1210,7 +1684,8 @@ namespace Model
             // Launch heavy loading in async task
             return std::async(std::launch::async, [this, modelDir]() {
                 try {
-                    bool success = m_inferenceEngine->loadModel(modelDir->c_str());
+                    bool success = m_inferenceEngine->loadModel(modelDir->c_str(),
+                        ModelLoaderConfigManager::getInstance().getConfig());
 
                     {
                         std::unique_lock<std::shared_mutex> lock(m_mutex);
@@ -1273,7 +1748,13 @@ namespace Model
 
         void stopAllJobs()
         {
-            for (auto jobId : m_jobIds)
+            std::vector<int> jobIdsCopy;
+            {
+                std::shared_lock<std::shared_mutex> lock(m_mutex);
+                jobIdsCopy = m_jobIds;
+            }
+
+            for (auto jobId : jobIdsCopy)
             {
                 stopJob(jobId);
             }
@@ -1296,6 +1777,30 @@ namespace Model
             }
         }
 
+        static ChatCompletionResponse convertToChatResponse( // Wraps an engine result in a single-choice response.
+            const ChatCompletionRequest& request, const CompletionResult& result)
+        {
+            ChatCompletionResponse response;
+            response.model = request.model; // echo the model name from the request
+
+            ChatCompletionChoice choice;
+            choice.index = 0;
+            choice.message.role = "assistant";
+            choice.message.content = result.text;
+            // The finish reason is hard-coded to "stop": the result is assumed to be complete.
+            choice.finish_reason = "stop";
+
+            response.choices.push_back(choice);
+            // Usage is a rough estimate, not a tokenizer count; prompt tokens are always reported as 0.
+            response.usage.prompt_tokens = 0;
+            response.usage.completion_tokens =
+                static_cast<int>(result.text.size() / 5); // text length divided by 5 (integer division)
+            response.usage.total_tokens =
+                response.usage.prompt_tokens + response.usage.completion_tokens;
+
+            return response;
+        }
+
         mutable std::shared_mutex                       m_mutex;
         std::unique_ptr<IModelPersistence>              m_persistence;
         std::vector<ModelData>                          m_models;
@@ -1324,7 +1829,20 @@ namespace Model
 
         IInferenceEngine* m_inferenceEngine = nullptr;
 
-		std::function<void(const std::string&, const float, const int)> m_streamingCallback;
+		// Server related
+        struct StreamingContext { // Per-request state shared by handleStreamingRequest calls and the polling thread they start
+            std::mutex mtx;                  // held by handleStreamingRequest and its poller while touching the fields below
+            std::condition_variable cv;      // notified after a chunk is appended or finished/error is set
+            std::vector<std::string> chunks; // text deltas in the order the poller appended them
+            std::string model;        // Model name copied from the request
+            int jobId = -1;           // Inference job ID; stays -1 until a job has been submitted
+            std::string errorMessage; // Error details, written together with `error`
+            bool finished = false;    // set when the job completed or failed
+            bool error = false;       // set when the job reported an error or the poller threw
+        };
+        std::mutex m_streamContextsMutex;
+        std::unordered_map<std::string, std::shared_ptr<StreamingContext>>
+            m_streamingContexts;
     };
 
     inline void initializeModelManager()
diff --git a/include/model/server_state_manager.hpp b/include/model/server_state_manager.hpp
new file mode 100644
index 0000000..c58f77b
--- /dev/null
+++ b/include/model/server_state_manager.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "model_manager.hpp"
+
+#include <string>
+#include <functional>
+#include <optional>
+
+class ServerStateManager { // Singleton holding server UI state: running flag, port, and a params-changed flag
+public:
+    static ServerStateManager& getInstance() { // Returns the one function-local static instance
+        static ServerStateManager instance;
+        return instance;
+    }
+
+    // Server status: plain flag accessors; nothing here starts or stops a server
+    bool isServerRunning() const { return m_serverRunning; }
+    void setServerRunning(bool running) { m_serverRunning = running; }
+
+    // Server port (the constructor sets it to 8080)
+    int getServerPort() const { return m_serverPort; }
+    void setServerPort(int port) { m_serverPort = port; }
+
+    // Get port as string for display and connection purposes
+    std::string getServerPortString() const {
+        return std::to_string(m_serverPort);
+    }
+
+    // Model state observers: thin pass-throughs to the Model::ModelManager singleton
+    bool isModelLoadInProgress() const {
+        return Model::ModelManager::getInstance().isLoadInProgress();
+    }
+
+    bool isModelLoaded() const {
+        return Model::ModelManager::getInstance().isModelLoaded();
+    }
+
+    std::optional<std::string> getCurrentModelName() const {
+        return Model::ModelManager::getInstance().getCurrentModelName();
+    }
+
+    // Model parameters change tracking: a flag callers set and reset themselves
+    bool haveModelParamsChanged() const { return m_modelParamsChanged; }
+    void setModelParamsChanged(bool changed) { m_modelParamsChanged = changed; }
+    void resetModelParamsChanged() { m_modelParamsChanged = false; }
+
+private:
+    ServerStateManager() : m_serverRunning(false), m_serverPort(8080), m_modelParamsChanged(false) {} // private: use getInstance()
+    // NOTE(review): the members below are unsynchronized plain values - confirm they are only touched from one thread
+    bool m_serverRunning;
+    int m_serverPort;
+    bool m_modelParamsChanged;
+};
\ No newline at end of file
diff --git a/include/ui/chat/chat_history.hpp b/include/ui/chat/chat_history.hpp
index 15f762d..63688e5 100644
--- a/include/ui/chat/chat_history.hpp
+++ b/include/ui/chat/chat_history.hpp
@@ -198,6 +198,36 @@ class ChatHistoryRenderer {
         ImGui::EndGroup();
     }
 
+    static void chatStreamingCallback(const std::string& partialOutput, const float tps, const int jobId, const bool isFinished) { // Writes a job's streamed output into its chat
+        auto& chatManager = Chat::ChatManager::getInstance();
+        auto& modelManager = Model::ModelManager::getInstance();
+        std::string chatName = chatManager.getChatNameByJobId(jobId);
+        // Clear the generation-in-progress flag once the job reports completion
+        if (isFinished) modelManager.setModelGenerationInProgress(false);
+        // Work on a copy of the chat; nothing is written when the chat cannot be found
+        auto chatOpt = chatManager.getChat(chatName);
+        if (chatOpt) {
+            Chat::ChatHistory chat = chatOpt.value();
+            if (!chat.messages.empty() && chat.messages.back().role == "assistant") {
+                // Overwrite (not append to) the existing assistant message - presumably partialOutput is cumulative; confirm
+                chat.messages.back().content = partialOutput;
+                chat.messages.back().tps = tps;
+                chatManager.updateChat(chatName, chat);
+            }
+            else {
+                // No trailing assistant message yet: create one with the next 1-based id
+                Chat::Message assistantMsg;
+                assistantMsg.id = static_cast<int>(chat.messages.size()) + 1;
+                assistantMsg.role = "assistant";
+                assistantMsg.content = partialOutput;
+                assistantMsg.tps = tps;
+                assistantMsg.modelName = modelManager.getCurrentModelName().value_or("idk") + " | " // "idk" is the fallback when no model name is available
+                    + modelManager.getCurrentVariantType();
+                chatManager.addMessage(chatName, assistantMsg);
+            }
+        }
+    }
+
     void regenerateResponse(int index) {
         Model::ModelManager& modelManager = Model::ModelManager::getInstance();
         Chat::ChatManager& chatManager = Chat::ChatManager::getInstance();
@@ -282,10 +312,12 @@ class ChatHistoryRenderer {
             chatManager.getCurrentChat().value()
         );
 
-        int jobId = modelManager.startChatCompletionJob(completionParams);
+        int jobId = modelManager.startChatCompletionJob(completionParams, chatStreamingCallback);
         if (!chatManager.setCurrentJobId(jobId)) {
             std::cerr << "[ChatSection] Failed to set the current job ID.\n";
         }
+
+        modelManager.setModelGenerationInProgress(true);
     }
 
     void renderMetadata(const Chat::Message& msg, int index, float bubbleWidth, float bubblePadding)
diff --git a/include/ui/chat/chat_window.hpp b/include/ui/chat/chat_window.hpp
index a59e38f..48ea786 100644
--- a/include/ui/chat/chat_window.hpp
+++ b/include/ui/chat/chat_window.hpp
@@ -249,6 +249,36 @@ class ChatWindow {
     }
 
 private:
+    static void chatStreamingCallback(const std::string& partialOutput, const float tps, const int jobId, const bool isFinished) { // Writes a job's streamed output into its chat
+        auto& chatManager = Chat::ChatManager::getInstance();
+        auto& modelManager = Model::ModelManager::getInstance();
+        std::string chatName = chatManager.getChatNameByJobId(jobId);
+        // Clear the generation-in-progress flag once the job reports completion
+        if (isFinished) modelManager.setModelGenerationInProgress(false);
+        // Work on a copy of the chat; nothing is written when the chat cannot be found
+        auto chatOpt = chatManager.getChat(chatName);
+        if (chatOpt) {
+            Chat::ChatHistory chat = chatOpt.value();
+            if (!chat.messages.empty() && chat.messages.back().role == "assistant") {
+                // Overwrite (not append to) the existing assistant message - presumably partialOutput is cumulative; confirm
+                chat.messages.back().content = partialOutput;
+                chat.messages.back().tps = tps;
+                chatManager.updateChat(chatName, chat);
+            }
+            else {
+                // No trailing assistant message yet: create one with the next 1-based id
+                Chat::Message assistantMsg;
+                assistantMsg.id = static_cast<int>(chat.messages.size()) + 1;
+                assistantMsg.role = "assistant";
+                assistantMsg.content = partialOutput;
+                assistantMsg.tps = tps;
+                assistantMsg.modelName = modelManager.getCurrentModelName().value_or("idk") + " | " // "idk" is the fallback when no model name is available
+                    + modelManager.getCurrentVariantType();
+                chatManager.addMessage(chatName, assistantMsg);
+            }
+        }
+    }
+
     // Render the row of buttons that allow the user to switch models or clear chat.
     void renderChatFeatureButtons(float baseX, float baseY) {
 		Model::ModelManager& modelManager = Model::ModelManager::getInstance();
@@ -256,12 +286,19 @@ class ChatWindow {
         // Update the open-model manager button�s label dynamically.
         openModelManagerConfig.label =
             modelManager.getCurrentModelName().value_or("Select Model");
+		openModelManagerConfig.tooltip =
+			modelManager.getCurrentModelName().value_or("Select Model");
 
         if (modelManager.isLoadInProgress())
         {
             openModelManagerConfig.label = "Loading Model...";
         }
 
+        if (modelManager.isModelLoaded())
+        {
+			openModelManagerConfig.icon = ICON_CI_SPARKLE_FILLED;
+        }
+
         std::vector<ButtonConfig> buttons = { openModelManagerConfig, clearChatButtonConfig };
         Button::renderGroup(buttons, baseX, baseY);
 
@@ -296,10 +333,12 @@ class ChatWindow {
             buildChatCompletionParameters(currentChat, message);
 
         auto& modelManager = Model::ModelManager::getInstance();
-        int jobId = modelManager.startChatCompletionJob(completionParams);
+        int jobId = modelManager.startChatCompletionJob(completionParams, chatStreamingCallback);
         if (!chatManager.setCurrentJobId(jobId)) {
             std::cerr << "[ChatSection] Failed to set the current job ID.\n";
         }
+
+        modelManager.setModelGenerationInProgress(true);
     }
 
     InputFieldConfig createInputFieldConfig(
diff --git a/include/ui/chat/model_manager_modal.hpp b/include/ui/chat/model_manager_modal.hpp
index 7640498..fe93abd 100644
--- a/include/ui/chat/model_manager_modal.hpp
+++ b/include/ui/chat/model_manager_modal.hpp
@@ -270,6 +270,7 @@ class ModelCardRenderer {
             btnConfig.onClick = [variant, this]() {
                 Model::ModelManager::getInstance().setPreferredVariant(m_model.name, variant);
                 };
+			ImGui::SetCursorPosX(ImGui::GetCursorPosX() + 4);
             Button::render(btnConfig);
 
             ImGui::SameLine(0.0f, 4.0f);
@@ -280,6 +281,7 @@ class ModelCardRenderer {
             variantLabel.fontType = FontsManager::REGULAR;
             variantLabel.fontSize = FontsManager::SM;
             variantLabel.alignment = Alignment::LEFT;
+			ImGui::SetCursorPosY(ImGui::GetCursorPosY() - 6);
             Label::render(variantLabel);
             };
 
diff --git a/include/ui/server/deployment_settings.hpp b/include/ui/server/deployment_settings.hpp
new file mode 100644
index 0000000..f01e019
--- /dev/null
+++ b/include/ui/server/deployment_settings.hpp
@@ -0,0 +1,285 @@
+#pragma once
+
+#include "imgui.h"
+#include "ui/widgets.hpp"
+#include "model/model_loader_config_manager.hpp"
+
+#include <IconsCodicons.h>
+#include <string>
+#include <functional>
+
+namespace DeploymentSettingsConstants {
+    // Window flags of the deployment sidebar: fixed in place, undecorated,
+    // drawn without a background and without a scrollbar of its own.
+    constexpr ImGuiWindowFlags SidebarFlags =
+        ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoCollapse |
+        ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar |
+        ImGuiWindowFlags_NoBackground;
+} // namespace DeploymentSettingsConstants
+
+/// Sidebar section that edits the model-loader parameters (context size, keep
+/// size, GPU layers, memory flags, parallel count, batching and warmup). Each
+/// accepted edit is saved immediately and flagged on ServerStateManager.
+class ModelLoaderSettingsComponent {
+public:
+    // Takes sidebarWidth by reference to always reflect the current width.
+    // Explicit, so a float lvalue is never converted into a component implicitly.
+    explicit ModelLoaderSettingsComponent(float& sidebarWidth)
+        : m_sidebarWidth(sidebarWidth)
+    {
+        // Initialize labels
+        m_contextSizeLabel = createLabel("Context Size", ICON_CI_BRACKET);
+        m_gpuLayersLabel = createLabel("GPU Layers", ICON_CI_CHIP);
+        m_systemSettingsLabel = createLabel("System Settings", ICON_CI_SERVER);
+        m_optimizationLabel = createLabel("Optimization Settings", ICON_CI_DASHBOARD);
+    }
+
+    /// Draws every model-loader widget and writes back whatever the user changed.
+    void render() {
+        auto& configManager = Model::ModelLoaderConfigManager::getInstance();
+        auto& serverState = ServerStateManager::getInstance();
+
+        const float sliderWidth = m_sidebarWidth - 30;
+
+        // Shared tail of every edit: auto-save, then mark the params as changed.
+        const auto commit = [&configManager, &serverState]() {
+            configManager.saveConfig();
+            serverState.setModelParamsChanged(true);
+        };
+
+        // n_ctx slider (context size)
+        renderIntSlider("##n_ctx", configManager.getContextSize(), 1024.0f, 16384.0f, sliderWidth,
+            [&configManager, commit](int value) {
+                configManager.setContextSize(value);
+                commit();
+            });
+
+        // n_keep slider (keep size); its upper bound is the current context size
+        renderIntSlider("##n_keep", configManager.getKeepSize(), 0.0f,
+            static_cast<float>(configManager.getContextSize()), sliderWidth,
+            [&configManager, commit](int value) {
+                configManager.setKeepSize(value);
+                commit();
+            });
+
+        // n_gpu_layers slider
+        renderIntSlider("##n_gpu_layers", configManager.getGpuLayers(), 0.0f, 100.0f, sliderWidth,
+            [&configManager, commit](int value) {
+                configManager.setGpuLayers(value);
+                commit();
+            });
+
+        // use_mlock checkbox
+        renderCheckbox("Memory Lock", "##use_mlock", configManager.getUseMlock(),
+            [&configManager, commit](bool value) {
+                configManager.setUseMlock(value);
+                commit();
+            },
+            "Locks memory to prevent swapping to disk");
+
+        // use_mmap checkbox
+        renderCheckbox("Memory Map", "##use_mmap", configManager.getUseMmap(),
+            [&configManager, commit](bool value) {
+                configManager.setUseMmap(value);
+                commit();
+            },
+            "Use memory mapping for model weights");
+
+        // n_parallel input; non-positive entries (e.g. a cleared field) are
+        // ignored so an unusable parallel count is never persisted.
+        ImGui::Spacing();
+        int n_parallel = configManager.getParallelCount();
+        IntInputField::render("##n_parallel", n_parallel, sliderWidth);
+        if (n_parallel > 0 && n_parallel != configManager.getParallelCount()) {
+            configManager.setParallelCount(n_parallel);
+            commit();
+        }
+
+        // cont_batching checkbox
+        renderCheckbox("Continuous Batching", "##cont_batching", configManager.getContinuousBatching(),
+            [&configManager, commit](bool value) {
+                configManager.setContinuousBatching(value);
+                commit();
+            },
+            "Enable continuous batching for better performance");
+
+        // warmup checkbox
+        renderCheckbox("Warmup", "##warmup", configManager.getWarmup(),
+            [&configManager, commit](bool value) {
+                configManager.setWarmup(value);
+                commit();
+            },
+            "Run model warmup at initialization");
+    }
+
+private:
+    float& m_sidebarWidth;
+    // Section labels built by the constructor; render() does not draw them.
+    LabelConfig m_contextSizeLabel;
+    LabelConfig m_gpuLayersLabel;
+    LabelConfig m_systemSettingsLabel;
+    LabelConfig m_optimizationLabel;
+
+    /// Builds a bold label config with the given text and icon.
+    LabelConfig createLabel(const std::string& text, const std::string& icon) {
+        LabelConfig label;
+        label.id = "##" + text + "_label";
+        label.label = text;
+        label.icon = icon;
+        label.size = ImVec2(Config::Icon::DEFAULT_FONT_SIZE, 0);
+        label.fontType = FontsManager::BOLD;
+        return label;
+    }
+
+    /// Renders a slider for an integer setting. Slider::render works on floats,
+    /// so the value is widened for display and truncated back to int afterwards;
+    /// onChange runs only when the truncated value differs from `current`.
+    void renderIntSlider(const char* id, int current, float minValue, float maxValue, float width, const std::function<void(int)>& onChange) {
+        float sliderValue = static_cast<float>(current);
+        Slider::render(id, sliderValue, minValue, maxValue, width, "%.0f");
+        const int updated = static_cast<int>(sliderValue);
+        if (updated != current) {
+            onChange(updated);
+        }
+    }
+
+    /// Renders a 24x24 toggle button plus text label; a click calls onChange(!value).
+    void renderCheckbox(const std::string& label, const std::string& id, bool value, std::function<void(bool)> onChange, const std::string& tooltip = "") {
+        ImGui::SetCursorPosX(ImGui::GetCursorPosX() + 5.0f);
+        ImGui::SetCursorPosY(ImGui::GetCursorPosY() + 8.0f);
+
+        ButtonConfig btnConfig;
+        btnConfig.id = id;
+        btnConfig.icon = value ? ICON_CI_CHECK : ICON_CI_CLOSE;
+        btnConfig.textColor = value ? ImVec4(1, 1, 1, 1) : ImVec4(0.6f, 0.6f, 0.6f, 1.0f);
+        btnConfig.fontSize = FontsManager::SM;
+        btnConfig.size = ImVec2(24, 24);
+        btnConfig.backgroundColor = value ? Config::Color::PRIMARY : RGBAToImVec4(60, 60, 60, 255);
+        btnConfig.hoverColor = value ? RGBAToImVec4(53, 132, 228, 255) : RGBAToImVec4(80, 80, 80, 255);
+        btnConfig.activeColor = value ? RGBAToImVec4(26, 95, 180, 255) : RGBAToImVec4(100, 100, 100, 255);
+        btnConfig.onClick = [value, onChange]() { onChange(!value); };
+        if (!tooltip.empty()) { btnConfig.tooltip = tooltip; }
+        Button::render(btnConfig);
+
+        ImGui::SameLine(0.0f, 8.0f);
+        LabelConfig labelConfig;
+        labelConfig.id = id + "_label";
+        labelConfig.label = label;
+        labelConfig.size = ImVec2(0, 0);
+        labelConfig.fontType = FontsManager::REGULAR;
+        labelConfig.fontSize = FontsManager::MD;
+        labelConfig.alignment = Alignment::LEFT;
+        ImGui::SetCursorPosY(ImGui::GetCursorPosY() - 8.0f);
+        Label::render(labelConfig);
+
+        ImGui::Spacing();
+    }
+};
+
+/// Sidebar section showing the server status and the editable listening port.
+class ServerSettingsComponent {
+public:
+    // sidebarWidth is held by reference so the input width tracks the sidebar.
+    // Explicit, so a float lvalue is never converted into a component implicitly.
+    explicit ServerSettingsComponent(float& sidebarWidth)
+        : m_sidebarWidth(sidebarWidth)
+    {
+        m_serverSettingsLabel = createLabel("Server Settings", ICON_CI_SERVER);
+    }
+
+    /// Draws the status line and the port input (disabled while the server runs).
+    void render() {
+        auto& serverState = ServerStateManager::getInstance();
+        const float sliderWidth = m_sidebarWidth - 30;
+        // Queried once so the status text and the disabled state agree within a frame.
+        const bool running = serverState.isServerRunning();
+
+        // Server status indicator: green "Running" or orange "Stopped"
+        ImGui::SetCursorPosX(ImGui::GetCursorPosX() + 5.0f);
+        ImGui::TextUnformatted("Status:");
+        ImGui::SameLine();
+        ImGui::PushStyleColor(ImGuiCol_Text,
+            running ? ImVec4(0.0f, 1.0f, 0.0f, 1.0f) : ImVec4(1.0f, 0.5f, 0.0f, 1.0f));
+        ImGui::TextUnformatted(running ? "Running" : "Stopped");
+        ImGui::PopStyleColor();
+        ImGui::Spacing();
+        ImGui::Separator();
+        ImGui::Spacing();
+
+        int port = serverState.getServerPort();
+
+        // Make the port input read-only if server is running
+        ImGui::BeginDisabled(running);
+        IntInputField::render("##server_port", port, sliderWidth);
+        if (port != serverState.getServerPort() && port > 0 && port <= 65535) {
+            serverState.setServerPort(port);
+        }
+
+        ImGui::EndDisabled();
+
+        ImGui::Spacing();
+        ImGui::Spacing();
+    }
+
+private:
+    float& m_sidebarWidth;
+    LabelConfig m_serverSettingsLabel; // built in the constructor; render() does not draw it
+
+    /// Builds a bold label config with the given text and icon.
+    LabelConfig createLabel(const std::string& text, const std::string& icon) {
+        LabelConfig label;
+        label.id = "##" + text + "_label";
+        label.label = text;
+        label.icon = icon;
+        label.size = ImVec2(Config::Icon::DEFAULT_FONT_SIZE, 0);
+        label.fontType = FontsManager::BOLD;
+        return label;
+    }
+};
+
+/// Right-hand sidebar of the server tab. Its components keep references to
+/// m_width, so an instance must stay where it was constructed (no copies).
+class DeploymentSettingsSidebar {
+public:
+    DeploymentSettingsSidebar() :
+        m_width(Config::DeploymentSettingsSidebar::SIDEBAR_WIDTH),
+        m_modelLoaderSettingsComponent(m_width),
+        m_serverSettingsComponent(m_width) {
+    }
+
+    /// Pins the window to the right edge and renders both settings components.
+    void render() {
+        // Vertical gap between the title bar and the top of the sidebar.
+        constexpr float topOffset = 40.0f;
+        ImGuiIO& io = ImGui::GetIO();
+        // Subtract the gap too, otherwise the window ends topOffset px below the display.
+        const float sidebarHeight = io.DisplaySize.y - Config::TITLE_BAR_HEIGHT - topOffset;
+
+        // Right sidebar window
+        ImGui::SetNextWindowPos(ImVec2(io.DisplaySize.x - m_width, Config::TITLE_BAR_HEIGHT + topOffset), ImGuiCond_Always);
+        ImGui::SetNextWindowSize(ImVec2(m_width, sidebarHeight), ImGuiCond_Always);
+        ImGui::SetNextWindowSizeConstraints(
+            ImVec2(Config::DeploymentSettingsSidebar::MIN_SIDEBAR_WIDTH, sidebarHeight),
+            ImVec2(Config::DeploymentSettingsSidebar::MAX_SIDEBAR_WIDTH, sidebarHeight)
+        );
+
+        ImGui::Begin("Deployment Settings", nullptr, DeploymentSettingsConstants::SidebarFlags);
+        // Update the current sidebar width
+        m_width = ImGui::GetWindowSize().x;
+
+        // Scrollable content area: server settings first, then model loader settings
+        ImGui::BeginChild("##deployment_settings_content", ImVec2(0, 0), false, ImGuiWindowFlags_None);
+        m_serverSettingsComponent.render();
+        m_modelLoaderSettingsComponent.render();
+        ImGui::EndChild();
+
+        ImGui::End();
+    }
+
+    float getWidth() const { return m_width; }
+
+private:
+    float m_width = 0.0F; // declared first: the components below bind references to it
+    ModelLoaderSettingsComponent m_modelLoaderSettingsComponent;
+    ServerSettingsComponent m_serverSettingsComponent;
+};
\ No newline at end of file
diff --git a/include/ui/server/server_logs.hpp b/include/ui/server/server_logs.hpp
new file mode 100644
index 0000000..e9e7c22
--- /dev/null
+++ b/include/ui/server/server_logs.hpp
@@ -0,0 +1,269 @@
+#pragma once
+
+#include "imgui.h"
+#include "ui/widgets.hpp"
+#include "ui/chat/model_manager_modal.hpp"
+#include "model/model_manager.hpp"
+#include "model/server_state_manager.hpp"
+
+#include <IconsCodicons.h>
+
+/// Main panel of the server tab: a control bar (start/stop, model selection,
+/// optional reload, endpoint display) above a read-only view of the server log.
+class ServerLogViewer {
+public:
+    ServerLogViewer()
+        : m_logBuffer(kLogPlaceholder)
+        , m_lastLogUpdate(std::chrono::steady_clock::now()) {
+    }
+
+    ~ServerLogViewer() {
+        // Make sure to stop the server on destruction
+        if (ServerStateManager::getInstance().isServerRunning()) {
+            Model::ModelManager::getInstance().stopServer();
+        }
+    }
+
+    /// Renders the control bar and log view left of a sidebar that is sidebarWidth px wide.
+    void render(const float sidebarWidth) {
+        ImGuiIO& io = ImGui::GetIO();
+        Model::ModelManager& modelManager = Model::ModelManager::getInstance();
+        ServerStateManager& serverState = ServerStateManager::getInstance();
+
+        ImGuiWindowFlags window_flags = ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize |
+            ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoCollapse |
+            ImGuiWindowFlags_NoBringToFrontOnFocus | ImGuiWindowFlags_NoScrollbar | ImGuiWindowFlags_NoBackground;
+
+        ImGui::PushStyleVar(ImGuiStyleVar_WindowBorderSize, 0.0F);
+        ImGui::SetNextWindowPos(ImVec2(0, Config::TITLE_BAR_HEIGHT), ImGuiCond_Always);
+        ImGui::SetNextWindowSize(ImVec2(io.DisplaySize.x - sidebarWidth, io.DisplaySize.y - Config::TITLE_BAR_HEIGHT), ImGuiCond_Always);
+        ImGui::Begin("Server Logs", nullptr, window_flags);
+        ImGui::PopStyleVar();
+
+        // Top bar with controls
+        {
+            // Start/Stop server button (appearance follows the state at the start of the frame)
+            const bool running = serverState.isServerRunning();
+            ButtonConfig serverButtonConfig;
+            serverButtonConfig.id = "##server_toggle_button";
+            serverButtonConfig.label = running ? "Stop Server" : "Start Server";
+            serverButtonConfig.icon = running ? ICON_CI_DEBUG_STOP : ICON_CI_RUN;
+            serverButtonConfig.tooltip = running ? "Stop the server" : "Start the server";
+            serverButtonConfig.size = ImVec2(150, 0);
+            serverButtonConfig.alignment = Alignment::CENTER;
+            serverButtonConfig.onClick = [this, &modelManager, &serverState]() {
+                toggleServer(modelManager, serverState);
+                };
+
+            // Model selection button (label and tooltip share one lookup)
+            const std::string modelLabel = serverState.getCurrentModelName().value_or("Select Model");
+            ButtonConfig selectModelButtonConfig;
+            selectModelButtonConfig.id = "##server_select_model_button";
+            selectModelButtonConfig.label = modelLabel;
+            selectModelButtonConfig.tooltip = modelLabel;
+            selectModelButtonConfig.icon = ICON_CI_SPARKLE;
+            selectModelButtonConfig.size = ImVec2(180, 0);
+            selectModelButtonConfig.alignment = Alignment::CENTER;
+            selectModelButtonConfig.onClick = [this]() {
+                m_modelManagerModalOpen = true;
+                };
+
+            if (serverState.isModelLoadInProgress()) {
+                selectModelButtonConfig.label = "Loading Model...";
+                serverButtonConfig.state = ButtonState::DISABLED;
+            }
+
+            if (serverState.isModelLoaded()) {
+                selectModelButtonConfig.icon = ICON_CI_SPARKLE_FILLED;
+            }
+            else {
+                serverButtonConfig.state = ButtonState::DISABLED; // Can't start server without model
+            }
+
+            std::vector<ButtonConfig> buttonConfigs = { serverButtonConfig, selectModelButtonConfig };
+
+            // Add reload button if model params have changed
+            if (serverState.haveModelParamsChanged() && serverState.isModelLoaded()) {
+                ButtonConfig reloadModelButtonConfig;
+                reloadModelButtonConfig.id = "##reload_model_button";
+                reloadModelButtonConfig.icon = ICON_CI_REFRESH;
+                reloadModelButtonConfig.tooltip = "Reload model with new parameters";
+                reloadModelButtonConfig.size = ImVec2(24, 24);
+                reloadModelButtonConfig.alignment = Alignment::CENTER;
+                reloadModelButtonConfig.backgroundColor = ImVec4(0.2f, 0.2f, 0.2f, 1.0f);
+                reloadModelButtonConfig.onClick = [&modelManager, &serverState]() {
+                    // value() would throw std::bad_optional_access without a model name
+                    const auto modelName = modelManager.getCurrentModelName();
+                    if (!modelName) {
+                        return;
+                    }
+                    modelManager.switchModel(*modelName, modelManager.getCurrentVariantType());
+                    serverState.resetModelParamsChanged();
+                    };
+
+                // Disable the reload button if server is running or model is loading
+                if (running || serverState.isModelLoadInProgress()) {
+                    reloadModelButtonConfig.state = ButtonState::DISABLED;
+                }
+
+                buttonConfigs.push_back(reloadModelButtonConfig);
+            }
+
+            Button::renderGroup(buttonConfigs, ImGui::GetCursorPosX(), ImGui::GetCursorPosY());
+
+            // Show API endpoint info if server is running (queried again: a click above may have toggled it)
+            if (serverState.isServerRunning()) {
+                ImGui::SameLine();
+
+                ImGui::SetCursorPosX(ImGui::GetCursorPosX() + 40);
+
+                ImGui::TextUnformatted("API Endpoint:");
+                ImGui::SameLine();
+
+                std::string endpoint = "http://localhost:" + serverState.getServerPortString() + "/v1/chat/completions";
+                ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.4f, 0.8f, 1.0f, 1.0f));
+                ImGui::TextUnformatted(endpoint.c_str());
+                ImGui::PopStyleColor();
+
+                ImGui::SameLine();
+                ImGui::SetCursorPosY(ImGui::GetCursorPosY() - 2);
+                ButtonConfig copyButtonConfig;
+                copyButtonConfig.id = "##copy_endpoint_button";
+                copyButtonConfig.icon = ICON_CI_COPY;
+                copyButtonConfig.tooltip = "Copy endpoint to clipboard";
+                copyButtonConfig.size = ImVec2(24, 24);
+                copyButtonConfig.onClick = [endpoint]() {
+                    ImGui::SetClipboardText(endpoint.c_str());
+                    };
+
+                Button::render(copyButtonConfig);
+            }
+
+            m_modelManagerModal.render(m_modelManagerModalOpen);
+        }
+
+        ImGui::SetCursorPosY(ImGui::GetCursorPosY() + 12);
+
+        // Update log buffer from kolosal::Logger
+        updateLogBuffer();
+
+        // Log display area
+        {
+            InputFieldConfig input_cfg(
+                "##server_log_input",
+                ImVec2(-FLT_MIN, -FLT_MIN),
+                m_logBuffer,
+                m_isLogFocused
+            );
+
+            input_cfg.frameRounding = 4.0f;
+            input_cfg.flags = ImGuiInputTextFlags_ReadOnly;
+            input_cfg.backgroundColor = ImVec4(0.2f, 0.2f, 0.2f, 0.5f);
+            InputField::renderMultiline(input_cfg);
+
+            // Auto-scroll to bottom
+            if (ImGui::GetScrollY() >= ImGui::GetScrollMaxY() - 20.0f) {
+                ImGui::SetScrollHereY(1.0f);
+            }
+        }
+
+        ImGui::End();
+    }
+
+private:
+    // Text shown in the log view until the first entry arrives.
+    static constexpr const char* kLogPlaceholder = "Server logs will be displayed here.";
+
+    bool m_isLogFocused = false;
+    std::string m_logBuffer;
+    size_t m_lastLogIndex = 0; // number of Logger entries already copied into m_logBuffer
+    std::chrono::steady_clock::time_point m_lastLogUpdate;
+
+    ModelManagerModal m_modelManagerModal;
+    bool m_modelManagerModalOpen = false;
+
+    /// Stops a running server, or starts it on the configured port when a model is loaded.
+    void toggleServer(Model::ModelManager& modelManager, ServerStateManager& serverState) {
+        if (serverState.isServerRunning()) {
+            // Stop the server
+            modelManager.stopServer();
+            serverState.setServerRunning(false);
+        }
+        else {
+            // Start the server
+            if (serverState.isModelLoaded()) {
+                if (modelManager.startServer(serverState.getServerPortString())) {
+                    serverState.setServerRunning(true);
+                    addToLogBuffer("Server started on port " + serverState.getServerPortString());
+                }
+                else {
+                    addToLogBuffer("Failed to start server on port " + serverState.getServerPortString());
+                }
+            }
+            else {
+                addToLogBuffer("Error: Cannot start server without a loaded model");
+            }
+        }
+    }
+
+    /// Appends Logger entries not yet shown to the buffer, at most once every 100 ms.
+    void updateLogBuffer() {
+        // Check if it's time to update (limit updates to reduce performance impact)
+        auto now = std::chrono::steady_clock::now();
+        if (std::chrono::duration_cast<std::chrono::milliseconds>(now - m_lastLogUpdate).count() < 100) {
+            return;
+        }
+        m_lastLogUpdate = now;
+
+        // Get logs from the kolosal::Logger
+        const auto& logs = Logger::instance().getLogs();
+
+        // History shorter than what was consumed: start over. NOTE(review): confirm Logger can shrink.
+        if (logs.size() < m_lastLogIndex) {
+            m_lastLogIndex = 0;
+        }
+
+        // Append every entry that has not been shown yet
+        for (size_t i = m_lastLogIndex; i < logs.size(); i++) {
+            const auto& entry = logs[i];
+            std::string levelPrefix;
+
+            switch (entry.level) {
+            case LogLevel::SERVER_ERROR:   levelPrefix = "[ERROR] ";   break;
+            case LogLevel::SERVER_WARNING: levelPrefix = "[WARNING] "; break;
+            case LogLevel::SERVER_INFO:    levelPrefix = "[INFO] ";    break;
+            case LogLevel::SERVER_DEBUG:   levelPrefix = "[DEBUG] ";   break;
+            default:                       levelPrefix = "[LOG] ";     break;
+            }
+
+            addToLogBuffer(levelPrefix + entry.message);
+        }
+
+        m_lastLogIndex = logs.size();
+    }
+
+    /// Appends one line to the log buffer, prefixed with the local time as [HH:MM:SS].
+    void addToLogBuffer(const std::string& message) {
+        // Add timestamp
+        const auto now = std::chrono::system_clock::now();
+        const std::time_t nowTime = std::chrono::system_clock::to_time_t(now);
+        const std::tm* localNow = std::localtime(&nowTime);
+
+        // localtime can return nullptr; the entry is then written without a timestamp.
+        char timestamp[32] = "";
+        if (localNow != nullptr) {
+            std::strftime(timestamp, sizeof(timestamp), "[%H:%M:%S] ", localNow);
+        }
+
+        // The first real entry replaces the placeholder; later ones start a new line
+        if (m_logBuffer == kLogPlaceholder) {
+            m_logBuffer.clear();
+        }
+        else if (!m_logBuffer.empty()) {
+            m_logBuffer += "\n";
+        }
+
+        m_logBuffer += timestamp;
+        m_logBuffer += message;
+    }
+};
\ No newline at end of file
diff --git a/include/ui/tab_manager.hpp b/include/ui/tab_manager.hpp
index 48f14a2..16f0bb2 100644
--- a/include/ui/tab_manager.hpp
+++ b/include/ui/tab_manager.hpp
@@ -1,11 +1,14 @@
 #pragma once
 
+#include "IconsCodicons.h"
+
 #include "ui/chat/chat_history_sidebar.hpp"
 #include "ui/chat/preset_sidebar.hpp"
 #include "ui/chat/chat_window.hpp"
+#include "ui/server/server_logs.hpp"
+#include "ui/server/deployment_settings.hpp"
 
 #include "chat/chat_manager.hpp"
-
 #include "model/model_manager.hpp"
 
 #include <memory>
@@ -17,6 +20,8 @@ class ITab {
     virtual void render() = 0;
     virtual void onActivate() = 0;
     virtual void onDeactivate() = 0;
+    virtual const char* getTitle() const = 0; // tab name; the title bar uses it for the tab button's ID and tooltip
+    virtual const char* getIcon() const = 0; // icon string shown on the tab's button in the title bar
 };
 
 // Update ChatTab to implement the new methods
@@ -27,42 +32,8 @@ class ChatTab : public ITab {
     {
     }
 
-    void onActivate() override {
-		Model::ModelManager& modelManager = Model::ModelManager::getInstance();
-
-        modelManager.setStreamingCallback(
-            [&modelManager](const std::string& partialOutput, const float tps, const int jobId) {
-                auto& chatManager = Chat::ChatManager::getInstance();
-                std::string chatName = chatManager.getChatNameByJobId(jobId);
-
-                auto chatOpt = chatManager.getChat(chatName);
-                if (chatOpt) {
-                    Chat::ChatHistory chat = chatOpt.value();
-                    if (!chat.messages.empty() && chat.messages.back().role == "assistant") {
-                        // Append to existing assistant message
-                        chat.messages.back().content = partialOutput;
-                        chat.messages.back().tps = tps;
-                        chatManager.updateChat(chatName, chat);
-                    }
-                    else {
-                        // Create new assistant message
-                        Chat::Message assistantMsg;
-                        assistantMsg.id = static_cast<int>(chat.messages.size()) + 1;
-                        assistantMsg.role = "assistant";
-                        assistantMsg.content = partialOutput;
-                        assistantMsg.tps = tps;
-						assistantMsg.modelName = modelManager.getCurrentModelName().value_or("idk") + " | " 
-                            + modelManager.getCurrentVariantType();
-                        chatManager.addMessage(chatName, assistantMsg);
-                    }
-                }
-            }
-        );
-    }
-
-    void onDeactivate() override {
-        Model::ModelManager::getInstance().setStreamingCallback(nullptr);
-    }
+    void onActivate() override {} // intentionally empty: no work is done on activation
+    void onDeactivate() override {} // intentionally empty: no work is done on deactivation
 
     void render() override {
         chatHistorySidebar.render();
@@ -73,12 +44,43 @@ class ChatTab : public ITab {
         );
     }
 
+    // Title of the Chat tab (the title bar uses it for the tab button's ID and tooltip)
+    const char* getTitle() const override { return "Chat"; }
+
+    // Icon shown on the Chat tab's button in the title bar
+    const char* getIcon() const override { return ICON_CI_COMMENT_DISCUSSION; }
+
 private:
     ChatHistorySidebar chatHistorySidebar;
     ModelPresetSidebar modelPresetSidebar;
     ChatWindow chatWindow;
 };
 
+class ServerTab : public ITab { // Tab combining the server log viewer with the deployment settings sidebar
+public:
+	ServerTab() : serverLogViewer(), deploymentSettingsSidebar()
+    {
+    }
+
+    void onActivate() override {}   // intentionally empty
+    void onDeactivate() override {} // intentionally empty
+    // Render the sidebar first, then give the log viewer the sidebar's width
+    void render() override {
+        deploymentSettingsSidebar.render();
+        serverLogViewer.render(deploymentSettingsSidebar.getWidth());
+    }
+
+    // Return a title for the Server tab
+    const char* getTitle() const override { return "Server"; }
+
+    // Return the icon for the Server tab
+    const char* getIcon() const override { return ICON_CI_SERVER_PROCESS; }
+
+private:
+    ServerLogViewer serverLogViewer;
+	DeploymentSettingsSidebar deploymentSettingsSidebar;
+};
+
 // Update TabManager to handle tab activation/deactivation
 class TabManager {
 public:
@@ -110,6 +112,10 @@ class TabManager {
         }
     }
 
+    ITab* getTab(size_t index) const { return tabs.at(index).get(); } // non-owning pointer; at() throws std::out_of_range for a bad index
+    size_t getTabCount() const { return tabs.size(); } // number of registered tabs (top-level const on a by-value return had no effect)
+    size_t getCurrentActiveTabIndex() const { return activeTabIndex; } // index of the currently active tab
+
 private:
     std::vector<std::unique_ptr<ITab>> tabs;
     size_t activeTabIndex;
diff --git a/include/ui/title_bar.hpp b/include/ui/title_bar.hpp
index 2e4e2d9..84a82b2 100644
--- a/include/ui/title_bar.hpp
+++ b/include/ui/title_bar.hpp
@@ -7,6 +7,9 @@
 #include "stb_image.h"
 #include "resource.h"
 
+#include "tab_manager.hpp"
+#include "widgets.hpp"
+
 GLuint LoadTextureFromFile(const char* filename)
 {
     int width, height, channels;
@@ -38,14 +41,14 @@ GLuint LoadTextureFromFile(const char* filename)
     return texture;
 }
 
-void titleBar(void* handler)
+void titleBar(void* handler, TabManager& tabManager)
 {
 #ifdef _WIN32
-	// Cast the HWND
-	HWND hwnd = static_cast<HWND>(handler);
+    // Cast the HWND
+    HWND hwnd = static_cast<HWND>(handler);
 #else
-	// Cast the XID
-	XID xid = static_cast<XID>(handler);
+    // Cast the XID
+    XID xid = static_cast<XID>(handler);
 #endif
 
     ImGuiIO& io = ImGui::GetIO();
@@ -83,6 +86,56 @@ void titleBar(void* handler)
         }
     }
 
+    ImGui::SetCursorPosX(ImGui::GetCursorPosX() + 16.0f);
+
+    // Render a button for each available tab
+    {
+        std::vector<ButtonConfig> buttonConfigs;
+        for (size_t i = 0; i < tabManager.getTabCount(); ++i)
+        {
+            ButtonConfig tabButtonConfig;
+            tabButtonConfig.id = "##" + std::string(tabManager.getTab(i)->getTitle());
+            tabButtonConfig.icon = tabManager.getTab(i)->getIcon();
+            tabButtonConfig.size = ImVec2(24, 0);
+            tabButtonConfig.onClick = [i, &tabManager]() { tabManager.switchTab(i); };
+            tabButtonConfig.tooltip = tabManager.getTab(i)->getTitle();
+            if (tabManager.getCurrentActiveTabIndex() == i)
+            {
+                tabButtonConfig.state = ButtonState::ACTIVE;
+            }
+            else
+            {
+                tabButtonConfig.textColor = ImVec4(0.7f, 0.7f, 0.7f, 0.7f);
+            }
+
+            buttonConfigs.push_back(tabButtonConfig);
+        }
+
+        // Calculate background dimensions. The count is converted to float before
+        // subtracting, so fewer than two tabs cannot wrap the unsigned size().
+        const float tabCount = static_cast<float>(buttonConfigs.size());
+        float buttonHeight = 16.0f;
+        float totalWidth = tabCount * 24.0f + (tabCount - 2.0f) * 10.0f + 6.0f;
+        float padding = 6.0f;
+
+        // Calculate background position and size
+        ImVec2 pos = ImVec2(ImGui::GetCursorPosX(), ImGui::GetCursorPosY());
+        ImVec2 size = ImVec2(totalWidth + padding * 2, buttonHeight + padding * 2);
+
+        // Draw the background
+        ImDrawList* drawList = ImGui::GetWindowDrawList();
+        drawList->AddRectFilled(
+            ImVec2(pos.x - padding, pos.y - padding),
+            ImVec2(pos.x + size.x, pos.y + size.y),
+            ImGui::ColorConvertFloat4ToU32(ImVec4(0.3f, 0.3f, 0.3f, 0.3f)),
+            8.0f
+        );
+
+        // Render the buttons
+        Button::renderGroup(buttonConfigs, pos.x, pos.y);
+        ImGui::SameLine();
+    }
+
     // Title Bar Buttons
     {
         float buttonWidth = 45.0f; // Adjust as needed
diff --git a/include/ui/widgets.hpp b/include/ui/widgets.hpp
index ef36447..c2fc39e 100644
--- a/include/ui/widgets.hpp
+++ b/include/ui/widgets.hpp
@@ -765,8 +765,6 @@ namespace Slider
         renderLabel.erase(std::remove(renderLabel.begin(), renderLabel.end(), '#'), renderLabel.end());
         std::replace(renderLabel.begin(), renderLabel.end(), '_', ' ');
 
-        // Apply horizontal padding and render label
-        ImGui::SetCursorPosX(ImGui::GetCursorPosX() + paddingX);
         LabelConfig labelConfig;
         labelConfig.id = label;
         labelConfig.label = renderLabel;
@@ -861,8 +859,6 @@ namespace IntInputField
         renderLabel.erase(std::remove(renderLabel.begin(), renderLabel.end(), '#'), renderLabel.end());
         std::replace(renderLabel.begin(), renderLabel.end(), '_', ' ');
 
-        // Apply horizontal padding and render label
-        ImGui::SetCursorPosX(ImGui::GetCursorPosX() + paddingX);
         LabelConfig labelConfig;
         labelConfig.id = label;
         labelConfig.label = renderLabel;
diff --git a/include/window/win32_window.hpp b/include/window/win32_window.hpp
index 5f566f3..f42b600 100644
--- a/include/window/win32_window.hpp
+++ b/include/window/win32_window.hpp
@@ -39,11 +39,12 @@ class Win32Window : public Window {
         }
     }
 
-    void createWindow(int width, int height, const std::string& title) override
+    void createWindow(int width, int height, const std::string& title, const float tabButtonWidths) override
     {
         this->width = width;
         this->height = height;
         this->title = title;
+        this->tabButtonWidths = tabButtonWidths;
 
         hwnd = create_window(&Win32Window::WndProc, hInstance, this);
         if (!hwnd) {
@@ -169,6 +170,7 @@ class Win32Window : public Window {
     int height;
     std::string title;
     bool should_close;
+    float tabButtonWidths;
 
     // Borderless window specific
     bool borderless;
@@ -311,7 +313,8 @@ class Win32Window : public Window {
         }
 
         if ((cursor.y >= window.top && cursor.y < window.top + Config::TITLE_BAR_HEIGHT) &&
-            (cursor.x <= window.right - 45 * 3)) {
+            ((cursor.x <= window.right - 45 * 3 && cursor.x >= window.left + /* logo width */ 40 + /* gap between logo and tab buttons */ 16 + this->tabButtonWidths) ||
+                cursor.x <= window.left + /* logo width */ 40 + /* gap between logo and tab buttons */ 16)) {
             return HTCAPTION;
         }
 
diff --git a/include/window/window.hpp b/include/window/window.hpp
index f34cfac..5147db0 100644
--- a/include/window/window.hpp
+++ b/include/window/window.hpp
@@ -5,7 +5,7 @@
 class Window {
 public:
     virtual ~Window() = default;
-    virtual void createWindow(int width, int height, const std::string& title) = 0;
+    virtual void createWindow(int width, int height, const std::string& title, const float tabButtonWidths) = 0;
     virtual void show() = 0;
     virtual void processEvents() = 0;
     virtual bool shouldClose() = 0;
diff --git a/installer/script.nsi b/installer/script.nsi
index 5aa2ea5..128f600 100644
--- a/installer/script.nsi
+++ b/installer/script.nsi
@@ -18,13 +18,13 @@ Var DefaultChatDir
 ;-----------------------------------
 ; Embed version info (metadata)
 ;-----------------------------------
-VIProductVersion "0.1.1.0"
+VIProductVersion "0.1.6.0"
 VIAddVersionKey "ProductName" "Kolosal AI Installer"
 VIAddVersionKey "CompanyName" "Genta Technology"
 VIAddVersionKey "FileDescription" "Kolosal AI Installer"
 VIAddVersionKey "LegalCopyright" "Copyright (C) 2025"
-VIAddVersionKey "FileVersion" "0.1.1.0"
-VIAddVersionKey "ProductVersion" "0.1.1.0"
+VIAddVersionKey "FileVersion" "0.1.6.0"
+VIAddVersionKey "ProductVersion" "0.1.6.0"
 VIAddVersionKey "OriginalFilename" "KolosalAI_Installer.exe"
 VIAddVersionKey "Comments" "Installer for Kolosal AI"
 VIAddVersionKey "Publisher" "Genta Technology"
@@ -105,6 +105,9 @@ FunctionEnd
 ; Installation Section
 ;-----------------------------------
 Section "Kolosal AI" SecKolosalAI
+  ; Force overwrite of existing files so that EXE and DLL files are always replaced
+  SetOverwrite on
+  
   SetOutPath "$INSTDIR"
   
   ; Set write permissions
diff --git a/kolosal-server b/kolosal-server
new file mode 160000
index 0000000..ea06fc2
--- /dev/null
+++ b/kolosal-server
@@ -0,0 +1 @@
+Subproject commit ea06fc2ad047fc0143e7b0f24f6e46398398a0b6
diff --git a/models/qwen2.5-0.5b.json b/models/qwen2.5-0.5b.json
index 45632b2..07feabb 100644
--- a/models/qwen2.5-0.5b.json
+++ b/models/qwen2.5-0.5b.json
@@ -1,9 +1,9 @@
 {
-  "name": "Qwen 2.5 0.5B",
+  "name": "Qwen2.5 0.5B",
   "author": "Alibaba",
   "fullPrecision": {
     "type": "Full Precision",
-    "path": "models/qwen2.5-0.5b/int4/Qwen2.5-0.5B-Instruct-f16.gguf",
+    "path": "models/qwen2.5-0.5b/fp16/Qwen2.5-0.5B-Instruct-f16.gguf",
     "downloadLink": "https://huggingface.co/kolosal/qwen2.5-0.5b/resolve/main/Qwen2.5-0.5B-Instruct-f16.gguf",
     "isDownloaded": false,
     "downloadProgress": 0.0,
@@ -19,7 +19,7 @@
   },
   "quantized4Bit": {
     "type": "4-bit Quantized",
-    "path": "models/qwen2.5-0.5b/fp16/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf",
+    "path": "models/qwen2.5-0.5b/int4/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf",
     "downloadLink": "https://huggingface.co/kolosal/qwen2.5-0.5b/resolve/main/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf",
     "isDownloaded": false,
     "downloadProgress": 0.0,
diff --git a/models/qwen2.5-1.5b.json b/models/qwen2.5-1.5b.json
index d798701..f9fc696 100644
--- a/models/qwen2.5-1.5b.json
+++ b/models/qwen2.5-1.5b.json
@@ -1,5 +1,5 @@
 {
-  "name": "Qwen 2.5 1.5B",
+  "name": "Qwen2.5 1.5B",
   "author": "Alibaba",
   "fullPrecision": {
     "type": "Full Precision",
diff --git a/models/qwen2.5-14b.json b/models/qwen2.5-14b.json
index 31becb2..777490b 100644
--- a/models/qwen2.5-14b.json
+++ b/models/qwen2.5-14b.json
@@ -1,5 +1,5 @@
 {
-  "name": "Qwen 2.5 14B",
+  "name": "Qwen2.5 14B",
   "author": "Alibaba",
   "fullPrecision": {
     "type": "Full Precision",
diff --git a/models/qwen2.5-3b.json b/models/qwen2.5-3b.json
index cccadf7..095c6dc 100644
--- a/models/qwen2.5-3b.json
+++ b/models/qwen2.5-3b.json
@@ -1,5 +1,5 @@
 {
-  "name": "Qwen 2.5 3B",
+  "name": "Qwen2.5 3B",
   "author": "Alibaba",
   "fullPrecision": {
     "type": "Full Precision",
diff --git a/models/qwen2.5-7b.json b/models/qwen2.5-7b.json
index a5e87ca..90eaf06 100644
--- a/models/qwen2.5-7b.json
+++ b/models/qwen2.5-7b.json
@@ -1,5 +1,5 @@
 {
-  "name": "Qwen 2.5 7B",
+  "name": "Qwen2.5 7B",
   "author": "Alibaba",
   "fullPrecision": {
     "type": "Full Precision",
diff --git a/server-test/python/openai_test.py b/server-test/python/openai_test.py
new file mode 100644
index 0000000..879c4eb
--- /dev/null
+++ b/server-test/python/openai_test.py
@@ -0,0 +1,32 @@
+import openai
+import os
+
+# Configure the OpenAI client to talk to the local server endpoint
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1",
+    api_key="sk-dummy"  # Using dummy API key as in the curl example
+)
+
+print("Starting streaming request...\n")
+
+# Make a streaming request
+stream = client.chat.completions.create(
+    model="claude-3-opus-20240229",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Why anything to the power of zero is 1?"}
+    ],
+    stream=True
+)
+
+# Print each content delta as it arrives and accumulate the full text
+print("Streaming response:")
+full_response = ""
+for chunk in stream:
+    # A chunk may carry no choices (e.g. a usage-only chunk); skip it instead of raising IndexError
+    content = chunk.choices[0].delta.content if chunk.choices else None
+    if content is not None:
+        full_response += content
+        print(content, end="", flush=True)
+
+print("\n\nFull response:", full_response)
diff --git a/server-test/python/requirements.txt b/server-test/python/requirements.txt
new file mode 100644
index 0000000..ec838c5
--- /dev/null
+++ b/server-test/python/requirements.txt
@@ -0,0 +1 @@
+openai
diff --git a/source/main.cpp b/source/main.cpp
index 4d06824..82a64cc 100644
--- a/source/main.cpp
+++ b/source/main.cpp
@@ -14,6 +14,7 @@
 #include "chat/chat_manager.hpp"
 #include "model/preset_manager.hpp"
 #include "model/model_manager.hpp"
+#include "model/model_loader_config_manager.hpp"
 
 #include "nfd.h"
 
@@ -156,9 +157,15 @@ class Application
 public:
     Application()
     {
+        // Create the TabManager first and register the Chat and Server tabs; createWindow below needs the tab count
+        tabManager = std::make_unique<TabManager>();
+        tabManager->addTab(std::make_unique<ChatTab>());
+        tabManager->addTab(std::make_unique<ServerTab>());
+
         // Create and show the window
         window = WindowFactory::createWindow();
-        window->createWindow(Config::WINDOW_WIDTH, Config::WINDOW_HEIGHT, Config::WINDOW_TITLE);
+        window->createWindow(Config::WINDOW_WIDTH, Config::WINDOW_HEIGHT, Config::WINDOW_TITLE,
+            tabManager->getTabCount() * 24.0f + (tabManager->getTabCount() - 2) * 10.0f + 6.0f + 12.0f);
         window->show();
 
         // Create and initialize the OpenGL context
@@ -175,6 +182,7 @@ class Application
         Chat::initializeChatManager();
         Model::initializePresetManager();
         Model::initializeModelManager();
+		Model::initializeModelLoaderConfigManager("model_loader_config.json");
 
         // Initialize Native File Dialog
         NFD_Init();
@@ -188,10 +196,6 @@ class Application
 
         // Create the window state transition manager
         transitionManager = std::make_unique<WindowStateTransitionManager>(*window);
-
-        // Initialize the TabManager and add the ChatTab (other tabs can be added similarly)
-        tabManager = std::make_unique<TabManager>();
-        tabManager->addTab(std::make_unique<ChatTab>());
     }
 
     int run()
@@ -208,7 +212,7 @@ class Application
             StartNewFrame();
 
             // Render the custom title bar
-            titleBar(window->getNativeHandle());
+            titleBar(window->getNativeHandle(), *tabManager);
 
             // Render the currently active tab (chat tab in this example)
             tabManager->renderCurrentTab();