oracle
diff --git a/‎ads/aqua/common/entities.py
Lines changed: 4 additions & 0 deletions b/‎ads/aqua/common/entities.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎ads/aqua/common/utils.py
Lines changed: 0 additions & 1 deletion b/‎ads/aqua/common/utils.py
Lines changed: 0 additions & 1 deletion
diff --git a/‎ads/aqua/extension/recommend_handler.py
Lines changed: 1 addition & 0 deletions b/‎ads/aqua/extension/recommend_handler.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎ads/aqua/shaperecommend/constants.py
Lines changed: 35 additions & 22 deletions b/‎ads/aqua/shaperecommend/constants.py
Lines changed: 35 additions & 22 deletions
diff --git a/‎ads/aqua/shaperecommend/estimator.py
Lines changed: 45 additions & 18 deletions b/‎ads/aqua/shaperecommend/estimator.py
Lines changed: 45 additions & 18 deletions
diff --git a/‎ads/aqua/shaperecommend/llm_config.py
Lines changed: 40 additions & 18 deletions b/‎ads/aqua/shaperecommend/llm_config.py
Lines changed: 40 additions & 18 deletions
@@ -101,6 +101,10 @@ class ComputeShapeSummary(Serializable):
     including CPU, memory, and optional GPU characteristics.
     """
 
+    available: Optional[bool] = Field(
+        default = False,
+        description="True if shape is available on user tenancy, "
+    )
     core_count: Optional[int] = Field(
         default=None,
         description="Total number of CPU cores available for the compute shape.",
 
@@ -1287,7 +1287,6 @@ def load_gpu_shapes_index(
 
     # Merge: remote shapes override local
     local_shapes = local_data.get("shapes", {})
-    remote_data = {}
     remote_shapes = remote_data.get("shapes", {})
     merged_shapes = {**local_shapes, **remote_shapes}
 
 
@@ -4,6 +4,7 @@
 from ads.aqua.extension.base_handler import AquaAPIhandler
 from ads.aqua.extension.errors import Errors
 from ads.aqua.shaperecommend.recommend import AquaRecommendApp
+from ads.config import COMPARTMENT_OCID
 
 
 class AquaRecommendHandler(AquaAPIhandler):
 
@@ -14,43 +14,56 @@
 
 NEXT_QUANT suggests the next quantization level based on the current quantization (if applied) or the model weights (if no quantization yet)
 """
+
 LLAMA_REQUIRED_FIELDS = [
-    "num_hidden_layers", "hidden_size", "num_attention_heads",
-    "num_key_value_heads", "head_dim", "intermediate_size", "vocab_size"
+    "num_hidden_layers",
+    "hidden_size",
+    "num_attention_heads",
+    "num_key_value_heads",
+    "head_dim",
+    "intermediate_size",
+    "vocab_size",
 ]
 
-MOE_REQUIRED_FIELDS = LLAMA_REQUIRED_FIELDS + [
-    "num_local_experts", "intermediate_size"
-]
+MOE_REQUIRED_FIELDS = LLAMA_REQUIRED_FIELDS + ["num_local_experts", "intermediate_size"]
 
 NEXT_QUANT = {
-    "float32": ["8bit", "4bit"], # bits and bytes does not support bfloat16, pytorch responsibility
-    "bfloat16": ["8bit", "4bit"],
-    "float16": ["8bit", "4bit"],
+    "float32": ["4bit"],  # vLLM only supports 4bit in-flight-quantization
+    "bfloat16": ["4bit"],
+    "float16": ["4bit"],
     "int8": ["4bit"],
-    "fp8":  ["4bit"],
+    "fp8": ["4bit"],
     "8bit": ["4bit"],
     "int4": ["No smaller quantization available"],
-    "4bit": ["No smaller quantization available"]
+    "4bit": ["No smaller quantization available"],
 }
 
 TEXT_GENERATION = "text_generation"
 SAFETENSORS = "safetensors"
 
+IN_FLIGHT_QUANTIZATION = {"4bit"}
+
 TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "
 
+VLLM_PARAMS = {
+    "max_model_len": "--max-model-len",
+    "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
+}
 
-QUANT_MAPPING = {
-            "float32": 4,
-            "bfloat16": 2,
-            "float16": 2,
-            "fp16": 2,
-            "half": 2,
-            "int8": 1,
-            "fp8": 1,
-            "8bit": 1,
-            "4bit": 0.5,
-            "int4": 0.5,
-        }
+DEFAULT_WEIGHT_SIZE = "float32"
 
+BITS_AND_BYTES_8BIT = "8bit"
+BITS_AND_BYTES_4BIT = "4bit"
 
+QUANT_MAPPING = {
+    "float32": 4,
+    "bfloat16": 2,
+    "float16": 2,
+    "fp16": 2,
+    "half": 2,
+    "int8": 1,
+    "fp8": 1,
+    "8bit": 1,
+    "4bit": 0.5,
+    "int4": 0.5,
+}
@@ -7,10 +7,12 @@
 
 from ads.aqua.app import logger
 from ads.aqua.shaperecommend.constants import (
+    IN_FLIGHT_QUANTIZATION,
     LLAMA_REQUIRED_FIELDS,
     MOE_REQUIRED_FIELDS,
     NEXT_QUANT,
     QUANT_MAPPING,
+    VLLM_PARAMS,
 )
 from ads.aqua.shaperecommend.llm_config import LLMConfig
 
@@ -47,7 +49,7 @@ def kv_cache_memory(self) -> float:
         c = self.llm_config
         kv_cache_dtype_bytes = QUANT_MAPPING.get(
             c.weight_dtype, 2
-        )  # vLLM uses model's weight/quantization applied to KV cache
+        )  # vLLM applies the model's weight dtype to the KV cache
 
         total_bytes = (
             self.batch_size
@@ -84,7 +86,9 @@ def total_memory(self) -> float:
         """
         return self.model_memory + self.kv_cache_memory
 
-    def validate_shape(self, allowed_gpu_memory: float, gpu_utilization: float = 0.9) -> bool:
+    def validate_shape(
+        self, allowed_gpu_memory: float, gpu_utilization: float = 0.9
+    ) -> bool:
         """
         Validates if a given model estimator fits within the allowed GPU memory budget, using a fixed utilization margin.
 
@@ -102,6 +106,30 @@ def validate_shape(self, allowed_gpu_memory: float, gpu_utilization: float = 0.9
         """
         return (allowed_gpu_memory * gpu_utilization) > self.total_memory
 
+    def construct_deployment_params(self) -> str:
+        """
+        Constructs a deployment parameter string for the model.
+
+        This method assembles runtime configuration parameters to be passed
+        during model deployment. It:
+        - Overrides the max sequence length if a shorter length is provided.
+        - Suggests in-flight quantization **only if the model is unquantized**
+            and in-flight quantization (such as '4bit') is requested in config.
+
+        Returns:
+            str: Parameter string for model deployment.
+        """
+        c = self.llm_config
+        params = ""
+        if self.seq_len < c.max_seq_len:
+            params += f"{VLLM_PARAMS['max_model_len']} {str(self.seq_len)}"
+
+        # Only suggest in-flight quantization for unquantized models when such quantization is requested
+        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+            params += " " + VLLM_PARAMS["in_flight_quant"]
+
+        return params
+
     def suggest_param_advice(self, allowed: float) -> str:
         """
         Suggests parameter modifications to help a model fit within GPU memory limits.
@@ -126,12 +154,12 @@ def suggest_param_advice(self, allowed: float) -> str:
         config = self.llm_config
 
         suggested_quant_msg = None
-        quant_advice = ", ".join(getattr(config, "suggested_quantizations", []))
+        quant_advice = ", ".join(config.suggested_quantizations)
         quantization = getattr(config, "quantization", None)
 
         advice = []
 
-        if getattr(config, "suggested_quantizations", []):
+        if config.suggested_quantizations:
             to_do = f", which is smaller than the current {quantization if quantization in NEXT_QUANT else weight_size} format."
             if "No" in quant_advice:
                 suggested_quant_msg = "No smaller quantized version exists. Use a model with fewer parameters."
@@ -142,37 +170,36 @@ def suggest_param_advice(self, allowed: float) -> str:
                 )
             else:
                 suggested_quant_msg = (
-                    f"Use a model with or apply in-flight {quant_advice} quantization" + to_do
+                    f"Either use a pre-quantized model at {quant_advice}, or apply in-flight {quant_advice} quantization"
+                    + to_do
                 )
 
-        kv_advice = [
-            f"Reduce maximum context length (set --max-model-len < {seq_len})"
-        ]
+        kv_advice = [f"Reduce maximum context length (set --max-model-len < {seq_len})"]
 
         if batch_size != 1:
             kv_advice.append(f"Reduce batch size to less than {batch_size}.")
 
         wt_advice = [
             "Use a model with fewer parameters.",
-            f"{suggested_quant_msg}"
-            if suggested_quant_msg
-            else ""
+            f"{suggested_quant_msg}" if suggested_quant_msg else "",
         ]
 
         if kv_gb > wt_gb and kv_gb > allowed * 0.5:
-            main = "KV cache memory usage is the main limiting factor."
+            main = "KV cache memory usage is the main limiting factor"
             advice = kv_advice
         elif wt_gb > kv_gb and wt_gb > allowed * 0.5:
-            main = "Model weights are the main limiting factor."
+            main = "Model weights are the main limiting factor"
             advice = wt_advice
         else:
-            main = "Both model weights and KV cache are significant contributors to memory use."
+            main = "Both model weights and KV cache are significant contributors to memory use"
             advice = kv_advice
             advice.extend(wt_advice)
 
         advice_str = "\n".join(f"{i}. {item}" for i, item in enumerate(advice, 1))
 
-        return f"{advice_str}\n\n{main} (KV cache: {kv_gb:.1f}GB, Weights: {wt_gb:.1f}GB)."
+        return (
+            f"{advice_str}\n\n{main} (KV cache: {kv_gb:.1f}GB, Weights: {wt_gb:.1f}GB)."
+        )
 
     def limiting_factor(
         self, allowed_gpu_memory: float, warn_delta: float = 0.85
@@ -202,8 +229,7 @@ def limiting_factor(
             advice = (
                 f"While the selected compute shape is estimated to work "
                 f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed), "
-                f"the model configuration is close to the GPU memory limit. "
-                "This estimation is theoretical; actual memory usage may vary at runtime.\n\n"
+                f"the model configuration is close to the GPU memory limit.\n\n"
                 "If you encounter issues with this shape, consider the following options to reduce memory usage:\n\n"
                 f"{model_params.lstrip()}"
             )
@@ -216,7 +242,7 @@ def limiting_factor(
             )
         else:
             advice = (
-                f"Model fits well within the allowed compute shape "
+                f"No override PARAMS needed. \n\nModel fits well within the allowed compute shape "
                 f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed)."
             )
         return advice
@@ -252,6 +278,7 @@ def model_memory(self) -> float:
         layer_params = attn_params + mlp_params
         # Total params
         num_params = c.num_hidden_layers * layer_params + embedding_params
+
         return num_params * c.bytes_per_parameter / 1e9
 
     @property
 
@@ -8,7 +8,13 @@
 from pydantic import BaseModel, Field
 
 from ads.aqua.common.errors import AquaRecommendationError
-from ads.aqua.shaperecommend.constants import NEXT_QUANT, QUANT_MAPPING
+from ads.aqua.shaperecommend.constants import (
+    BITS_AND_BYTES_4BIT,
+    BITS_AND_BYTES_8BIT,
+    DEFAULT_WEIGHT_SIZE,
+    NEXT_QUANT,
+    QUANT_MAPPING,
+)
 
 
 class LLMConfig(BaseModel):
@@ -35,10 +41,11 @@ class LLMConfig(BaseModel):
         description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
     )
     max_seq_len: Optional[int] = Field(
-        8192, description="Maximum input sequence length (context window)."
+        4096, description="Maximum input sequence length (context window)."
     )
     weight_dtype: Optional[str] = Field(
-        "float32", description="Parameter data type: 'float32', 'float16', etc."
+        DEFAULT_WEIGHT_SIZE,
+        description="Parameter data type: 'float32', 'float16', etc.",
     )
     quantization: Optional[str] = Field(
         None,
@@ -49,6 +56,11 @@ class LLMConfig(BaseModel):
         description="Quantization method (e.g., '8bit', '4bit', 'gptq', 'awq') or None if unquantized.",
     )
 
+    in_flight_quantization: Optional[str] = Field(
+        None,
+        description="By setting this, enables recalculation of model footprint using 4bit in-flight quantization",
+    )
+
     num_key_value_heads: Optional[int] = Field(
         None,
         description="Number of key/value heads (for GQA architectures: Llama, Mistral, Falcon, Qwen, etc.). Used to determine KV cache size",
@@ -82,9 +94,13 @@ def bytes_per_parameter(self) -> float:
             bits = int(m[1])
             return bits / 8  # bytes per parameter
 
+        # consider in-flight quantization
+        if self.in_flight_quantization in QUANT_MAPPING:
+            return QUANT_MAPPING[self.in_flight_quantization]
+
         # Fallback to dtype mapping
-        dtype = (self.weight_dtype or "float32").lower()
-        return QUANT_MAPPING.get(dtype, QUANT_MAPPING["float32"])
+        dtype = (self.weight_dtype or DEFAULT_WEIGHT_SIZE).lower()
+        return QUANT_MAPPING.get(dtype, QUANT_MAPPING[DEFAULT_WEIGHT_SIZE])
 
     @classmethod
     def detect_quantization_type(cls, raw: dict) -> Optional[str]:
@@ -114,9 +130,9 @@ def detect_quantization_bits(cls, raw: dict) -> Optional[str]:
         Detects quantization bit-width as a string (e.g., '4bit', '8bit') from Hugging Face config dict.
         """
         if raw.get("load_in_8bit"):
-            return "8bit"
+            return BITS_AND_BYTES_8BIT
         if raw.get("load_in_4bit"):
-            return "4bit"
+            return BITS_AND_BYTES_4BIT
         if "quantization_config" in raw:
             qcfg = raw["quantization_config"]
             bits = qcfg.get("bits") or qcfg.get("wbits")
@@ -132,7 +148,12 @@ def suggested_quantizations(self):
         If model is un-quantized, uses the weight size.
         If model is pre-quantized, uses the quantization level.
         """
-        key = (self.quantization or self.weight_dtype or "float32").lower()
+        key = (
+            self.quantization
+            or self.in_flight_quantization
+            or self.weight_dtype
+            or DEFAULT_WEIGHT_SIZE
+        ).lower()
         return NEXT_QUANT.get(key, [])
 
     def calculate_possible_seq_len(self, min_len=2048):
@@ -142,22 +163,21 @@ def calculate_possible_seq_len(self, min_len=2048):
         """
         vals = []
         curr = min_len
-        max_seq_len = 16384 if not self.max_seq_len else self.max_seq_len
-        while curr <= max_seq_len:
+        while curr <= self.max_seq_len:
             vals.append(curr)
             curr *= 2
-        if vals and vals[-1] != max_seq_len:
-            vals.append(max_seq_len)
+        if vals and vals[-1] != self.max_seq_len:
+            vals.append(self.max_seq_len)
         return vals
 
     def optimal_config(self):
         """
         Builds a list of optimal configuration parameters (sorted descending). Combination of:
-            - Quantization / weight sizes: bfloat16 weight size -> 8bit -> 4bit
+            - Quantization / weight sizes: bfloat16 weight size -> 4bit
             - max-model-len: power-of-two model lengths from max length (config.json of model) to 2048 tokens.
 
         Example:
-        [('bfloat16', max_model_len supported by model) ('bfloat16', 1/2 of max_model_len) ... ('int8', 2048), ('int4', 4096), ('int4', 2048)]
+        [('bfloat16', max_model_len supported by model), ('bfloat16', 1/2 of max_model_len), ... ('int4', 4096), ('int4', 2048)]
 
         """
         # Create a copy of the suggested_quantizations list
@@ -183,9 +203,11 @@ def validate_model_support(cls, raw: dict) -> ValueError:
         """
         excluded_models = {"t5", "gemma", "bart", "bert", "roberta", "albert"}
         if (
-            raw.get("is_encoder_decoder", False) # exclude encoder-decoder models
-            or (raw.get("is_decoder") is False) # exclude explicit encoder-only models (altho no text-generation task ones, just dbl check)
-            or raw.get("model_type", "").lower() # exclude by known model types
+            raw.get("is_encoder_decoder", False)  # exclude encoder-decoder models
+            or (
+                raw.get("is_decoder") is False
+            )  # exclude explicit encoder-only models (although none have a text-generation task; this is just a double check)
+            or raw.get("model_type", "").lower()  # exclude by known model types
             in excluded_models
         ):
             raise AquaRecommendationError(
@@ -207,7 +229,7 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
         )
         hidden_size = raw.get("hidden_size") or raw.get("n_embd") or raw.get("d_model")
         vocab_size = raw.get("vocab_size")
-        weight_dtype = str(raw.get("torch_dtype", "float32"))
+        weight_dtype = str(raw.get("torch_dtype", DEFAULT_WEIGHT_SIZE))
         quantization = cls.detect_quantization_bits(raw)
         quantization_type = cls.detect_quantization_type(raw)