diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py
index eb33c87..108dbba 100644
--- a/kimodo/model/llm2vec/llm2vec_wrapper.py
+++ b/kimodo/model/llm2vec/llm2vec_wrapper.py
@@ -71,3 +71,46 @@ def __call__(self, text: list[str] | str):
         encoded_text = torch.tensor(encoded_text).to(self.get_device())
 
         return encoded_text, lengths
+
+
+class DummyTextEncoder:
+    """Zero-vector text encoder for constraint-only generation without LLM weights.
+
+    Activated by setting TEXT_ENCODER_MODE=dummy. Returns zero embeddings
+    of the correct shape (llm_dim=4096), which the model treats as
+    unconditional (same as empty-text in classifier-free guidance training).
+
+    This allows running Kimodo on GPUs with <17GB VRAM and without
+    Llama-3 access, using only kinematic constraints for motion control.
+    """
+
+    def __init__(self, llm_dim: int = 4096, device: str = "cuda:0") -> None:
+        self.llm_dim = llm_dim
+        self._device = torch.device(device)
+        print(f"[Kimodo] Using DummyTextEncoder (zero embeddings, dim={llm_dim})")
+        print("[Kimodo] Text prompts will be ignored. Use constraints for motion control.")
+
+    def to(self, device: torch.device):
+        self._device = torch.device(device)
+        return self
+
+    def eval(self):
+        return self
+
+    def get_device(self):
+        return self._device
+
+    def __call__(self, text: list[str] | str):
+        is_string = False
+        if isinstance(text, str):
+            text = [text]
+            is_string = True
+
+        encoded_text = torch.zeros(len(text), 1, self.llm_dim, device=self._device)
+        lengths = np.ones(len(text), dtype=int).tolist()
+
+        if is_string:
+            encoded_text = encoded_text[0]
+            lengths = lengths[0]
+
+        return encoded_text, lengths
diff --git a/kimodo/model/load_model.py b/kimodo/model/load_model.py
index b2bed99..fe797aa 100644
--- a/kimodo/model/load_model.py
+++ b/kimodo/model/load_model.py
@@ -81,8 +81,14 @@ def _select_text_encoder_conf(text_encoder_url: str) -> dict:
     # TEXT_ENCODER_MODE options:
     # - "api": force TextEncoderAPI
     # - "local": force local LLM2VecEncoder
+    # - "dummy": zero-vector encoder (no LLM needed, constraint-only)
     # - "auto": try API first, fallback to local if unreachable
     mode = get_env_var("TEXT_ENCODER_MODE", "auto").lower()
+    if mode == "dummy":
+        return {
+            "_target_": "kimodo.model.llm2vec.llm2vec_wrapper.DummyTextEncoder",
+            "llm_dim": 4096,
+        }
     if mode == "local":
         return _build_local_text_encoder_conf()
     if mode == "api":