huggingface · f14-bertolotti · Nov 17, 2025
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
@@ -544,6 +544,7 @@ class AsyncVLLMModel(VLLMModel):
     is_async = True
 
     def cleanup(self):
+        self.model.shutdown()
         gc.collect()
         destroy_distributed_environment()
         torch.cuda.empty_cache()
@@ -627,7 +628,6 @@ async def _async_batch(self, docs: list[Doc], generative: bool) -> list:
         results = await asyncio.gather(*processed_requests)
         return results
 
-    @cached(SamplingMethod.GENERATIVE)
     async def greedy_until(
         self,
         docs: list[Doc],
@@ -661,7 +661,6 @@ async def greedy_until(
 
         return results
 
-    @cached(SamplingMethod.LOGPROBS)
     async def loglikelihood(
         self,
         docs: list[Doc],

diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
@@ -308,6 +308,8 @@ async def _run_model_async(self):
                     model_outputs = await self.model.loglikelihood(docs)
                     outputs[sampling_method] = model_outputs
 
+        self.model.cleanup()
+
         return outputs
 
     def _run_model_sync(self):
@@ -327,6 +329,8 @@ def _run_model_sync(self):
                     model_outputs = self.model.loglikelihood_rolling(docs)
                     outputs[sampling_method] = model_outputs
 
+        self.model.cleanup()
+
         return outputs
 
     def _run_model(self):
@@ -339,9 +343,6 @@ def _run_model(self):
         else:
             outputs = self._run_model_sync()
 
-        # Cleaning up the model before running metrics
-        self.model.cleanup()
-
         return outputs
 
     def _post_process_outputs(self, sampling_method_responses: dict[str, list[ModelResponse]]):