diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 969caf8fa..94c335bd6 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -544,6 +544,7 @@ class AsyncVLLMModel(VLLMModel):
     is_async = True
 
     def cleanup(self):
+        self.model.shutdown()
         gc.collect()
         destroy_distributed_environment()
         torch.cuda.empty_cache()
@@ -627,7 +628,6 @@ async def _async_batch(self, docs: list[Doc], generative: bool) -> list:
         results = await asyncio.gather(*processed_requests)
         return results
 
-    @cached(SamplingMethod.GENERATIVE)
     async def greedy_until(
         self,
         docs: list[Doc],
@@ -661,7 +661,6 @@ async def greedy_until(
 
         return results
 
-    @cached(SamplingMethod.LOGPROBS)
     async def loglikelihood(
         self,
         docs: list[Doc],
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 1f5da9c14..9d43c96ac 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -308,6 +308,8 @@ async def _run_model_async(self):
                 model_outputs = await self.model.loglikelihood(docs)
             outputs[sampling_method] = model_outputs
 
+        self.model.cleanup()
+
         return outputs
 
     def _run_model_sync(self):
@@ -327,6 +329,8 @@ def _run_model_sync(self):
                 model_outputs = self.model.loglikelihood_rolling(docs)
             outputs[sampling_method] = model_outputs
 
+        self.model.cleanup()
+
         return outputs
 
     def _run_model(self):
@@ -339,9 +343,6 @@ def _run_model(self):
         else:
             outputs = self._run_model_sync()
 
-        # Cleaning up the model before running metrics
-        self.model.cleanup()
-
         return outputs
 
     def _post_process_outputs(self, sampling_method_responses: dict[str, list[ModelResponse]]):