Commit 4a18b22

Succeeded in creating a simple server and interacting with it

Add auth endpoint
Working with a single server
Multi-server succeeds in a simple scenario
Works for the case where there are more batches than workers
Mimic server errors
Auto test, and store in cache
Add shutdown mechanism
Start working on CCC

Signed-off-by: Elad Venezian <eladv@il.ibm.com>
1 parent 3fd80f1 commit 4a18b22

5 files changed: +520, -41 lines

ccc_worker_server.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
import logging
import os
import random
import sys
import threading
import time

import requests
from flask import Flask, jsonify, request
from unitxt.inference import HFPipelineBasedInferenceEngine

logging.basicConfig(level=logging.INFO)

app = Flask(__name__)
PORT = None


class Server:
    def __init__(self):
        self.inference_engine = None
        self.inactivity_timeout = 600
        self.monitor_thread = threading.Thread(target=self.monitor_activity, daemon=True)
        self.last_request_time = time.time()
        self.shutdown_flag = False
        self.monitor_thread.start()

    def update_last_request_time(self):
        self.last_request_time = time.time()

    def monitor_activity(self):
        while not self.shutdown_flag:
            time.sleep(5)
            if time.time() - self.last_request_time > self.inactivity_timeout:
                app.logger.info(f"No requests for {self.inactivity_timeout} seconds. Shutting down server...")
                try:
                    requests.post(f"http://localhost:{PORT}/shutdown", timeout=5)
                except Exception:
                    pass
            else:
                app.logger.info(
                    f"{int(self.inactivity_timeout - (time.time() - self.last_request_time))} seconds till shutdown...")

    def shutdown_server(self):
        self.shutdown_flag = True
        app.logger.info("Server shutting down...")
        shutdown_func = request.environ.get("werkzeug.server.shutdown")
        if shutdown_func:
            shutdown_func()
        # Allow the shutdown process to complete, then force exit the program
        time.sleep(1)
        os._exit(0)  # This immediately stops the program

    def init_server(self, **kwargs):
        kwargs["use_cache"] = True
        self.inference_engine = HFPipelineBasedInferenceEngine(**kwargs)

    def infer(self, inputs):
        return self.inference_engine(inputs)


server = Server()


@app.before_request
def update_activity():
    server.update_last_request_time()


@app.route("/shutdown", methods=["POST"])
def shutdown():
    app.logger.info("Received shutdown request")
    server.shutdown_server()
    return jsonify({"message": "Shutting down server..."}), 200


@app.route("/init_server", methods=["POST"])
def init_server():
    kwargs = request.get_json()
    server.init_server(**kwargs)
    return jsonify("Accepted")


@app.route("/<model>/v1/chat/completions", methods=["POST"])
@app.route("/<model_prefix>/<model>/v1/chat/completions", methods=["POST"])
def completions(model: str, model_prefix: str = "None"):
    # Error-injection switch: raise the threshold (e.g. to 0.1) to mimic a failing server.
    if random.random() < 0:
        logging.error("Bad luck! Returning 500 with an error message.")
        app.logger.info("Server shutting down...")
        shutdown_func = request.environ.get("werkzeug.server.shutdown")
        if shutdown_func:
            shutdown_func()
        # Allow the shutdown process to complete, then force exit the program
        time.sleep(1)
        os._exit(0)  # This immediately stops the program
        return jsonify({"error": "Bad luck, something went wrong!"}), 500

    body = request.get_json()
    # Validate that the request parameters match the model config. Print warnings if not.
    for k, v in body.items():
        if k == "messages":
            continue
        k = "model_name" if k == "model" else k
        attr = getattr(server.inference_engine, k, None)
        if attr is None:
            logging.warning(f"Warning: {k} is not an attribute in inference_engine")
        elif attr != v:
            logging.warning(f"Warning: {k} value in body ({v}) is different from value in inference engine ({attr})")
    texts = [{"source": m[0]["content"]} for m in body["messages"]]
    predictions = server.inference_engine(texts)
    return jsonify({
        "choices": [{"message": {"role": "assistant", "content": p}} for p in predictions],
    })


@app.route("/status", methods=["GET"])
def status():
    return "up", 200


if __name__ == "__main__":
    PORT = int(sys.argv[1])
    app.run(host="0.0.0.0", port=PORT, debug=True)
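For context, a minimal sketch of how a client could exercise this worker server once it is running (it assumes the server was started with "python ccc_worker_server.py 5000"; the model name and max_new_tokens value are illustrative, not taken from the commit):

import requests

BASE = "http://localhost:5000"  # assumes: python ccc_worker_server.py 5000

# Check the server is up.
assert requests.get(f"{BASE}/status").text == "up"

# Load the HF pipeline on the worker; kwargs are forwarded to HFPipelineBasedInferenceEngine.
# The model name and generation parameter below are illustrative values.
init = requests.post(f"{BASE}/init_server", json={
    "model_name": "google/flan-t5-small",
    "max_new_tokens": 32,
})
init.raise_for_status()

# The mock completions endpoint expects "messages" to be a list of message lists,
# and reads only the first message of each list (m[0]["content"]).
resp = requests.post(
    f"{BASE}/google/flan-t5-small/v1/chat/completions",
    json={"messages": [[{"role": "user", "content": "What is the capital of France?"}]]},
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

The /shutdown endpoint can be posted to in the same way; otherwise the server shuts itself down after the inactivity timeout.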

src/unitxt/inference.py

Lines changed: 203 additions & 5 deletions
@@ -7,11 +7,14 @@
 import json
 import logging
 import os
+import random
 import re
 import sys
+import threading
 import time
 import uuid
 from collections import Counter
+from concurrent.futures import Future, ThreadPoolExecutor, wait
 from datetime import datetime
 from itertools import islice
 from multiprocessing.pool import ThreadPool
@@ -30,6 +33,7 @@
     Union,
 )

+import requests
 from datasets import Dataset, DatasetDict, Image
 from tqdm import tqdm, trange
 from tqdm.asyncio import tqdm_asyncio
@@ -276,7 +280,7 @@ def infer(
                     if prediction is None:
                         continue
                     cache_key = self._get_cache_key(item)
-                    self._cache[cache_key] = prediction
+                    self.store_in_cache(cache_key, prediction)
             else:
                 inferred_results = []
             # Combine cached and inferred results in original order
@@ -286,6 +290,9 @@ def infer(
                 result.extend(batch_predictions)
         else:
             result = self._infer(dataset, return_meta_data)
+
+        result = self.post_process_results(result)
+
         return ListWithMetadata(
             result,
             metadata={
@@ -295,6 +302,12 @@
             },
         )

+    def store_in_cache(self, cache_key, prediction):
+        self._cache[cache_key] = prediction
+
+    def post_process_results(self, result):
+        return result
+
     def _mock_infer(
         self,
         dataset: Union[List[Dict[str, Any]], Dataset],
@@ -1957,7 +1970,7 @@ def prepare_engine(self):
     @staticmethod
     def get_base_url_from_model_name(model_name: str):
         base_url_template = (
-            "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/{}"
+            "http://localhost:5000/{}"
         )
         return base_url_template.format(
             RITSInferenceEngine._get_model_name_for_endpoint(model_name)
@@ -3546,10 +3559,9 @@ def _infer(
         dataset: Union[List[Dict[str, Any]], Dataset],
         return_meta_data: bool = False,
     ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-        if return_meta_data and not hasattr(self.engine, "get_return_object"):
+        if return_meta_data:
             raise NotImplementedError(
-                f"Inference engine {self.engine.__class__.__name__} does not support return_meta_data as it "
-                f"does not contain a 'get_return_object' method. Please set return_meta_data=False."
+                f"Inference engine {self.engine.__class__.__name__} does not support return_meta_data."
             )

         inputs = []
@@ -3576,3 +3588,189 @@ def _infer(
            predictions.append(options_scores.most_common(1)[0][0])

        return predictions


class MultiServersInferenceEngine(OpenAiInferenceEngine, HFGenerationParamsMixin):

    workers_url: List[str]

    def post_server(self, server_url, endpoint, data):
        headers = {"Content-Type": "application/json"}
        response = requests.post(url=f"{server_url}/{endpoint}", json=data, headers=headers)
        response.raise_for_status()
        return response.json()

    def prepare_engine(self):
        from openai import OpenAI

        self.lock = threading.Lock()
        self.workers_state = {}
        credentials = self._prepare_credentials()
        for url in self.workers_url:
            init_result = self.post_server(
                endpoint="init_server",
                server_url=url,
                data={**self.to_dict([HFGenerationParamsMixin]), "model_name": self.model_name},
            )
            if init_result == "Accepted":
                self.add_worker(url, client=OpenAI(
                    api_key=credentials["api_key"],
                    base_url=f"{url}/{self.model_name}/v1",
                    default_headers=self.get_default_headers(),
                ))

        # def init_server_and_add_to_workers_list

    def add_worker(self, url, client):
        with self.lock:
            self.workers_state[url] = {"status": "ready", "client": client}

    def release_worker(self, url):
        with self.lock:
            self.workers_state[url]["status"] = "ready"

    def assign_worker(self):
        # Poll until a worker is free. The lock is held only while scanning the state
        # dict, so release_worker() can mark workers ready between polls.
        while True:
            with self.lock:
                for url, rec in self.workers_state.items():
                    if rec["status"] == "ready":
                        rec["status"] = "assigned"
                        return url, rec["client"]
            time.sleep(random.uniform(0, 1))

    def _prepare_credentials(self) -> CredentialsOpenAi:
        return {"api_key": "no-api-key"}

    def _infer(
        self,
        dataset: Union[List[Dict[str, Any]], Dataset],
        return_meta_data: bool = False,
    ) -> List[Any]:  # Returns a list of Future objects
        """Runs inference in parallel, returning futures for each batch element."""
        # Lazy-initialize the executor if not already created
        if not hasattr(self, "_executor"):
            self._executor = ThreadPoolExecutor(max_workers=len(self.workers_state))

        # Submit the batch job
        batch_future = self._executor.submit(self._run_batch, dataset, return_meta_data)

        # Create individual futures that resolve when batch_future is done
        element_futures = [Future() for _ in dataset]

        def set_results(batch_fut: Future):
            """Callback to set individual results once the batch computation is done."""
            try:
                results = batch_fut.result()  # Get the batch results
                for i, res in enumerate(results):
                    element_futures[i].set_result(res)  # Set each individual future
            except Exception as e:
                for f in element_futures:
                    f.set_exception(e)  # Propagate any exception

        # Attach the callback to the batch future
        batch_future.add_done_callback(set_results)

        return element_futures  # Return a list of futures

    def _run_batch(self, batch, return_meta_data):
        """Helper function to process a batch inside a thread."""
        logger.info(f"Trying to get assigned: {self.workers_state}")
        url, client = self.assign_worker()
        logger.info(f"Thread {url} processing batch: {self.workers_state}")
        messages = [self.to_messages(instance) for instance in batch]
        logger.info(f"Sending batch to {url}")
        try:
            response = client.chat.completions.create(
                messages=messages,
                model=self.model_name,
                **self._get_completion_kwargs(),
            )
            logger.info(f"response: {response}")
            predictions = [r.message.content for r in response.choices]
            result = [self.get_return_object(p, response, return_meta_data) for p in predictions]
        finally:
            logger.info(f"Thread {url} release state:")
            self.release_worker(url)
            logger.info(f"Thread {url} release state done: {self.workers_state}")
        return result

    def post_process_results(self, result):
        futures = [r for r in result if isinstance(r, Future)]
        if futures:
            wait(futures)

        return [r.result() if isinstance(r, Future) else r for r in result]

    def store_in_cache(self, cache_key, prediction):
        if isinstance(prediction, Future):
            def store_after_unpack_in_cache(future, cache_key):
                prediction = future.result()
                if prediction is not None:
                    self._cache[cache_key] = prediction

            prediction.add_done_callback(lambda f, key=cache_key: store_after_unpack_in_cache(f, key))
        else:
            self._cache[cache_key] = prediction


class CCCInferenceEngine(MultiServersInferenceEngine):
    ccc_host: str
    ccc_user: str
    ccc_path: str
    ccc_python: str
    server_port: str = "5000"
    num_of_workers: int = 5
    workers_url: List[str] = []

    def prepare_engine(self):
        assert not self.workers_url, "CCCInferenceEngine doesn't support explicit setting of workers_url"
        self.start_ccc_servers()
        super().prepare_engine()  # connect to the workers started on CCC

    def start_ccc_servers(self):
        import paramiko

        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(self.ccc_host, username=self.ccc_user)
        ssh.exec_command(f"mkdir -p {self.ccc_path}")
        self.ccc_jobs = {}
        for i in range(self.num_of_workers):
            command = f"bash -l -c 'jbsub -queue x86_6h -cores 4+1 -require v100 -mem 24G -out ~/server{i}.log {self.ccc_python} /dccstor/fuse/unitxt/ccc_worker_server.py {self.server_port}'"
            stdin, stdout, stderr = ssh.exec_command(command)
            job_output = stdout.read().decode().strip()
            job_error = stderr.read().decode().strip()
            match = re.search(r"Job <(\d+)> is submitted", job_output)
            if match:
                job_id = match.group(1)
                logger.info(f"Start job ID: {job_id}")
                self.ccc_jobs[job_id] = {"status": "AVAIL", "log_id": i}
            else:
                raise RuntimeError(f"Failed to run jbsub on host {self.ccc_host}.\nstdout: {job_output}.\nstderr: {job_error}")

        def run_monitor_ccc_jobs(ssh, sample_every):
            while True:
                command = "bash -l -c 'jbinfo'"
                stdin, stdout, stderr = ssh.exec_command(command)
                output = stdout.read().decode().strip()
                for job_id in self.ccc_jobs.keys():
                    match = re.search(rf"^{job_id}\s+\S+\s+(\w+)", output, re.MULTILINE)
                    if match:
                        status = match.group(1)
                        if status != self.ccc_jobs[job_id]["status"]:
                            if status == "RUN":
                                pass  # TODO: add the worker server to the workers list
                            elif self.ccc_jobs[job_id]["status"] == "RUN":
                                pass  # TODO: remove the worker from the list; consider fetching the server log
                            self.ccc_jobs[job_id]["status"] = status
                            logger.info(f"status has been changed: {job_id} - {status}")

                time.sleep(sample_every)

        thread = threading.Thread(target=run_monitor_ccc_jobs, args=(ssh, 10))
        thread.daemon = True
        thread.start()

        time.sleep(200)  # Keeps the main thread alive so the background thread can continue
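To tie the two files together, a rough usage sketch (the model name, generation parameter, and dataset contents below are illustrative; it assumes two worker servers were already launched with ccc_worker_server.py on ports 5000 and 5001):

from unitxt.inference import MultiServersInferenceEngine

# Assumes two ccc_worker_server.py instances are already listening on these URLs.
engine = MultiServersInferenceEngine(
    model_name="google/flan-t5-small",   # illustrative model
    max_new_tokens=32,                   # illustrative HF generation parameter
    workers_url=["http://localhost:5000", "http://localhost:5001"],
)

# _infer() hands each batch to a free worker and returns Futures;
# post_process_results() waits on them, so infer() still yields plain predictions.
dataset = [{"source": "What is the capital of France?"} for _ in range(8)]
predictions = engine.infer(dataset)
print(predictions)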
