
Commit 16123ec

llm-scaler-omni: Integrate Xinference and Re-structure Omni Docs (#81)
* refine
* rename
* fix
* refine structure
* refine
* fix
* refine
* refine
* refine docs
* update omni
* reifine
* refine
* refine
1 parent 53da0b7 commit 16123ec


45 files changed (+3018, -308 lines)

omni/README.md

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
# llm-scaler-omni

---

## Table of Contents

1. [Getting Started with Omni Docker Image](#getting-started-with-omni-docker-image)
2. [ComfyUI](#comfyui)
3. [XInference](#xinference)
4. [Stand-alone Examples](#stand-alone-examples)

---

## Getting Started with Omni Docker Image

Build docker image:

```bash
bash build.sh
```

Run docker image:

```bash
export DOCKER_IMAGE=intel/llm-scaler-omni:0.1-b1
export CONTAINER_NAME=comfyui
export MODEL_DIR=<your_model_dir>
export COMFYUI_MODEL_DIR=<your_comfyui_model_dir>
sudo docker run -itd \
    --privileged \
    --net=host \
    --device=/dev/dri \
    -e no_proxy=localhost,127.0.0.1 \
    --name=$CONTAINER_NAME \
    -v $MODEL_DIR:/llm/models/ \
    -v $COMFYUI_MODEL_DIR:/llm/ComfyUI/models \
    --shm-size="64g" \
    --entrypoint=/bin/bash \
    $DOCKER_IMAGE

docker exec -it comfyui bash
```
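
Before moving on, it can be useful to confirm that the container actually sees the Intel GPUs. The checks below are a minimal sketch and assume the PyTorch XPU build that the image ships; adjust them if your setup differs:

```bash
# Inside the container: the render nodes passed in via --device=/dev/dri should be visible
ls /dev/dri

# Optional sanity check with the bundled PyTorch XPU build
python3 -c "import torch; print(torch.xpu.device_count())"
```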

## ComfyUI

```bash
cd /llm/ComfyUI

MODEL_PATH=<your_comfyui_models_path>
rm -rf /llm/ComfyUI/models
ln -s $MODEL_PATH /llm/ComfyUI/models
echo "Symbolic link created from $MODEL_PATH to /llm/ComfyUI/models"

export http_proxy=<your_proxy>
export https_proxy=<your_proxy>
export no_proxy=localhost,127.0.0.1

python3 main.py
```
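
By default ComfyUI binds to the loopback interface only. If you want to reach the web UI from another machine, the upstream `--listen`/`--port` options can usually be passed to `main.py` (a sketch, not specific to this image; run `python3 main.py --help` to confirm the flags in your build):

```bash
# Optional: bind to all interfaces so the UI is reachable from other machines
python3 main.py --listen 0.0.0.0 --port 8188
```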

Then you can access the web UI at `http://<your_local_ip>:8188/`. On the left side of the UI you can browse and load the bundled workflows:

![workflow image](./assets/confyui_workflow.png)
### ComfyUI workflows

Currently, the following workflows are supported on B60:
- Qwen-Image (refer to https://raw.githubusercontent.com/Comfy-Org/example_workflows/main/image/qwen/image_qwen_image_distill.json)
- Qwen-Image-Edit (refer to https://raw.githubusercontent.com/Comfy-Org/workflow_templates/refs/heads/main/templates/image_qwen_image_edit.json)
- Wan2.2-TI2V-5B (refer to https://raw.githubusercontent.com/Comfy-Org/workflow_templates/refs/heads/main/templates/video_wan2_2_5B_ti2v.json)
- Wan2.2-T2V-14B with raylight (refer to https://github.com/komikndr/raylight/blob/main/example_workflows/WanT2V_Raylight.json)
- Flux.1 Kontext Dev (Basic) workflow in ComfyUI examples (refer to https://docs.comfy.org/tutorials/flux/flux-1-kontext-dev)
- SD3.5 Simple in ComfyUI examples (refer to https://comfyanonymous.github.io/ComfyUI_examples/sd3/)

#### Qwen-Image

ComfyUI tutorial for qwen-image: https://docs.comfy.org/tutorials/image/qwen/qwen-image

Only the `Qwen-Image Native Workflow Example` part has been validated, and there are known issues when using LoRA. It is recommended to run the distilled version for better performance.

#### Qwen-Image-Edit

ComfyUI tutorial for qwen-image-edit: https://docs.comfy.org/tutorials/image/qwen/qwen-image-edit

#### Wan2.2-TI2V-5B

ComfyUI tutorial for wan2.2: https://docs.comfy.org/tutorials/video/wan/wan2_2

Due to the memory limit of a single device, only the `Wan2.2 TI2V 5B Hybrid Version Workflow Example` has been validated.

#### Wan2.2-T2V-14B with raylight

Currently, [WAN2.2-14B-Rapid-AllInOne](https://huggingface.co/Phr00t/WAN2.2-14B-Rapid-AllInOne) and [raylight](https://github.com/komikndr/raylight) are used as a faster solution with multi-XPU support. The model weights can be downloaded from [here](https://modelscope.cn/models/Phr00t/WAN2.2-14B-Rapid-AllInOne/files), and you may need to extract the UNet part and the VAE part separately with `tools/extract.py` (see the sketch below for the general idea).
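
If you prefer to inspect or reproduce the split manually, the idea is simply to group the checkpoint's tensors by key prefix and save each group to its own file. The snippet below is a hedged sketch using `safetensors`; the input filename and key prefixes are illustrative assumptions, not the actual layout handled by `tools/extract.py`, so check your checkpoint's keys first:

```bash
python3 - <<'PY'
# Hedged sketch: split an all-in-one checkpoint into separate UNet and VAE files.
# The filename and key prefixes below are assumptions for illustration only.
from safetensors.torch import load_file, save_file

ckpt = load_file("wan2.2-14b-rapid-aio.safetensors")
unet = {k: v for k, v in ckpt.items() if k.startswith("model.diffusion_model.")}
vae = {k: v for k, v in ckpt.items() if k.startswith("vae.")}
save_file(unet, "wan2.2_unet.safetensors")
save_file(vae, "wan2.2_vae.safetensors")
PY
```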

![wan_raylight](./assets/wan_raylight.png)

##### Follow the Steps to Complete the Workflow

1. Model Loading

   - Ensure the `Load Diffusion Model (Ray)` node loads the diffusion model part from WAN2.2-14B-Rapid-AllInOne.
   - Ensure the `Load VAE` node loads the VAE part from WAN2.2-14B-Rapid-AllInOne.
   - Ensure the `Load CLIP` node loads `umt5_xxl_fp8_e4m3fn_scaled.safetensors`.

2. Ray configuration

   Set `GPU` and `ulysses_degree` in the `Ray Init Actor` node to the number of GPUs you want to use.

3. Click the `Run` button or use the shortcut `Ctrl(cmd) + Enter` to run the workflow.

## XInference

```bash
export ZE_AFFINITY_MASK=0 # In a multi-XPU environment, explicitly select the GPU index to avoid issues.
xinference-local --host 0.0.0.0 --port 9997
```

Supported models:
- Stable Diffusion 3.5 Medium
- Kokoro 82M
- Whisper large v3

### WebUI Usage

#### 1. Access Xinference Web UI
![xinference_launch](./assets/xinference_launch.png)

#### 2. Select model and configure `model_path`
![xinference_model](./assets/xinference_configure.png)

#### 3. Find the running model and launch its Gradio UI
![xinference_gradio](./assets/xinference_gradio.png)

#### 4. Generate within the Gradio UI
![xinference_example](./assets/xinference_sd.png)

### OpenAI API Usage

> Visit http://127.0.0.1:9997/docs to inspect the API docs.

#### 1. Launch API service
You can select a model and launch the service via the WebUI (refer to [here](#1-access-xinference-web-ui)) or via the command line:

```bash
export ZE_AFFINITY_MASK=0 # In a multi-XPU environment, explicitly select the GPU index to avoid issues.
xinference-local --host 0.0.0.0 --port 9997

xinference launch --model-name sd3.5-medium --model-type image --model-path /llm/models/stable-diffusion-3.5-medium/
```
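
Before posting requests, you can confirm that the model is actually running. A quick way (assuming the default endpoint above) is the `xinference list` subcommand:

```bash
# List the models currently running on the local Xinference endpoint
xinference list --endpoint http://localhost:9997
```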

#### 2. Post requests in OpenAI API format

For TTS models (`Kokoro 82M` for example):
```bash
curl http://localhost:9997/v1/audio/speech -H "Content-Type: application/json" -d '{
  "model": "Kokoro-82M",
  "input": "kokoro, hello, I am kokoro."
}' --output output.wav
```

For STT models (`whisper large v3` for example):
```bash
AUDIO_FILE_PATH=<your_audio_file_path>

curl -X 'POST' \
  "http://localhost:9997/v1/audio/translations" \
  -H 'accept: application/json' \
  -F "model=whisper-large-v3" \
  -F "file=@${AUDIO_FILE_PATH}"

# Example response:
{"text":" Cacaro's hello, I am Cacaro."}
```
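
The `/v1/audio/translations` route above always translates the speech into English. For same-language speech-to-text, the OpenAI-style transcription route should work in the same way; the sketch below assumes the standard endpoint name, so confirm the exact route on the `/docs` page first:

```bash
# Hedged sketch: same-language transcription via the OpenAI-compatible route
curl -X 'POST' \
  "http://localhost:9997/v1/audio/transcriptions" \
  -H 'accept: application/json' \
  -F "model=whisper-large-v3" \
  -F "file=@${AUDIO_FILE_PATH}"
```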

For text-to-image models (`Stable Diffusion 3.5 Medium` for example):
```bash
curl http://localhost:9997/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "sd3.5-medium",
    "prompt": "A Shiba Inu chasing butterflies on a sunny grassy field, cartoon style, with vibrant colors.",
    "n": 1,
    "size": "1024x1024",
    "quality": "standard",
    "response_format": "url"
  }'
```
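
If you would rather save the generated image directly instead of following a returned URL, the OpenAI image API also accepts `"response_format": "b64_json"`. A minimal sketch, assuming `jq` and `base64` are available on the client machine:

```bash
# Hedged sketch: request base64 output and decode it straight to a file
curl -s http://localhost:9997/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "sd3.5-medium",
    "prompt": "A Shiba Inu chasing butterflies on a sunny grassy field, cartoon style, with vibrant colors.",
    "n": 1,
    "size": "1024x1024",
    "response_format": "b64_json"
  }' | jq -r '.data[0].b64_json' | base64 -d > output.png
```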

## Stand-alone Examples

> Note: Stand-alone examples are not included in the `intel/llm-scaler-omni` image.

Supported models:
- Hunyuan3D 2.1
- Qwen Image
- Wan 2.1 / 2.2
File renamed without changes.
File renamed without changes.
omni/assets/xinference_gradio.png

68.8 KB

omni/assets/xinference_launch.png

72.2 KB

omni/assets/xinference_sd.png

88 KB

omni/build.sh

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
set -x

export HTTP_PROXY=<your_http_proxy>
export HTTPS_PROXY=<your_https_proxy>

docker build -f ./docker/Dockerfile . -t intel/llm-scaler-omni:0.1-b1 --build-arg https_proxy=$HTTPS_PROXY --build-arg http_proxy=$HTTP_PROXY

visual-ai/ComfyUI/docker/Dockerfile renamed to omni/docker/Dockerfile

Lines changed: 26 additions & 9 deletions
@@ -8,33 +8,44 @@ ARG https_proxy
 ARG http_proxy
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib/python3.10/dist-packages/torch/lib:$LD_LIBRARY_PATH"
 
+COPY ./patches/yunchang_for_multi_arc.patch /tmp/
+COPY ./patches/xdit_for_multi_arc.patch /tmp/
+COPY ./patches/raylight_for_multi_arc.patch /tmp/
+
 # Add Intel oneAPI repo and PPA for GPU support
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
 echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
 add-apt-repository -y ppa:kobuk-team/intel-graphics-testing && \
 # Install dependencies and Python 3.10
 apt-get update -y && \
 apt-get install -y software-properties-common libgl1 && \
+apt-get install -y libxrender1 libxfixes3 libx11-dev libxi6 libxxf86vm1 libxcursor1 libxrandr2 libxinerama1 libxkbcommon0 libsm6 ffmpeg && \
 add-apt-repository ppa:deadsnakes/ppa && \
 apt-get update -y && \
 apt-get install -y python3.10 python3.10-distutils python3.10-dev && \
 curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
 update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
-pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu && \
-pip install intel-extension-for-pytorch==2.7.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
-pip install oneccl_bind_pt==2.7.0+xpu --index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
-pip install bigdl-core-xe-all==2.6.0 --extra-index-url https://download.pytorch.org/whl/xpu && \
+pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/xpu && \
+pip install oneccl_bind_pt==2.8.0+xpu --index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
+pip install bigdl-core-xe-all==2.6.0 --index-url https://download.pytorch.org/whl/xpu && \
 apt remove python3-blinker -y && \
-# Install xDit related dependencies
+wget https://download.blender.org/pypi/bpy/bpy-4.0.0-cp310-cp310-manylinux_2_28_x86_64.whl && \
+pip install bpy-4.0.0-cp310-cp310-manylinux_2_28_x86_64.whl && \
+rm bpy-4.0.0-cp310-cp310-manylinux_2_28_x86_64.whl && \
+# Install xDit related dependencies
 mkdir /llm && \
 cd /llm && \
 ln -s /usr/bin/python3 /usr/bin/python && \
-git clone https://github.com/analytics-zoo/long-context-attention.git -b xpu-main && \
+git clone https://github.com/feifeibear/long-context-attention.git && \
 cd long-context-attention && \
+git checkout fc5d55e61b78b3102fd824bea1791cf406cc2a4b && \
+git apply /tmp/yunchang_for_multi_arc.patch && \
 pip install -e . && \
 cd /llm && \
-git clone https://github.com/analytics-zoo/xDiT.git -b xpu-main && \
+git clone https://github.com/xdit-project/xDiT.git && \
 cd xDiT && \
+git checkout fb8fb0e437a8745b9629020759de31d1626a4a7b && \
+git apply /tmp/xdit_for_multi_arc.patch && \
 pip install -e . && \
 # Install ComfyUI
 cd /llm && \
@@ -47,13 +58,19 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 cd comfyui-videohelpersuite && \
 pip install -r requirements.txt && \
 cd .. && \
-git clone https://github.com/xiangyuT/raylight.git -b xpu_main && \
+git clone https://github.com/komikndr/raylight.git && \
 cd raylight && \
+git checkout 290c934cdd498b003fbf083e74e91ffc8edb961a && \
+git apply /tmp/raylight_for_multi_arc.patch && \
 pip install -r requirements.txt && \
 cd .. && \
 git clone https://github.com/yolain/ComfyUI-Easy-Use.git comfyui-easy-use && \
 cd comfyui-easy-use && \
-pip install -r requirements.txt
+pip install -r requirements.txt && \
+# Install Xinference
+pip install "xinference[transformers]" && \
+# Clean
+rm -rf /tmp/*
 
 COPY ./workflows/* /llm/ComfyUI/user/default/workflows/

omni/patches/raylight_for_multi_arc.patch

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
diff --git a/src/raylight/distributed_worker/ray_worker.py b/src/raylight/distributed_worker/ray_worker.py
index b3fcd2a..804fd6d 100644
--- a/src/raylight/distributed_worker/ray_worker.py
+++ b/src/raylight/distributed_worker/ray_worker.py
@@ -98,6 +98,13 @@ def usp_inject_callback(
)


+try:
+ import intel_extension_for_pytorch as ipex
+except:
+ pass
+
+import oneccl_bindings_for_pytorch
+
class RayWorker:
def __init__(self, local_rank, world_size, device_id, parallel_dict):
self.model = None
@@ -109,7 +116,7 @@ class RayWorker:

self.parallel_dict = parallel_dict
self.parallel_dict["is_fsdp_wrapped"] = False
- self.device = torch.device(f"cuda:{self.device_id}")
+ self.device = torch.device(f"xpu:{self.device_id}")

if self.model is not None:
self.is_model_load = True
@@ -117,9 +124,10 @@ class RayWorker:
self.is_model_load = False

if self.parallel_dict["is_xdit"] or self.parallel_dict["is_fsdp"]:
- os.environ["CUDA_VISIBLE_DEVICES"] = str(self.device_id)
+ #os.environ["CUDA_VISIBLE_DEVICES"] = str(self.device_id)
+ torch.xpu.set_device(local_rank)
dist.init_process_group(
- "nccl",
+ "ccl",
rank=local_rank,
world_size=self.world_size,
timeout=timedelta(minutes=1)
@@ -303,8 +311,8 @@ class RayWorker:
out["samples"] = samples

# Temporary for reducing change of OOM before VAE
- if ray.get_runtime_context().get_accelerator_ids()["GPU"][0] == "0":
- self.model.detach()
+ #if ray.get_runtime_context().get_accelerator_ids()["GPU"][0] == "0":
+ # self.model.detach()
self.model.detach()
comfy.model_management.soft_empty_cache()
gc.collect()
diff --git a/src/raylight/nodes.py b/src/raylight/nodes.py
index 7a552d8..cff7cb7 100644
--- a/src/raylight/nodes.py
+++ b/src/raylight/nodes.py
@@ -50,9 +50,9 @@ class RayInitializer:

# Currenty not implementing CFG parallel, since LoRa can enable non cfg run
world_size = GPU
- max_world_size = torch.cuda.device_count()
- if world_size > max_world_size:
- raise ValueError("To many gpus")
+ #max_world_size = torch.xpu.device_count()
+ #if world_size > max_world_size:
+ # raise ValueError("To many gpus")
if world_size == 0:
raise ValueError("Num of cuda/cudalike device is 0")
if world_size < ulysses_degree * ring_degree:
@@ -101,7 +101,7 @@ class RayInitializer:
gpu_actors = []
for local_rank in range(world_size):
gpu_actors.append(
- gpu_actor.options(num_gpus=1, name=f"RayWorker:{local_rank}").remote(
+ gpu_actor.options(name=f"RayWorker:{local_rank}").remote(
local_rank=local_rank,
world_size=world_size,
device_id=0,
