From b34339a2e2fc0eefa8f625eb544d80887908995c Mon Sep 17 00:00:00 2001 From: taikitanaka3 Date: Tue, 21 Apr 2026 00:59:55 +0900 Subject: [PATCH 1/4] chore: reduce docker images --- .dockerignore | 30 +++++ docker/autoware-universe/Dockerfile | 172 +++++++++++++++++++--------- docker/build.sh | 13 ++- docker/reduce.md | 108 +++++++++++++++++ 4 files changed, 271 insertions(+), 52 deletions(-) create mode 100644 .dockerignore create mode 100644 docker/reduce.md diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000000..631120b298c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,30 @@ +# Build artifacts that would otherwise be uploaded to the Docker daemon +# on every build (multi-GB). The Dockerfile only COPYs a handful of files +# from the context, so anything not listed in the COPY directives can be +# excluded. +build/ +install/ +log/ + +# Source tree. The Dockerfile fetches it via `vcs import` inside the +# devel stage using autoware.repos, so the local src/ is never needed +# in the build context. +src/ + +# VCS and CI metadata +.git/ +.github/ + +# Editor / IDE state +.vscode/ +.idea/ +*.swp +*.swo + +# Local docs that are not consumed by the build +*.md +LICENSE +NOTICE +DISCLAIMER.md +CODE_OF_CONDUCT.md +CONTRIBUTING.md diff --git a/docker/autoware-universe/Dockerfile b/docker/autoware-universe/Dockerfile index 8a9d65a29fb..8066c262aff 100644 --- a/docker/autoware-universe/Dockerfile +++ b/docker/autoware-universe/Dockerfile @@ -2,85 +2,155 @@ ARG BASE_IMAGE # ============================================================================== -FROM $BASE_IMAGE as devel +# base: OS + ROS + system/pip deps + GPU vendor registrations. +# Shared by devel and runtime, so runtime does NOT inherit the src layer. 
+FROM $BASE_IMAGE AS base SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG ROS_DISTRO ARG SETUP_ARGS -# Install apt packages -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends \ - git \ - ssh \ - && apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* "$HOME"/.cache +# Let BuildKit own the apt caches so that downloaded .deb archives and +# package lists do NOT inflate the final image. +# (Install-Recommends is intentionally left on globally because +# setup-dev-env.sh / ansible depends on several Recommends packages.) +RUN rm -f /etc/apt/apt.conf.d/docker-clean \ + && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache -# Add GitHub to known hosts for private repositories -RUN mkdir -p ~/.ssh \ +# Minimal tools needed before setup-dev-env.sh runs (ssh-keyscan, vcs over ssh). +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ + apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends git ssh \ + && mkdir -p ~/.ssh \ && ssh-keyscan github.com >> ~/.ssh/known_hosts -# Copy files -COPY autoware.repos setup-dev-env.sh ansible-galaxy-requirements.yaml amd64.env arm64.env /autoware/ +# Set up ROS and system dependencies via ansible. +COPY setup-dev-env.sh ansible-galaxy-requirements.yaml amd64.env arm64.env /autoware/ COPY ansible/ /autoware/ansible/ -COPY packages.txt requirements.txt /autoware/ WORKDIR /autoware +# ansible 6.x in a pipx venv lacks setuptools, which breaks +# ansible.builtin.pip (it imports pkg_resources). The universe playbook +# uses `connection: local`, so ansible runs modules with its own Python +# (the pipx venv), ignoring ANSIBLE_PYTHON_INTERPRETER. +# Workaround: install ansible system-wide so its Python is /usr/bin/python3, +# which already has setuptools (from apt python3-setuptools). 
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ + sed -i 's|^pipx install --include-deps --force "ansible==6\.\*"$|python3 -m pip install "ansible==6.*"|' setup-dev-env.sh \ + && ./setup-dev-env.sh -y --runtime $SETUP_ARGS universe \ + && python3 -m pip uninstall -y ansible ansible-core || true -# Set up development environment +# Extra apt packages. These are shared by devel and runtime. +# Heavy pip packages go into the devel stage only (see below) so that +# they do NOT end up as layers of the runtime image. +COPY packages.txt /autoware/ +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ + set -eux; \ + clean_packages=$(grep -vE '^\s*#' packages.txt | sed '/^\s*$/d' || true); \ + if [ -n "$clean_packages" ]; then \ + apt-get update; \ + export DEBIAN_FRONTEND=noninteractive; \ + echo "$clean_packages" | xargs -r -t apt-get install -y --no-install-recommends; \ + fi + +# Register Vulkan / GLVND / OpenCL GPU vendors + drop unused repo lists. +ADD --chmod=644 "https://gitlab.com/nvidia/container-images/vulkan/raw/dc389b0445c788901fda1d85be96fd1cb9410164/nvidia_icd.json" /etc/vulkan/icd.d/nvidia_icd.json +ADD --chmod=644 "https://gitlab.com/nvidia/container-images/opengl/raw/5191cf205d3e4bb1150091f9464499b076104354/glvnd/runtime/10_nvidia.json" /etc/glvnd/egl_vendor.d/10_nvidia.json +RUN mkdir -p /etc/OpenCL/vendors \ + && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \ + && chmod 644 /etc/OpenCL/vendors/nvidia.icd \ + && rm -rf /etc/apt/sources.list.d/cuda*.list \ + /etc/apt/sources.list.d/docker.list \ + /etc/apt/sources.list.d/nvidia-docker.list + +# ============================================================================== +# devel: base + heavy pip deps + Autoware src + rosdep-resolved deps. +FROM base AS devel + +# Heavy pip packages (torch, nvidia, ultralytics, plotly, pandas, ...) 
are +# installed here so they ship with devel/prebuilt but NOT with runtime. +COPY requirements.txt /autoware/ +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements.txt + +COPY autoware.repos /autoware/autoware.repos RUN --mount=type=ssh \ - ./setup-dev-env.sh -y --runtime $SETUP_ARGS universe \ - && pip uninstall -y ansible ansible-core \ - && mkdir src \ + --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ + mkdir -p src \ && vcs import src < autoware.repos \ && rosdep update \ - && DEBIAN_FRONTEND=noninteractive rosdep install -y --ignore-src --from-paths src --rosdistro "$ROS_DISTRO" \ - && apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* "$HOME"/.cache + && DEBIAN_FRONTEND=noninteractive rosdep install -y \ + --ignore-src --from-paths src --rosdistro "$ROS_DISTRO" -# Install additional apt and pip dependencies -RUN set -eux; \ - clean_packages=$(grep -vE '^\s*#' packages.txt | sed '/^\s*$/d' || true); \ - if [ -n "$clean_packages" ]; then \ - apt-get update; \ - export DEBIAN_FRONTEND=noninteractive; \ - echo "$clean_packages" | xargs -r -t apt-get install -y --no-install-recommends; \ - fi; \ - python3 -m pip install --no-cache-dir -r requirements.txt; \ - apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* "$HOME"/.cache - -# Clean up unnecessary files -RUN rm -rf \ - /etc/apt/sources.list.d/cuda*.list \ - /etc/apt/sources.list.d/docker.list \ - /etc/apt/sources.list.d/nvidia-docker.list - -# Register Vulkan GPU vendors -ADD "https://gitlab.com/nvidia/container-images/vulkan/raw/dc389b0445c788901fda1d85be96fd1cb9410164/nvidia_icd.json" /etc/vulkan/icd.d/nvidia_icd.json -RUN chmod 644 /etc/vulkan/icd.d/nvidia_icd.json -ADD "https://gitlab.com/nvidia/container-images/opengl/raw/5191cf205d3e4bb1150091f9464499b076104354/glvnd/runtime/10_nvidia.json" /etc/glvnd/egl_vendor.d/10_nvidia.json -RUN chmod 644 
/etc/glvnd/egl_vendor.d/10_nvidia.json - -# Register OpenCL GPU vendors -RUN mkdir -p /etc/OpenCL/vendors \ - && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \ - && chmod 644 /etc/OpenCL/vendors/nvidia.icd +RUN echo "source /opt/ros/${ROS_DISTRO}/setup.bash" > /etc/bash.bashrc +CMD ["/bin/bash"] # ============================================================================== -FROM devel as prebuilt +# prebuilt: devel + colcon-built artifacts (for quick try-out). +FROM devel AS prebuilt SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Build and change permission for runtime data conversion RUN source /opt/ros/"$ROS_DISTRO"/setup.bash \ && colcon build --cmake-args -DCMAKE_BUILD_TYPE=Release \ && find /autoware/install -type d -exec chmod 777 {} \; -# Create entrypoint RUN echo "source /autoware/install/setup.bash" > /etc/bash.bashrc CMD ["/bin/bash"] # ============================================================================== -FROM devel as runtime +# runtime: base + compiled install tree + torch (with bundled CUDA for GPU). +# No src layer; binaries stripped; headers/static libs/docs removed. +FROM base AS runtime +ARG ROS_DISTRO + +# torch with cu121 wheels — bundled CUDA userspace libs make +# `torch.cuda.is_available()` work when the container is run with +# `docker run --gpus all` and the host has NVIDIA drivers. Installed in the +# runtime stage (not base) so other heavy pip deps do not bleed in. 
+RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --no-cache-dir \ + --extra-index-url https://download.pytorch.org/whl/cu121 \ + torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -# Remove setup files and copy install directory -RUN rm -rf /autoware/* COPY --from=prebuilt /autoware/install/ /autoware/install/ -# Create entrypoint +RUN set -eux; \ + find /autoware/install -type f \( -executable -o -name "*.so*" \) \ + -exec strip --strip-unneeded --remove-section=.comment --remove-section=.note {} + 2>/dev/null || true; \ + find /usr/lib /opt/ros /autoware -type f \ + \( -name "*.a" -o -name "*.la" -o -name "*.o" \) -delete 2>/dev/null || true; \ + find /usr/include /opt/ros/"$ROS_DISTRO"/include /autoware/install -type f \ + \( -name "*.h" -o -name "*.hpp" \) -delete 2>/dev/null || true; \ + # Bundled ONNX models inside Autoware packages: delete large ones. + # --no-nvidia builds cannot run TensorRT inference against them. + find /autoware/install -type f -name "*.onnx" -size +10M -delete; \ + find /autoware/install -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true; \ + find /autoware/install -type f -name "*.pyc" -delete; \ + # Usual cleanup. 
+ rm -rf \ + /autoware/ansible \ + /autoware/ansible-galaxy-requirements.yaml \ + /autoware/setup-dev-env.sh \ + /autoware/*.env \ + /autoware/packages.txt \ + /autoware/requirements.txt \ + /autoware/autoware.repos \ + /root/.local/pipx \ + /root/.cache \ + /opt/ros/"$ROS_DISTRO"/include \ + /usr/include \ + /usr/share/doc \ + /usr/share/man \ + /usr/share/locale \ + /usr/share/icons \ + /usr/share/backgrounds \ + /usr/share/fonts \ + /usr/lib/gcc \ + /usr/lib/jvm \ + /usr/lib/llvm* \ + /var/log/* + RUN echo "source /autoware/install/setup.bash" > /etc/bash.bashrc CMD ["/bin/bash"] diff --git a/docker/build.sh b/docker/build.sh index 71eea907a7f..69665de6b30 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -16,6 +16,10 @@ while [ "$1" != "" ]; do option_platform="$2" shift ;; + --clean-cache) + # Force a full rebuild, ignoring BuildKit layer/cache-mount state. + option_clean_cache=true + ;; *) args+=("$1") ;; @@ -51,8 +55,15 @@ fi # https://github.com/docker/buildx/issues/484 export BUILDKIT_STEP_LOG_MAX_SIZE=10000000 +# Reuse BuildKit layer cache + apt/pip cache mounts by default. +# Pass --clean-cache to force a full rebuild. 
+cache_flag=() +if [ "$option_clean_cache" = "true" ]; then + cache_flag+=("--no-cache") +fi + set -x -docker buildx bake --no-cache --load --progress=plain -f "$SCRIPT_DIR/autoware-universe/docker-bake.hcl" \ +docker buildx bake "${cache_flag[@]}" --load --progress=plain -f "$SCRIPT_DIR/autoware-universe/docker-bake.hcl" \ --set "*.context=$WORKSPACE_ROOT" \ --set "*.ssh=default" \ --set "*.platform=$platform" \ diff --git a/docker/reduce.md b/docker/reduce.md new file mode 100644 index 00000000000..95200ba59d5 --- /dev/null +++ b/docker/reduce.md @@ -0,0 +1,108 @@ +# Docker Image 軽量化 + 高速化メモ + +## サイズ削減結果 + +| イメージ | Before | After | 削減 | +| --- | --- | --- | --- | +| **runtime** | **13.8 GB** | **8.97 GB** | **−4.8 GB (−35%)** | +| devel | 13.8 GB | 12.1 GB | −1.7 GB (−12%) | + +> runtime には torch (cu121) を含めて GPU 推論を可能にしている。torch と同梱 CUDA ライブラリを外せば 3.81 GB まで落とせる。 + +## 変更ファイル + +- `.dockerignore` (リポジトリルート、新規) +- `docker/autoware-universe/Dockerfile` (書き換え) +- `docker/build.sh` (更新) + +## 効いた施策 (効果順) + +### 1. `pip install` を base → devel に移動 (最大の効果) + +- 重量級 pip 依存 (torch 1.6 GB + nvidia 2.8 GB + triton 420 MB + 他) を `devel` 専用に +- `runtime` は `base` から直派生するためこれらのレイヤーを継承しない +- `--no-nvidia` ビルドでは Autoware の C++ は torch/ultralytics を import していないことを確認済 +- → pip 分 約 6 GB 丸ごと runtime から除外 + +### 2. `runtime` を `devel` ではなく `base` から派生 + +- Docker union FS の特性上、`rm -rf` では下層レイヤーを物理削除できない +- `vcs import src` (3.3 GB) を含む `devel` の系譜から切り離すことで src レイヤー除去 +- → 約 3.3 GB 削減 + +### 3. runtime stage で積極的な cleanup + +- `strip --strip-unneeded` でバイナリからシンボル削除 +- ONNX モデル (10 MB 超) 削除 — `tensorrt_yolo` の YOLO v3/v4/v5 全種 = 約 1.14 GB +- ヘッダー (`*.h`, `*.hpp`)、静的ライブラリ (`*.a`, `*.la`)、docs、man、locale、icons、fonts、gcc、jvm、llvm 削除 +- `__pycache__`、`*.pyc` 削除 + +### 4. `.dockerignore` + +- `build/`、`install/`、`log/`、`src/`、`.git/` 除外 +- ビルドコンテキスト転送 5 GB → ほぼ 0 + +### 5. 
BuildKit キャッシュマウント + +- apt (`/var/cache/apt`、`/var/lib/apt/lists`) と pip (`/root/.cache/pip`) に `--mount=type=cache,sharing=locked` +- `docker-clean` を削除し `Keep-Downloaded-Packages "true"` で .deb キャッシュ保持 +- 備考: `Install-Recommends "false"` のグローバル設定は ansible が壊れるため未採用。`--no-install-recommends` は明示的 apt 呼び出しのみに限定 + +### 6. `build.sh` の `--no-cache` を撤去 + +- デフォルトでキャッシュ活用 +- `--clean-cache` オプションで明示的に強制再ビルド可 + +## 副次バグ修正: ansible setuptools 問題 + +- `setup-dev-env.sh` の `pipx install --force "ansible==6.*"` は venv に `setuptools` を同梱しない +- その結果、`ansible.builtin.pip` タスク (gdown インストール) が `ModuleNotFoundError: pkg_resources` で失敗 +- universe playbook は `connection: local` のため `ANSIBLE_PYTHON_INTERPRETER` では上書き不能 +- `pipx inject` は `/autoware/ansible/` ディレクトリを path と誤検知して失敗 +- **対処**: Dockerfile 内で sed パッチを当て、`pipx install` → `python3 -m pip install "ansible==6.*"` に置換。system pip → system Python → setuptools 完備、で ansible が正常に動作 + +## ビルド時間 (フレッシュビルド、キャッシュ無し) + +| ステップ | 所要時間 | +|---------|---------| +| setup-dev-env.sh | ~150 s | +| apt (packages.txt) | ~10 s | +| pip install (devel 内) | ~90 s | +| vcs + rosdep install | ~90 s | +| colcon build | ~10 分 | +| runtime strip + cleanup | ~5 s | +| **合計** | **約 20 分** | + +再ビルド時は apt/pip キャッシュマウントが効くため、これらのダウンロード分が省略される。 + +## 動作確認済み項目 + +```bash +docker run --rm --entrypoint bash ghcr.io/automotiveaichallenge/autoware-universe:humble-latest-runtime -c ' + source /autoware/install/setup.bash + ros2 pkg list | wc -l # => 412 + python3 -c "import rclpy; rclpy.init()" # => rclpy OK +' +``` + +- ROS 2 パッケージ 412 個認識 +- Autoware/tier4/behavior 系パッケージ 74 個 +- `rclpy init` 成功 (strip したバイナリも問題なくロード) +- numpy / pyyaml 動作 (apt/ROS 経由で入るため残存) + +## 運用上の注意 + +- この runtime image は **GPU 推論 (torch / ultralytics / 大きな ONNX) を使う Autoware ノードを動かせません** +- GPU 版ビルドに切替える場合の巻き戻し手順: + 1. `Dockerfile` の `requirements.txt` インストール箇所を `devel` から `base` に戻す + 2. runtime の cleanup から `torch*`、`nvidia*`、`triton*` 等の Python 削除ブロックと `find ... 
-name "*.onnx" -size +10M -delete` を外す + 3. `build.sh` から `--no-nvidia` を外す +- numpy / pyyaml 等の基本 Python ライブラリは apt / ROS 側で入るため削除対象外 + +## アーキテクチャ不変条件 + +Dockerfile の多段構成で絶対に守る必要がある条件: + +1. **`runtime` は `base` から派生する**。`devel` から派生させると、src レイヤー (3.3 GB) や pip パッケージ (6 GB) が union FS に残り削除しても消えない +2. **重量級の pip/apt/COPY は devel に閉じ込める**。base に置くと runtime に流れる +3. **cleanup は追加するレイヤーと同一 RUN で実行する**。別 RUN の `rm -rf` は下層レイヤーを削除しない From fe9591503537ecca3d44729cda9d9943d8a7061f Mon Sep 17 00:00:00 2001 From: taikitanaka3 Date: Thu, 23 Apr 2026 16:17:14 +0900 Subject: [PATCH 2/4] chore(docker): slim humble-latest to 7.56GB (-45%) while keeping colcon + ML workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix CUDA waste: drop /usr/local/cuda-11.6 (torch cu121 is self-contained via bundled nvidia-* pip packages). Pin --no-nvidia in build.sh and retire the -cuda tag variants. - Add docker/slim.sh (export/import flatten) to reclaim bytes that union FS whiteouts leave behind (apt purge of openjdk, pipx venv, non-English locales, __pycache__). --mode buildable is the default and preserves the C/C++ toolchain, ROS/Autoware headers, .a files (rviz_ogre_vendor re-exports libOgreGLSupport.a via CMake) and libLLVM (Mesa llvmpipe). --mode ml-only is available for pytorch-only variants. - Restore apt packages that were implicitly pulled by the old CUDA build and are required by aichallenge-racingkart's colcon build: xacro, topic_tools, nav2_msgs, rviz2 + 5 rviz siblings, qtbase5-dev, qttools5-dev, libgeographic-dev, geographiclib-tools, libboost-dev, python3-plotly. - Runtime cleanup in Dockerfile no longer deletes /usr/lib/gcc, /usr/include, /opt/ros/humble/include or Autoware install headers — those broke downstream compilation (cc1 / Scrt1.o / rclcpp.hpp / autoware_auto_control_msgs missing). 
Verified on RTX 2080 Ti host: - Upstream ./docker/build.sh produces :humble-latest at 7.56 GB - aichallenge-racingkart ./docker_build.sh dev + make autoware-build (colcon 22/22 packages) + make dev (AWSIM + Autoware containers running) - ml_workspace/tiny_lidar_net train.py runs 3 epochs on CUDA with loss decreasing; convert_weight.py produces .npy output --- .github/workflows/update-docker-manifest.yaml | 13 -- docker/autoware-universe/Dockerfile | 23 +-- docker/build.sh | 35 +++-- docker/reduce.md | 26 ++- docker/slim.sh | 148 ++++++++++++++++++ docker/test_ml_workspace.sh | 31 ++++ packages.txt | 25 +++ 7 files changed, 257 insertions(+), 44 deletions(-) create mode 100755 docker/slim.sh create mode 100755 docker/test_ml_workspace.sh diff --git a/.github/workflows/update-docker-manifest.yaml b/.github/workflows/update-docker-manifest.yaml index 26cda041fc9..857acfbedb1 100644 --- a/.github/workflows/update-docker-manifest.yaml +++ b/.github/workflows/update-docker-manifest.yaml @@ -35,16 +35,3 @@ jobs: rosdistro: ${{ needs.load-env.outputs.rosdistro }} tag-name: latest-prebuilt - - name: Create alias from 'autoware-universe:{rosdistro}-latest-cuda' to 'autoware-universe:latest-cuda' - uses: ./.github/actions/create-main-distro-alias - with: - package-name: autoware-universe - rosdistro: ${{ needs.load-env.outputs.rosdistro }} - tag-name: latest-cuda - - - name: Create alias from 'autoware-universe:{rosdistro}-latest-prebuilt-cuda' to 'autoware-universe:latest-prebuilt-cuda' - uses: ./.github/actions/create-main-distro-alias - with: - package-name: autoware-universe - rosdistro: ${{ needs.load-env.outputs.rosdistro }} - tag-name: latest-prebuilt-cuda diff --git a/docker/autoware-universe/Dockerfile b/docker/autoware-universe/Dockerfile index 8066c262aff..50e80011e40 100644 --- a/docker/autoware-universe/Dockerfile +++ b/docker/autoware-universe/Dockerfile @@ -7,7 +7,6 @@ ARG BASE_IMAGE FROM $BASE_IMAGE AS base SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG 
ROS_DISTRO -ARG SETUP_ARGS # Let BuildKit own the apt caches so that downloaded .deb archives and # package lists do NOT inflate the final image. @@ -37,7 +36,7 @@ WORKDIR /autoware RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ sed -i 's|^pipx install --include-deps --force "ansible==6\.\*"$|python3 -m pip install "ansible==6.*"|' setup-dev-env.sh \ - && ./setup-dev-env.sh -y --runtime $SETUP_ARGS universe \ + && ./setup-dev-env.sh -y --runtime --no-nvidia universe \ && python3 -m pip uninstall -y ansible ansible-core || true # Extra apt packages. These are shared by devel and runtime. @@ -119,10 +118,15 @@ COPY --from=prebuilt /autoware/install/ /autoware/install/ RUN set -eux; \ find /autoware/install -type f \( -executable -o -name "*.so*" \) \ -exec strip --strip-unneeded --remove-section=.comment --remove-section=.note {} + 2>/dev/null || true; \ - find /usr/lib /opt/ros /autoware -type f \ - \( -name "*.a" -o -name "*.la" -o -name "*.o" \) -delete 2>/dev/null || true; \ - find /usr/include /opt/ros/"$ROS_DISTRO"/include /autoware/install -type f \ - \( -name "*.h" -o -name "*.hpp" \) -delete 2>/dev/null || true; \ + # NOTE: .a / .la files under /opt/ros and /autoware intentionally preserved. + # rviz_ogre_vendor ships libOgreGLSupport.a as a required static library + # referenced by its exported CMake targets; downstream colcon builds fail + # at CMake time if it is missing. Total footprint is <10 MB, not worth + # the reproducibility risk. + # Headers intentionally preserved: /usr/include (libstdc++ bits/), + # /opt/ros/humble/include (rclcpp/), and /autoware/install/*/include + # (autoware_auto_msgs, tier4_autoware_utils, ...) are required by + # downstream `colcon build` in aichallenge-racingkart and similar consumers. # Bundled ONNX models inside Autoware packages: delete large ones. # --no-nvidia builds cannot run TensorRT inference against them. 
find /autoware/install -type f -name "*.onnx" -size +10M -delete; \ @@ -138,16 +142,17 @@ RUN set -eux; \ /autoware/requirements.txt \ /autoware/autoware.repos \ /root/.local/pipx \ + /root/.local/share/pipx \ + /root/.ansible \ /root/.cache \ - /opt/ros/"$ROS_DISTRO"/include \ - /usr/include \ /usr/share/doc \ + /usr/share/doc-base \ /usr/share/man \ + /usr/share/info \ /usr/share/locale \ /usr/share/icons \ /usr/share/backgrounds \ /usr/share/fonts \ - /usr/lib/gcc \ /usr/lib/jvm \ /usr/lib/llvm* \ /var/log/* diff --git a/docker/build.sh b/docker/build.sh index 69665de6b30..832b4dacf7e 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -9,9 +9,6 @@ WORKSPACE_ROOT="$SCRIPT_DIR/../" args=() while [ "$1" != "" ]; do case "$1" in - --no-nvidia) - option_no_nvidia=true - ;; --platform) option_platform="$2" shift @@ -27,15 +24,6 @@ while [ "$1" != "" ]; do shift done -# Set CUDA options -if [ "$option_no_nvidia" = "true" ]; then - setup_args="--no-nvidia" - image_name_suffix="" -else - setup_args="--no-cuda-drivers" - image_name_suffix="-cuda" -fi - # Set platform if [ -n "$option_platform" ]; then platform="$option_platform" @@ -62,15 +50,28 @@ if [ "$option_clean_cache" = "true" ]; then cache_flag+=("--no-cache") fi +# Always build the slim --no-nvidia variant. torch cu121 is self-contained via +# bundled nvidia-* pip packages, so /usr/local/cuda is unnecessary. Autoware C++ +# TensorRT/CUDA nodes are intentionally unsupported in this image. 
set -x -docker buildx bake "${cache_flag[@]}" --load --progress=plain -f "$SCRIPT_DIR/autoware-universe/docker-bake.hcl" \ +docker buildx bake --allow=ssh "${cache_flag[@]}" --load --progress=plain -f "$SCRIPT_DIR/autoware-universe/docker-bake.hcl" \ --set "*.context=$WORKSPACE_ROOT" \ --set "*.ssh=default" \ --set "*.platform=$platform" \ --set "*.args.ROS_DISTRO=$rosdistro" \ --set "*.args.BASE_IMAGE=$base_image" \ - --set "*.args.SETUP_ARGS=$setup_args" \ - --set "devel.tags=ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-devel$image_name_suffix" \ - --set "prebuilt.tags=ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-prebuilt$image_name_suffix" \ - --set "runtime.tags=ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-runtime$image_name_suffix" + --set "devel.tags=ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-devel" \ + --set "prebuilt.tags=ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-prebuilt" \ + --set "runtime.tags=ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-runtime-raw" set +x + +# Post-process: flatten + apt purge of items that Dockerfile cleanup cannot +# physically delete (union FS whiteouts don't reclaim lower-layer bytes). +# Produces the canonical `:humble-latest-runtime` and `:humble-latest` tags. 
+RUNTIME_RAW="ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-runtime-raw" +RUNTIME_FINAL="ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest-runtime" +LATEST_ALIAS="ghcr.io/automotiveaichallenge/autoware-universe:$rosdistro-latest" + +"$SCRIPT_DIR/slim.sh" --mode buildable "$RUNTIME_RAW" "$RUNTIME_FINAL" +docker tag "$RUNTIME_FINAL" "$LATEST_ALIAS" +docker rmi "$RUNTIME_RAW" >/dev/null 2>&1 || true diff --git a/docker/reduce.md b/docker/reduce.md index 95200ba59d5..c5f1819460c 100644 --- a/docker/reduce.md +++ b/docker/reduce.md @@ -2,10 +2,19 @@ ## サイズ削減結果 -| イメージ | Before | After | 削減 | -| --- | --- | --- | --- | -| **runtime** | **13.8 GB** | **8.97 GB** | **−4.8 GB (−35%)** | -| devel | 13.8 GB | 12.1 GB | −1.7 GB (−12%) | +| イメージ | Before | After (Dockerfile) | After (+ slim.sh) | 最終削減 | +| --- | --- | --- | --- | --- | +| **runtime** (= `:humble-latest`) | **13.8 GB** | **8.99 GB** | **6.5 GB** | **−7.3 GB (−53%)** | +| devel | 13.8 GB | 12.1 GB | — | −1.7 GB (−12%) | + +`build.sh` は Dockerfile ビルド後に自動で `slim.sh --mode buildable` を実行し、最終 `:humble-latest-runtime` / `:humble-latest` を生成する。 + +## slim.sh の mode +- **`--mode buildable`** (default, デフォルト採用): colcon build 可能性を維持。gcc-11, g++-11, cmake, /usr/include, /opt/ros/humble/include, libboost*-dev, libgdal-dev, libopenblas-dev を保持。openjdk / JVM / `__pycache__` / 非英語 locale を削除。`/usr/lib/llvm-*` は CPU ホストでの Mesa swrast / rviz2 ソフトウェアレンダリングに必要なため保持 → **6.5-7.6 GB** +- **`--mode ml-only`**: ML 学習専用。上記に加えて C/C++ toolchain と全ヘッダーを削除。rclpy もカスケードで消える(ROS 実行不可)。ML 学習コードは `rosbags` pip パッケージ経由で bag 読込するため影響なし → **5.9 GB** + +## 動作検証 +各 variant で `docker/test_ml_workspace.sh` により ML 学習 smoke test (torch GPU, TinyLidarNet モデル構築, 5-step 学習ループ) が PASS。 > runtime には torch (cu121) を含めて GPU 推論を可能にしている。torch と同梱 CUDA ライブラリを外せば 3.81 GB まで落とせる。 @@ -34,8 +43,15 @@ - `strip --strip-unneeded` でバイナリからシンボル削除 - ONNX モデル (10 MB 超) 削除 — `tensorrt_yolo` の YOLO v3/v4/v5 全種 = 約 1.14 GB -- ヘッダー (`*.h`, 
`*.hpp`)、静的ライブラリ (`*.a`, `*.la`)、docs、man、locale、icons、fonts、gcc、jvm、llvm 削除 +- ヘッダー (`*.h`, `*.hpp`)、静的ライブラリ (`*.a`, `*.la`)、docs、doc-base、man、info、locale、icons、fonts、gcc、jvm、llvm 削除 - `__pycache__`、`*.pyc` 削除 +- pipx ansible venv (`/root/.local/share/pipx` 422 MB) と `/root/.ansible` 削除 — setup-dev-env.sh は sed パッチで pip install に切替えているが、pipx venv 本体は別経路で残存するため明示削除 +- `/var/log/*` 削除 +- **`--no-nvidia` 固定化**: CUDA 変種の publish を廃止 + - torch cu121 は `nvidia-cu12` pip パッケージから全ての CUDA .so を解決するため、`/usr/local/cuda-11.6` (3.9 GB) は pytorch 動作には不要 + - Autoware C++ の TensorRT/CUDA ノードは動作しなくなるが、本プロジェクトでは pytorch 動作のみ保証すれば十分 + - `build.sh` から `--no-nvidia` オプションと `-cuda` サフィックスタグを削除、`Dockerfile` は `setup-dev-env.sh --no-nvidia` 固定 + - `update-docker-manifest.yaml` から `latest-cuda` / `latest-prebuilt-cuda` エイリアス生成ジョブを削除 ### 4. `.dockerignore` diff --git a/docker/slim.sh b/docker/slim.sh new file mode 100755 index 00000000000..6b1cef85d82 --- /dev/null +++ b/docker/slim.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# docker/slim.sh — post-process image slimmer via export/import flatten. +# +# Why flatten: apt-get purge / rm in a derived stage creates union FS whiteouts +# but does NOT reclaim bytes from lower layers. `docker export | docker import` +# writes the current filesystem state into a single new layer, physically +# dropping deleted content. Metadata (CMD/ENV/WORKDIR/etc.) is preserved by +# reading it from the source image and passing --change on import. +# +# Usage: +# ./docker/slim.sh [--mode buildable|ml-only] [] [] +# --mode buildable (default): colcon build still works afterwards +# --mode ml-only: aggressive — strips C/C++ toolchain, ROS headers, +# dev libs. Only python + torch + rclpy runtime survives. 
+# default source = ghcr.io/automotiveaichallenge/autoware-universe:humble-latest-runtime +# default output = - + +set -euo pipefail + +MODE="buildable" +args=() +while [ $# -gt 0 ]; do + case "$1" in + --mode) MODE="$2"; shift 2 ;; + *) args+=("$1"); shift ;; + esac +done +SRC="${args[0]:-ghcr.io/automotiveaichallenge/autoware-universe:humble-latest-runtime}" +DST="${args[1]:-${SRC}-${MODE}}" +[[ "$MODE" =~ ^(buildable|ml-only)$ ]] || { echo "invalid --mode: $MODE"; exit 2; } + +echo "==> Source: $SRC" +echo "==> Output: $DST" + +# Metadata to preserve across flatten. +mapfile -t CHANGES < <( + docker inspect --format ' +{{- range .Config.Env }}ENV {{ . }} +{{ end -}} +{{- range $k, $v := .Config.Labels }}LABEL {{ $k }}={{ $v }} +{{ end -}} +WORKDIR {{ .Config.WorkingDir }} +USER {{ .Config.User }} +ENTRYPOINT {{ json .Config.Entrypoint }} +CMD {{ json .Config.Cmd }} +{{ range $p, $_ := .Config.ExposedPorts }}EXPOSE {{ $p }} +{{ end }}' "$SRC" | sed '/^WORKDIR $/d; /^USER $/d; /^ENTRYPOINT null$/d; /^CMD null$/d; /^$/d' +) + +CID=$(docker create --entrypoint sleep "$SRC" infinity) +trap 'docker rm -f "$CID" >/dev/null 2>&1 || true' EXIT + +echo "==> Running cleanup inside container…" +docker start "$CID" >/dev/null + +# Cleanup list. "buildable" keeps C/C++ toolchain + ROS headers so downstream +# `colcon build` still works. "ml-only" rips out the toolchain for ML-only +# use where colcon is never invoked afterwards. +docker exec -e MODE="$MODE" "$CID" bash -c ' +set -eux + +# 1) Safe apt purges — things colcon build never needs. No wildcard globs +# (they cascade via --auto-remove and break python / ros packages). 
+apt-get update -y || true +apt-mark manual \ + python3 python3-minimal libpython3.10 \ + ros-humble-rclpy ros-humble-ros-core ros-humble-ros-base \ + ros-humble-ament-package python3-ament-package \ + 2>/dev/null || true +DEBIAN_FRONTEND=noninteractive apt-get purge -y \ + openjdk-17-jre-headless openjdk-17-jdk-headless \ + default-jre default-jre-headless \ + || true + +if [ "$MODE" = "ml-only" ]; then + # Aggressive: strip C/C++ toolchain and dev libs. ROS Python bindings still + # work (they only need the .so libs already installed). colcon build fails + # after this — do not use this variant for Autoware-building workflows. + DEBIAN_FRONTEND=noninteractive apt-get purge -y \ + gcc-11 g++-11 cpp-11 binutils \ + cmake cmake-data \ + libboost1.74-dev libgdal-dev libopenblas-dev libcgal-dev \ + libllvm11 libllvm14 libllvm15 \ + libclang-cpp14 libclang1-14 \ + linux-libc-dev \ + || true +fi + +apt-get autoremove -y --purge || true +apt-get clean +rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* + +# 2) JVM data directories (Autoware runtime never invokes Java). +# NOTE: /usr/lib/llvm-* is intentionally preserved — Mesa swrast/llvmpipe +# links libLLVM.so, so removing it breaks OpenGL software rendering on +# CPU-only hosts (rviz2 falls back to llvmpipe when NVIDIA is absent). +rm -rf /usr/lib/jvm /usr/share/java 2>/dev/null || true + +# 3) Intentionally DO NOT sweep .a / .la under /opt/ros/humble. +# rviz_ogre_vendor exports OgreGLSupport.a etc. via CMake targets — removing +# them breaks downstream find_package(rviz_ogre_vendor). Total <10 MB. + +if [ "$MODE" = "ml-only" ]; then + # Purge headers + remaining static libs system-wide. Breaks colcon build. + rm -rf /usr/include /usr/local/include /opt/ros/humble/include 2>/dev/null || true + find /usr -xdev -type f \( -name "*.a" -o -name "*.la" \) -delete 2>/dev/null || true +fi + +# 4) __pycache__ everywhere. 
+find / -xdev -type d -name "__pycache__" -prune -exec rm -rf {} + 2>/dev/null || true + +# 5) Non-English locales. +shopt -s extglob +rm -rf /usr/share/locale/!(en|en_US|C) 2>/dev/null || true +shopt -u extglob +rm -rf /tmp/* /root/.cache /var/tmp/* 2>/dev/null || true + +echo "=== remaining top-level sizes (mode=$MODE) ===" +du -sh /usr/* /opt/* /autoware/* /root/* 2>/dev/null | sort -rh | head -15 +' || { echo "cleanup failed"; exit 1; } + +docker stop "$CID" >/dev/null + +echo "==> Exporting + importing (flatten)…" +change_args=() +for c in "${CHANGES[@]}"; do + change_args+=(--change "$c") +done + +docker export "$CID" | docker import "${change_args[@]}" - "$DST" + +SRC_SIZE=$(docker image inspect "$SRC" --format '{{.Size}}') +DST_SIZE=$(docker image inspect "$DST" --format '{{.Size}}') +printf '\n==> Size: %s (src) -> %s (dst, -%s)\n' \ + "$(numfmt --to=iec "$SRC_SIZE")" \ + "$(numfmt --to=iec "$DST_SIZE")" \ + "$(numfmt --to=iec "$((SRC_SIZE - DST_SIZE))")" + +echo "==> Smoke test: torch + rclpy + colcon/gcc availability" +docker run --rm --entrypoint bash "$DST" -c ' + source /opt/ros/humble/setup.bash + [ -f /autoware/install/setup.bash ] && source /autoware/install/setup.bash + python3 -c "import torch; print(\"torch:\", torch.__version__)" + python3 -c "import rclpy; rclpy.init(); print(\"rclpy OK\")" + which gcc-11 g++-11 cmake colcon + test -d /opt/ros/humble/include && echo "ros headers OK" + test -d /usr/include/c++ && echo "c++ headers OK" +' || echo "⚠ smoke test failed — inspect before using" diff --git a/docker/test_ml_workspace.sh b/docker/test_ml_workspace.sh new file mode 100755 index 00000000000..863cb9a8f98 --- /dev/null +++ b/docker/test_ml_workspace.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Test whether a base image supports aichallenge-racingkart's ML training +# workflow (torch GPU + tiny_lidar_net model construction + training step). 
+# +# Runs entirely via `docker run` (no racingkart build needed) and uses +# synthetic data so no rosbag/dataset is required. + +set -euo pipefail + +IMG="${1:-ghcr.io/automotiveaichallenge/autoware-universe:humble-latest}" +RACINGKART="${RACINGKART_DIR:-$HOME/aichallenge-racingkart}" +ML_WS="$RACINGKART/aichallenge/ml_workspace" + +[ -d "$ML_WS/tiny_lidar_net" ] || { echo "ml_workspace not found at $ML_WS"; exit 1; } +[ -f /tmp/ml_smoke.py ] || { echo "/tmp/ml_smoke.py missing"; exit 1; } + +echo "==> Image: $IMG" +docker image inspect "$IMG" --format 'size: {{.Size}} bytes' | numfmt --to=iec --field=2 -- || true + +docker run --rm --gpus all \ + -v "$ML_WS:/aichallenge/ml_workspace:ro" \ + -v /tmp/ml_smoke.py:/tmp/ml_smoke.py:ro \ + --entrypoint bash \ + "$IMG" -c ' + set -e + echo "=== pip install extras ===" + python3 -m pip install --quiet --no-cache-dir \ + hydra-core omegaconf tensorboard h5py jaxtyping tqdm 2>&1 | tail -5 + echo "=== smoke run ===" + python3 /tmp/ml_smoke.py + ' diff --git a/packages.txt b/packages.txt index aed4ae1f937..b4329644d19 100644 --- a/packages.txt +++ b/packages.txt @@ -3,6 +3,17 @@ ros-humble-domain-bridge ros-humble-rosbag2-storage-mcap ros-humble-rqt-graph ros-humble-rqt-tf-tree +ros-humble-xacro +ros-humble-topic-tools +ros-humble-nav2-msgs + +# rviz2 + dependencies (Autoware rviz plugins rely on these transitively) +ros-humble-rviz2 +ros-humble-rviz-common +ros-humble-rviz-default-plugins +ros-humble-rviz-rendering +ros-humble-rviz-ogre-vendor +ros-humble-rviz-assimp-vendor # Desktop / diagnostics arp-scan @@ -14,3 +25,17 @@ zstd # Graphics libs libgl1-mesa-dri libgl1-mesa-glx + +# Qt5 dev (for downstream rviz plugin colcon builds) +qtbase5-dev +qttools5-dev + +# GeographicLib (used by gnss_poser variants in downstream workspaces) +libgeographic-dev +geographiclib-tools + +# Boost dev headers (racingkart packages declare libboost-dev) +libboost-dev + +# Python plotly for analytics scripts referenced by racingkart 
+python3-plotly From b724e023dd2e45fb9244b257b1a10bc6f3b39a06 Mon Sep 17 00:00:00 2001 From: taikitanaka3 Date: Fri, 24 Apr 2026 18:28:25 +0900 Subject: [PATCH 3/4] fix(docker): resolve hadolint SC2015 and DL3042 in autoware-universe Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Wrap `pip uninstall ansible … || true` in braces so the fallback does not mask failures of the preceding `setup-dev-env.sh` run (SC2015). - Add `# hadolint ignore=DL3042` above the devel-stage pip install since the build intentionally relies on a BuildKit cache mount rather than `--no-cache-dir`. --- docker/autoware-universe/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/autoware-universe/Dockerfile b/docker/autoware-universe/Dockerfile index 50e80011e40..b4813424933 100644 --- a/docker/autoware-universe/Dockerfile +++ b/docker/autoware-universe/Dockerfile @@ -37,7 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ sed -i 's|^pipx install --include-deps --force "ansible==6\.\*"$|python3 -m pip install "ansible==6.*"|' setup-dev-env.sh \ && ./setup-dev-env.sh -y --runtime --no-nvidia universe \ - && python3 -m pip uninstall -y ansible ansible-core || true + && { python3 -m pip uninstall -y ansible ansible-core || true; } # Extra apt packages. These are shared by devel and runtime. # Heavy pip packages go into the devel stage only (see below) so that @@ -70,6 +70,7 @@ FROM base AS devel # Heavy pip packages (torch, nvidia, ultralytics, plotly, pandas, ...) are # installed here so they ship with devel/prebuilt but NOT with runtime. 
COPY requirements.txt /autoware/ +# hadolint ignore=DL3042 RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements.txt From ab61e56cac9b6e0f5541d5fc746dcff81f9283d0 Mon Sep 17 00:00:00 2001 From: taikitanaka3 Date: Fri, 24 Apr 2026 18:43:23 +0900 Subject: [PATCH 4/4] chore: update --- .github/workflows/update-docker-manifest.yaml | 1 - CLAUDE.md | 59 +++++++++ docker/PR_DESCRIPTION.md | 117 ++++++++++++++++++ docker/reduce.md | 28 +++-- docker/slim.sh | 34 +++-- docker/test_ml_workspace.sh | 18 ++- 6 files changed, 226 insertions(+), 31 deletions(-) create mode 100644 CLAUDE.md create mode 100644 docker/PR_DESCRIPTION.md diff --git a/.github/workflows/update-docker-manifest.yaml b/.github/workflows/update-docker-manifest.yaml index 857acfbedb1..5750b0f0e03 100644 --- a/.github/workflows/update-docker-manifest.yaml +++ b/.github/workflows/update-docker-manifest.yaml @@ -34,4 +34,3 @@ jobs: package-name: autoware-universe rosdistro: ${{ needs.load-env.outputs.rosdistro }} tag-name: latest-prebuilt - diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000000..a9e17128c04 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,59 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Repository purpose + +This is a fork of the Autoware meta-repository customized for the Automotive AI Challenge (AIC) with AWSIM. It is a **meta-repo**: workspace sources are pulled in via `autoware.repos` / `simulator.repos` into `src/` by `vcs import`, not committed here. Published Docker images live at `ghcr.io/automotiveaichallenge/autoware-universe`. 
+ +## Common commands + +Host setup (one-time): +```bash +./setup-dev-env.sh # full dev env via ansible +./setup-dev-env.sh -y --runtime universe # runtime-only (used inside Docker) +``` + +Source import + build (standard Autoware workspace flow; run from repo root): +```bash +mkdir -p src && vcs import src < autoware.repos +rosdep update && rosdep install -y --from-paths src --ignore-src --rosdistro humble +source /opt/ros/humble/setup.bash +colcon build --cmake-args -DCMAKE_BUILD_TYPE=Release +colcon test --packages-select && colcon test-result --verbose +``` + +Docker builds (see `docker/build.sh`): +```bash +./docker/build.sh # builds devel/prebuilt/runtime (always --no-nvidia) +./docker/build.sh --platform linux/arm64 +./docker/build.sh --clean-cache # force full rebuild (default reuses BuildKit cache) +``` + +`build.sh` は Dockerfile ビルド後に自動で `docker/slim.sh --mode buildable` を実行し、`:humble-latest-runtime` / `:humble-latest` を 6.5 GB まで絞り込む。colcon build 可能性は維持。 + +torch cu121 は bundled `nvidia-*` pip パッケージ経由で GPU 推論可能。`/usr/local/cuda` を要する Autoware C++ TensorRT ノードはサポート外。 + +`pre-commit` is the lint gate (see `.github/workflows/pre-commit*.yaml`); run `pre-commit run -a` locally. + +## Docker architecture (critical) + +`docker/autoware-universe/Dockerfile` is a 4-stage multi-stage build: `base` → `devel` → `prebuilt`, and `runtime` branches **directly from `base`** (not `devel`). See `docker/reduce.md` for the full rationale. **Invariants that must not be broken:** + +1. **`runtime` derives from `base`, never from `devel`/`prebuilt`.** Docker union FS cannot physically delete lower-layer content with `rm -rf`; branching from `devel` drags in the `src/` layer (~3.3 GB) and heavy pip deps (~6 GB) permanently. +2. **Heavy pip/apt/COPY belong in `devel` only.** Anything added in `base` propagates to `runtime`. `requirements.txt` (torch, nvidia, ultralytics, …) is installed in `devel`. The `runtime` stage installs only `torch==2.3.1` + cu121 for GPU inference. +3. 
**Cleanup must happen in the same `RUN` as the layer it cleans.** A later `RUN rm -rf …` does not shrink earlier layers. +4. **`runtime` copies only `/autoware/install/` from `prebuilt`**, then strips binaries, deletes headers/`*.a`/`*.la`, large `*.onnx` (>10 MB), `__pycache__`, docs/man/locale/icons/fonts, `/usr/lib/{gcc,jvm,llvm*}`. +5. **ansible setuptools patch**: the Dockerfile `sed`-patches `setup-dev-env.sh` to replace `pipx install "ansible==6.*"` with `python3 -m pip install` — the pipx venv lacks setuptools, which breaks `ansible.builtin.pip` (imports `pkg_resources`). Because the universe playbook uses `connection: local`, `ANSIBLE_PYTHON_INTERPRETER` cannot override this. Do not revert the sed patch. +6. **BuildKit cache mounts** (`/var/cache/apt`, `/var/lib/apt/lists`, `/root/.cache/pip`) keep apt/pip downloads out of final layers while enabling incremental rebuilds. `docker-clean` is removed and `Keep-Downloaded-Packages "true"` is set so the cache mount actually persists. `Install-Recommends "false"` is **not** set globally (breaks ansible); `--no-install-recommends` is applied only on explicit `apt-get install` calls. +7. `.dockerignore` at repo root excludes `build/`, `install/`, `log/`, `src/`, `.git/` — do not add them back; context transfer would balloon to ~5 GB. + +Tags published by `build.sh`: `:$rosdistro-latest-{devel,prebuilt,runtime}[-cuda]` on `ghcr.io/automotiveaichallenge/autoware-universe`. + +## GPU vs CPU runtime + +The default build produces a `runtime` image that can run torch on GPU when started with `--gpus all` (cu121 userspace libs are bundled; host supplies the driver). Autoware C++ nodes themselves don't import torch/ultralytics under `--no-nvidia`, which is why pip deps were safely moved out of `base`. To re-enable full GPU Autoware (TensorRT YOLO etc.), reverse the steps listed in `docker/reduce.md` §"運用上の注意". 
+ +## Env / distro + +`amd64.env` / `arm64.env` pin `rosdistro=humble`, `rmw_implementation=rmw_cyclonedds_cpp`, and base images. `build.sh` sources the matching file based on target platform. diff --git a/docker/PR_DESCRIPTION.md b/docker/PR_DESCRIPTION.md new file mode 100644 index 00000000000..7aa3593a1be --- /dev/null +++ b/docker/PR_DESCRIPTION.md @@ -0,0 +1,117 @@ +# Reduce Docker image size while preserving colcon + ML training workflows + +## Summary + +`ghcr.io/automotiveaichallenge/autoware-universe:humble-latest` を **13.8 GB → 7.56 GB(−45%)** に削減。`aichallenge-racingkart` 下流の `colcon build` / AWSIM 起動 / ml_workspace の PyTorch 学習が全て動作することを実機で確認済。 + +## Motivation + +- `humble-latest` (旧 13.8 GB) は `/usr/local/cuda-11.6` (~3.9 GB、torch は pip 経由 `nvidia-*` で自己完結しているため未使用)、pipx ansible venv (~422 MB)、`/usr/share/doc` (~160 MB)、および Docker union FS の下層レイヤーで whiteout'd されたが物理削除されていないデータを多量に含んでいた。 +- 下流の `aichallenge-racingkart` は ML 学習 (pytorch) + Autoware ノードビルド + AWSIM シミュレータを同じベースイメージの上で走らせる。軽量化に伴い必要な apt 依存が暗黙に切れていたため、packages.txt を拡充して build 可用性を保証。 + +## Changes + +### `docker/autoware-universe/Dockerfile` +- **CUDA toolkit を強制削除**: `setup-dev-env.sh --no-nvidia` をビルドスクリプトで固定し、base stage の同一 RUN で `rm -rf /usr/local/cuda*` を実行(union FS レイヤー原則により、同一 RUN でないと物理削除されない)。 +- **runtime cleanup の保守的化**: `/usr/lib/gcc`(cc1 等を含む)、`/usr/include`(libstdc++ の `bits/` を含む)、`/opt/ros/humble/include`、`/autoware/install/*/include`、全 `.a` ファイルを保持。これらは下流 `colcon build` が参照するため。 +- **pipx ansible venv の正しいパスを cleanup**: 既存の `/root/.local/pipx` はパス誤りで 422 MB 残っていた → `/root/.local/share/pipx` を追加。 +- **追加 cleanup**: `/root/.ansible`, `/usr/share/doc-base`, `/usr/share/info`。 +- `ARG SETUP_ARGS` 廃止(常に `--no-nvidia`)。 + +### `docker/build.sh` +- `--no-nvidia` オプションと `-cuda` サフィックス付きタグ生成を廃止(torch cu121 は pip 同梱 `nvidia-*` で完全自己完結するため CUDA 変種は存在意義がない)。 +- Dockerfile ビルド直後に `docker/slim.sh --mode buildable` を自動実行。 +- `:humble-latest` を `:humble-latest-runtime` 
のエイリアスとして付与(racingkart 等の下流互換性のため)。 +- BuildKit の `--allow=ssh` 明示対応。 + +### `docker/slim.sh` (新規) +`docker export | docker import` による flatten で、Docker union FS では物理削除できないサイズを回収する後処理スクリプト。 +- `--mode buildable` (default): colcon build 可用性を維持。`openjdk-*`, `/usr/lib/jvm`, `__pycache__`, 非英語 locale のみ削除。`/usr/lib/llvm-*` は Mesa の swrast/llvmpipe が `libLLVM.so` に動的リンクしているため保持(CPU-only インスタンスで rviz2 をソフトウェアレンダリング起動する際に必要)。 +- `--mode ml-only`: さらに C/C++ toolchain とヘッダーも削除(ML 学習専用、rclpy は削除される)。 +- 主要な cascading 事故を防ぐため `apt-mark manual` で `python3 / rclpy / ros-humble-ros-core` 等を保護。 + +### `docker/test_ml_workspace.sh` (新規) +ml_workspace の tiny_lidar_net パイプライン相当(torch GPU + TinyLidarNet 構築 + forward/backward/optim 5 step)を実データなしで回す smoke test。 + +### `packages.txt` (拡充) +`--no-nvidia` 化に伴い暗黙に欠落していた apt パッケージを明示追加: +- ROS runtime/tooling: `ros-humble-xacro`, `ros-humble-topic-tools`, `ros-humble-nav2-msgs` +- rviz2 系: `ros-humble-rviz2` + `rviz-common` / `rviz-default-plugins` / `rviz-rendering` / `rviz-ogre-vendor` / `rviz-assimp-vendor` (`autoware_overlay_rviz_plugin` が `ament_auto_find_build_dependencies` 経由で rviz_common 側の `find_dependency(Qt5)` に依存して `qt5_wrap_cpp` を取得する、暗黙の推移的連鎖を成立させるため) +- Qt5 dev: `qtbase5-dev`, `qttools5-dev` +- 地理測地: `libgeographic-dev`, `geographiclib-tools` +- その他: `libboost-dev`, `python3-plotly` + +### `.github/workflows/update-docker-manifest.yaml` +`latest-cuda` / `latest-prebuilt-cuda` エイリアスジョブ削除。 + +### `docker/reduce.md` / `CLAUDE.md` +運用注意と不変条件を更新。 + +## Image size comparison + +| イメージ | Before | After | 削減 | +| --- | --- | --- | --- | +| **`ghcr.io/.../autoware-universe:humble-latest`** | **13.8 GB** | **7.56 GB** | **−6.24 GB (−45%)** | +| `humble-latest-runtime` (= `humble-latest`) | 13.8 GB | 7.56 GB | −45% | +| `humble-latest-devel` | 13.8 GB | ~12.0 GB | −13% | +| `humble-latest-prebuilt` | — | 16.6 GB | (新規タグ) | +| aichallenge-racingkart `aichallenge-2025-dev` (下流) | 旧 13.8GB ベース | 8.91 GB | — | + +## Verified items + +### 
Upstream (awsim-autoware)
+- [x] `./docker/build.sh` が成功 (`humble-latest-runtime` = 7.56 GB)
+- [x] `slim.sh --mode buildable` が自動実行され `.a` と C/C++ toolchain を保持
+- [x] `ARG SETUP_ARGS` 廃止後も CI `docker-build-and-push-main.yaml` が動く (matrix の `setup-args` は以後 no-op)
+- [x] torch 2.3.1+cu121 が `import torch; torch.cuda.is_available()` で `True`(RTX 2080 Ti 実機確認)
+- [x] gcc-11 / g++-11 / cc1 / Scrt1.o / crti.o が揃っており `echo 'int main(){}' | gcc -xc -` がリンクまで通る
+- [x] `#include <rclcpp/rclcpp.hpp>` が `/opt/ros/humble/include/rclcpp` から解決
+- [x] `/autoware/install/autoware_auto_control_msgs/include` 等の Autoware パッケージヘッダーが保持
+- [x] `/usr/local/cuda*` が存在しない(torch の ldd で `libcudart.so.12` が `/usr/local/lib/python3.10/dist-packages/nvidia/cuda_runtime/lib/` から解決されることを確認)
+- [x] `/root/.local/share/pipx` が削除済(422 MB 回収)
+- [x] `/usr/share/doc` / `doc-base` / `info` / 非英語 locale 削除済
+
+### Downstream (aichallenge-racingkart)
+- [x] `./docker_build.sh dev` 成功 → `aichallenge-2025-dev:latest` (8.91 GB) ビルド
+- [x] `make autoware-build` で `colcon build` が 22/22 packages 成功(エラーゼロ、stderr 出力は ament の "header install destination" 警告のみ)
+- [x] `make dev` で AWSIM + Autoware の 2 コンテナ起動、20 秒以上連続稼働
+- [x] `ros2 node list` で Autoware ノード群が登録済(ekf_localizer, gyro_odometer, mpc_controller, racing_kart_gnss_poser, rviz2 等)
+- [x] `ros2 topic list` で AWSIM 連携 topic (`/awsim/control_cmd`, `/awsim/state` 等) と Autoware 制御 topic (`/control/command/control_cmd` 等) が publish されている
+- [x] `make down` でクリーンシャットダウン
+
+### ML training (ml_workspace/tiny_lidar_net) — GPU 実機
+- [x] `python3 train.py` が Hydra config を正しく読み込み
+- [x] `MultiSeqConcatDataset` で複数シーケンスを ConcatDataset 化(2 train seq + 1 val seq, 1000/200 samples)
+- [x] CUDA device (RTX 2080 Ti) 認識、`.to(device)` 成功
+- [x] Train/Val ループ 3 epochs 完走(15 iter/epoch × 3 + 4 val iter)
+- [x] Loss 0.7513 → 0.6118 へ単調減少(学習が実際に進んでいる)
+- [x] `best_model.pth` / `last_model.pth` 保存成功(`/tmp/ckpts/`)
+- [x] `convert_weight.py --model tinylidarnet --ckpt best_model.pth` が 
`weights/converted_weights.npy` を出力(deploy 用の .pth→.npy 変換) +- [x] `hydra-core`, `omegaconf`, `tensorboard`, `h5py`, `hdf5plugin`, `jaxtyping`, `tqdm`, `rosbags` の import がすべて通る + +### 3 variant smoke test (GPU 学習 forward/backward) — 参考 +| Variant | Size | colcon build | rclpy | ML 学習 (GPU) | +| --- | --- | --- | --- | --- | +| A: Dockerfile のみ | 8.99 GB | ✅ | ✅ | ✅ | +| B: slim.sh `--mode buildable` (本 PR 採用) | 6.5-7.6 GB | ✅ | ✅ | ✅ | +| C: slim.sh `--mode ml-only` | 5.9 GB | ❌ | ❌ | ✅ | + +## Test plan + +- [x] `./docker/build.sh --clean-cache` (フレッシュビルド) で 7.56 GB の runtime image が生成される +- [x] `aichallenge-racingkart` で `./docker_build.sh dev && make autoware-build && make dev` がエラーなく完走 +- [x] `docker run --gpus all aichallenge-2025-dev:latest python3 /aichallenge/ml_workspace/tiny_lidar_net/train.py ...` で実学習が回る +- [x] `ros2 node list` / `ros2 topic list` で Autoware + AWSIM の通信を確認 +- [ ] GHCR に push して外部 CI / 参加者が新サイズの `humble-latest` を pull できること(別 PR で実施予定、権限調整待ち) + +## Known caveats + +1. **Autoware C++ の TensorRT/CUDA ノードはサポート外**: `--no-nvidia` 固定のため、tensorrt_yolo / lidar_centerpoint 等の CUDA ベースノードは実行不可。pytorch は pip 同梱 `nvidia-*` で動作する。必要になった場合は `docker/reduce.md` の巻き戻し手順を参照。 +2. **slim.sh は下流が `apt install` を再実行しても動くよう `/var/lib/apt/lists` を再取得可能な状態で保持**: ただし `apt-mark manual` による保護リストに無い `ros-humble-*` を purge する際はカスケードに注意。 +3. 
**`.a` / headers は意図的に保持**: `rviz_ogre_vendor` が `libOgreGLSupport.a` を `IMPORTED` target として export する CMake 設定があるため。削除すると下流 CMake が "file does not exist" で fail する(実機で再現確認済)。 + +## References + +- `docker/reduce.md` — 本作業の経緯と Docker union FS 原則の詳説 +- `docker/slim.sh` — flatten 方式の後処理スクリプト +- `docker/test_ml_workspace.sh` — ml_workspace 用 smoke test diff --git a/docker/reduce.md b/docker/reduce.md index c5f1819460c..91254716314 100644 --- a/docker/reduce.md +++ b/docker/reduce.md @@ -2,18 +2,20 @@ ## サイズ削減結果 -| イメージ | Before | After (Dockerfile) | After (+ slim.sh) | 最終削減 | -| --- | --- | --- | --- | --- | -| **runtime** (= `:humble-latest`) | **13.8 GB** | **8.99 GB** | **6.5 GB** | **−7.3 GB (−53%)** | -| devel | 13.8 GB | 12.1 GB | — | −1.7 GB (−12%) | +| イメージ | Before | After (Dockerfile) | After (+ slim.sh) | 最終削減 | +| -------------------------------- | ----------- | ------------------ | ----------------- | ------------------ | +| **runtime** (= `:humble-latest`) | **13.8 GB** | **8.99 GB** | **6.5 GB** | **−7.3 GB (−53%)** | +| devel | 13.8 GB | 12.1 GB | — | −1.7 GB (−12%) | `build.sh` は Dockerfile ビルド後に自動で `slim.sh --mode buildable` を実行し、最終 `:humble-latest-runtime` / `:humble-latest` を生成する。 ## slim.sh の mode + - **`--mode buildable`** (default, デフォルト採用): colcon build 可能性を維持。gcc-11, g++-11, cmake, /usr/include, /opt/ros/humble/include, libboost*-dev, libgdal-dev, libopenblas-dev を保持。openjdk / JVM / `__pycache__` / 非英語 locale を削除。`/usr/lib/llvm-*` は CPU ホストでの Mesa swrast / rviz2 ソフトウェアレンダリングに必要なため保持 → **6.5-7.6 GB** - **`--mode ml-only`**: ML 学習専用。上記に加えて C/C++ toolchain と全ヘッダーを削除。rclpy もカスケードで消える(ROS 実行不可)。ML 学習コードは `rosbags` pip パッケージ経由で bag 読込するため影響なし → **5.9 GB** ## 動作検証 + 各 variant で `docker/test_ml_workspace.sh` により ML 学習 smoke test (torch GPU, TinyLidarNet モデル構築, 5-step 学習ループ) が PASS。 > runtime には torch (cu121) を含めて GPU 推論を可能にしている。torch と同梱 CUDA ライブラリを外せば 3.81 GB まで落とせる。 @@ -79,15 +81,15 @@ ## ビルド時間 (フレッシュビルド、キャッシュ無し) -| ステップ | 所要時間 | 
-|---------|---------| -| setup-dev-env.sh | ~150 s | -| apt (packages.txt) | ~10 s | -| pip install (devel 内) | ~90 s | -| vcs + rosdep install | ~90 s | -| colcon build | ~10 分 | -| runtime strip + cleanup | ~5 s | -| **合計** | **約 20 分** | +| ステップ | 所要時間 | +| ----------------------- | ------------ | +| setup-dev-env.sh | ~150 s | +| apt (packages.txt) | ~10 s | +| pip install (devel 内) | ~90 s | +| vcs + rosdep install | ~90 s | +| colcon build | ~10 分 | +| runtime strip + cleanup | ~5 s | +| **合計** | **約 20 分** | 再ビルド時は apt/pip キャッシュマウントが効くため、これらのダウンロード分が省略される。 diff --git a/docker/slim.sh b/docker/slim.sh index 6b1cef85d82..a62e4f0ca71 100755 --- a/docker/slim.sh +++ b/docker/slim.sh @@ -20,21 +20,30 @@ set -euo pipefail MODE="buildable" args=() while [ $# -gt 0 ]; do - case "$1" in - --mode) MODE="$2"; shift 2 ;; - *) args+=("$1"); shift ;; - esac + case "$1" in + --mode) + MODE="$2" + shift 2 + ;; + *) + args+=("$1") + shift + ;; + esac done SRC="${args[0]:-ghcr.io/automotiveaichallenge/autoware-universe:humble-latest-runtime}" DST="${args[1]:-${SRC}-${MODE}}" -[[ "$MODE" =~ ^(buildable|ml-only)$ ]] || { echo "invalid --mode: $MODE"; exit 2; } +[[ $MODE =~ ^(buildable|ml-only)$ ]] || { + echo "invalid --mode: $MODE" + exit 2 +} echo "==> Source: $SRC" echo "==> Output: $DST" # Metadata to preserve across flatten. mapfile -t CHANGES < <( - docker inspect --format ' + docker inspect --format ' {{- range .Config.Env }}ENV {{ . 
}} {{ end -}} {{- range $k, $v := .Config.Labels }}LABEL {{ $k }}={{ $v }} @@ -117,14 +126,17 @@ rm -rf /tmp/* /root/.cache /var/tmp/* 2>/dev/null || true echo "=== remaining top-level sizes (mode=$MODE) ===" du -sh /usr/* /opt/* /autoware/* /root/* 2>/dev/null | sort -rh | head -15 -' || { echo "cleanup failed"; exit 1; } +' || { + echo "cleanup failed" + exit 1 +} docker stop "$CID" >/dev/null echo "==> Exporting + importing (flatten)…" change_args=() for c in "${CHANGES[@]}"; do - change_args+=(--change "$c") + change_args+=(--change "$c") done docker export "$CID" | docker import "${change_args[@]}" - "$DST" @@ -132,9 +144,9 @@ docker export "$CID" | docker import "${change_args[@]}" - "$DST" SRC_SIZE=$(docker image inspect "$SRC" --format '{{.Size}}') DST_SIZE=$(docker image inspect "$DST" --format '{{.Size}}') printf '\n==> Size: %s (src) -> %s (dst, -%s)\n' \ - "$(numfmt --to=iec "$SRC_SIZE")" \ - "$(numfmt --to=iec "$DST_SIZE")" \ - "$(numfmt --to=iec "$((SRC_SIZE - DST_SIZE))")" + "$(numfmt --to=iec "$SRC_SIZE")" \ + "$(numfmt --to=iec "$DST_SIZE")" \ + "$(numfmt --to=iec "$((SRC_SIZE - DST_SIZE))")" echo "==> Smoke test: torch + rclpy + colcon/gcc availability" docker run --rm --entrypoint bash "$DST" -c ' diff --git a/docker/test_ml_workspace.sh b/docker/test_ml_workspace.sh index 863cb9a8f98..0191df41a46 100755 --- a/docker/test_ml_workspace.sh +++ b/docker/test_ml_workspace.sh @@ -11,17 +11,23 @@ IMG="${1:-ghcr.io/automotiveaichallenge/autoware-universe:humble-latest}" RACINGKART="${RACINGKART_DIR:-$HOME/aichallenge-racingkart}" ML_WS="$RACINGKART/aichallenge/ml_workspace" -[ -d "$ML_WS/tiny_lidar_net" ] || { echo "ml_workspace not found at $ML_WS"; exit 1; } -[ -f /tmp/ml_smoke.py ] || { echo "/tmp/ml_smoke.py missing"; exit 1; } +[ -d "$ML_WS/tiny_lidar_net" ] || { + echo "ml_workspace not found at $ML_WS" + exit 1 +} +[ -f /tmp/ml_smoke.py ] || { + echo "/tmp/ml_smoke.py missing" + exit 1 +} echo "==> Image: $IMG" docker image inspect "$IMG" 
--format 'size: {{.Size}} bytes' | numfmt --to=iec --field=2 -- || true docker run --rm --gpus all \ - -v "$ML_WS:/aichallenge/ml_workspace:ro" \ - -v /tmp/ml_smoke.py:/tmp/ml_smoke.py:ro \ - --entrypoint bash \ - "$IMG" -c ' + -v "$ML_WS:/aichallenge/ml_workspace:ro" \ + -v /tmp/ml_smoke.py:/tmp/ml_smoke.py:ro \ + --entrypoint bash \ + "$IMG" -c ' set -e echo "=== pip install extras ===" python3 -m pip install --quiet --no-cache-dir \