-
Notifications
You must be signed in to change notification settings - Fork 391
feat(guestos): build patched kernel with folio_split() race fix #9953
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1e15ca9
d064a9f
dd4f802
24701c5
6dc569f
cda093c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -35,6 +35,98 @@ RUN cd /tmp/ && \ | |||||
| echo "c46e5b6f53948477ff3a19d97c58307394a29fe64a01905646f026ddc32cb65b node_exporter-1.10.2.linux-amd64.tar.gz" > node_exporter.sha256 && \ | ||||||
| sha256sum -c node_exporter.sha256 | ||||||
|
|
||||||
| # | ||||||
| # Kernel build stage: | ||||||
| # - Fetch Ubuntu's linux-hwe-6.17 source package | ||||||
| # - Apply local patches from kernel-patches/ (lexicographic order) | ||||||
| # - Build the "generic" flavor binary .deb packages | ||||||
| # | ||||||
| # The resulting .debs are consumed by the final image stage below, replacing | ||||||
| # the stock kernel that would otherwise be pulled in via apt. | ||||||
| # | ||||||
| FROM ubuntu:24.04 as kernel-build | ||||||
|
|
||||||
| USER root:root | ||||||
|
|
||||||
| ENV TZ=UTC | ||||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||||
| ENV SOURCE_DATE_EPOCH=0 | ||||||
| RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone | ||||||
|
|
||||||
| # Source package to patch and rebuild. Must match (or be ABI-compatible with) | ||||||
| # the kernel metapackage installed in the final stage. | ||||||
| ARG _KERNEL_SOURCE_PACKAGE=linux-hwe-6.17 | ||||||
|
|
||||||
| # Enable deb-src for Ubuntu's deb822 sources file and install build deps. | ||||||
| RUN sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources && \ | ||||||
| apt-get -y update && \ | ||||||
| apt-get -y --no-install-recommends install \ | ||||||
| ca-certificates \ | ||||||
| build-essential \ | ||||||
| fakeroot \ | ||||||
| devscripts \ | ||||||
| dpkg-dev \ | ||||||
| kmod \ | ||||||
| cpio \ | ||||||
| rsync \ | ||||||
| bc \ | ||||||
| bison \ | ||||||
| flex \ | ||||||
| libelf-dev \ | ||||||
| libssl-dev \ | ||||||
| libncurses-dev \ | ||||||
| dwarves \ | ||||||
| zstd \ | ||||||
| python3 \ | ||||||
| python3-debian \ | ||||||
| patch && \ | ||||||
| apt-get -y build-dep ${_KERNEL_SOURCE_PACKAGE} | ||||||
|
|
||||||
| WORKDIR /build | ||||||
| RUN apt-get -y source ${_KERNEL_SOURCE_PACKAGE} | ||||||
|
|
||||||
| COPY kernel-patches /tmp/kernel-patches | ||||||
|
|
||||||
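| # Apply all *.patch files in lexicographic order. Tolerate an empty | ||||||
| # directory (one containing no *.patch files) so that removing the patches | ||||||
| # (once upstream ships the fix) only requires deleting files under | ||||||
| # kernel-patches/. The directory itself must remain in the repository, | ||||||
| # because the preceding COPY fails if it is missing. | ||||||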
| # | ||||||
| # Note: /bin/sh in the base image is dash, which does not support "shopt", so | ||||||
| # we iterate explicitly and skip the loop when no *.patch files exist. | ||||||
| RUN set -eux; \ | ||||||
| srcdir="$(find /build -maxdepth 1 -mindepth 1 -type d -name 'linux-*' | head -n1)"; \ | ||||||
| test -n "$srcdir"; \ | ||||||
| cd "$srcdir"; \ | ||||||
| for p in /tmp/kernel-patches/*.patch; do \ | ||||||
| [ -e "$p" ] || continue; \ | ||||||
| echo "Applying $p"; \ | ||||||
| patch -p1 --no-backup-if-mismatch < "$p"; \ | ||||||
| done; \ | ||||||
| EDITOR=true DEBFULLNAME="IC GuestOS build" DEBEMAIL="devnull@dfinity.org" \ | ||||||
| debchange --local=+dfinity "Apply DFINITY custom kernel patches." | ||||||
|
|
||||||
| # Build only the amd64 "generic" flavor, skipping debug and retpoline extras | ||||||
| # to cut build time. Signed image packages are not produced (and not needed | ||||||
| # for GuestOS, which signs kernels as part of the IC image build). | ||||||
| # DEB_BUILD_OPTIONS=parallel=N makes debian/rules fan out to N jobs; N is set to $(nproc) below. | ||||||
| RUN set -eux; \ | ||||||
| srcdir="$(find /build -maxdepth 1 -mindepth 1 -type d -name 'linux-*' | head -n1)"; \ | ||||||
| cd "$srcdir"; \ | ||||||
| export DEB_BUILD_OPTIONS="parallel=$(nproc)"; \ | ||||||
| fakeroot debian/rules clean; \ | ||||||
| fakeroot debian/rules binary-generic skipdbg=true skipretpoline=true | ||||||
|
|
||||||
| # Collect exactly the .debs needed to boot the final image: | ||||||
| # linux-image-unsigned-<abi>-generic (vmlinuz + core modules) | ||||||
| # linux-modules-<abi>-generic | ||||||
| # linux-modules-extra-<abi>-generic | ||||||
| RUN set -eux; \ | ||||||
| mkdir /debs; \ | ||||||
| cp /build/linux-image-unsigned-*-generic_*_amd64.deb /debs/; \ | ||||||
| cp /build/linux-modules-*-generic_*_amd64.deb /debs/; \ | ||||||
| cp /build/linux-modules-extra-*-generic_*_amd64.deb /debs/; \ | ||||||
| ls -la /debs | ||||||
|
|
||||||
| # | ||||||
| # Second build stage: | ||||||
| # - Download and cache minimal Ubuntu Server 24.04 LTS Docker image | ||||||
|
|
@@ -53,18 +145,21 @@ ENV TZ=UTC | |||||
| # For the dev image, use both "packages.common" and "packages.dev" -- this can | ||||||
| # be set via docker build args (see above). | ||||||
| ARG PACKAGE_FILES=packages.common | ||||||
| # The kernel is installed here to keep the extra modules in sync. | ||||||
| # Unfortunately, there is no metapackage to track the extra modules that does | ||||||
| # not also include firmware. | ||||||
| ARG _KERNEL_PACKAGE=linux-image-virtual-hwe-24.04 | ||||||
| # The kernel is installed from locally built .deb packages produced by the | ||||||
| # kernel-build stage above, so that we can carry patches on top of Ubuntu's | ||||||
| # linux-hwe-6.17. The linux-image, linux-modules and linux-modules-extra | ||||||
| # packages are installed together to keep the extra modules in sync with the | ||||||
| # kernel ABI. | ||||||
| RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone | ||||||
| COPY packages.* /tmp/ | ||||||
| COPY --from=kernel-build /debs /tmp/kernel-debs | ||||||
| RUN apt-get -y update && \ | ||||||
| apt-get -y upgrade && \ | ||||||
| apt-get -y --no-install-recommends install $(for P in ${PACKAGE_FILES}; do cat /tmp/$P | sed -e "s/#.*//" ; done) \ | ||||||
| ${_KERNEL_PACKAGE} \ | ||||||
| linux-modules-extra-$(apt-cache depends ${_KERNEL_PACKAGE} | sed -n -e 's/ Depends: linux-image-\(.*\)-generic/\1/p')-generic && \ | ||||||
| rm /tmp/packages.* | ||||||
| /tmp/kernel-debs/linux-image-unsigned-*-generic_*_amd64.deb \ | ||||||
| /tmp/kernel-debs/linux-modules-*-generic_*_amd64.deb \ | ||||||
|
||||||
| /tmp/kernel-debs/linux-modules-*-generic_*_amd64.deb \ | |
| /tmp/kernel-debs/linux-modules-[0-9]*-generic_*_amd64.deb \ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| From 03b75f017ffe6cf556fefbd44f44655bf4a9af48 Mon Sep 17 00:00:00 2001 | ||
| From: Zi Yan <ziy@nvidia.com> | ||
| Date: Fri, 27 Feb 2026 14:11:36 -0500 | ||
| Subject: [PATCH] mm/huge_memory: fix folio_split() race condition with | ||
| folio_try_get() | ||
|
|
||
| During a pagecache folio split, the values in the related xarray should not | ||
| be changed from the original folio at xarray split time until all | ||
| after-split folios are ready and stored in the xarray. Otherwise, a | ||
| parallel folio_try_get() can see stale values in the xarray and a stale | ||
| value can be an unfrozen after-split folio. This leads to a wrong folio | ||
|
basvandijk marked this conversation as resolved.
|
||
| returned to userspace. | ||
|
|
||
| Backport of upstream commit 577a1f495fd78d8fb61b67ac3d3b595b01f6fcb0 | ||
| (merged in mainline v7.0-rc4, 2026-03; also applied to linux-6.18.y as | ||
| 08b2b65c63bb26dbb2a4e2adc2ce96e2929b8b60 on 2026-03-25). Adapted to | ||
| Ubuntu's linux-hwe-6.17 6.17.0-22.22~24.04.1 tree, which predates the | ||
| upstream __split_unmapped_folio() refactor that introduced SPLIT_TYPE_* | ||
| and folio_split_supported(); the fix itself is identical in effect. | ||
|
|
||
| As of 2026-04-20, the fix has NOT yet reached Ubuntu's linux-hwe-6.17 | ||
| package on noble (24.04). Reported against the IC GuestOS by | ||
| Bas van Dijk <bas@dfinity.org>; reproducer at | ||
| https://github.com/dfinity/thp-madv-remove-test. | ||
|
|
||
| Signed-off-by: Zi Yan <ziy@nvidia.com> | ||
| --- | ||
| mm/huge_memory.c | 9 ++++++++- | ||
| 1 file changed, 8 insertions(+), 1 deletion(-) | ||
|
|
||
| diff --git a/mm/huge_memory.c b/mm/huge_memory.c | ||
| --- a/mm/huge_memory.c | ||
| +++ b/mm/huge_memory.c | ||
| @@ -3422,6 +3422,7 @@ | ||
| { | ||
| int order = folio_order(folio); | ||
| int start_order = uniform_split ? new_order : order - 1; | ||
| + struct folio *origin_folio = folio; | ||
| bool stop_split = false; | ||
| struct folio *next; | ||
| int split_order; | ||
| @@ -3459,7 +3460,13 @@ | ||
| xas_split(xas, folio, old_order); | ||
| else { | ||
| xas_set_order(xas, folio->index, split_order); | ||
| - xas_try_split(xas, folio, old_order); | ||
| + /* | ||
| + * use the original folio, so that a | ||
| + * parallel folio_try_get() waits on it | ||
| + * until xarray is updated with after-split | ||
| + * folios and the original one is unfreezed | ||
| + */ | ||
| + xas_try_split(xas, origin_folio, old_order); | ||
| if (xas_error(xas)) { | ||
| ret = xas_error(xas); | ||
| stop_split = true; | ||
| -- | ||
| 2.51.0 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| # GuestOS kernel patches | ||
|
|
||
| Patches in this directory are applied (in lexicographic order) to Ubuntu's | ||
| `linux-hwe-6.17` source package in the `kernel-build` stage of | ||
| `Dockerfile.base`. The resulting `.deb` packages replace the stock kernel in | ||
| the final GuestOS base image. | ||
|
|
||
| ## Conventions | ||
|
|
||
| - Name patches `NNNN-short-description.patch` so they apply in a deterministic | ||
| order. | ||
| - Each patch file must be a single-commit `git format-patch` output and apply | ||
| with `patch -p1` from the root of the kernel source tree. | ||
| - Include in the commit message: the upstream mainline commit SHA, the | ||
| upstream stable branch it has (or has not) landed on, and the reason for | ||
| carrying the patch locally. | ||
| - Remove a patch once it is no longer necessary (i.e. the Ubuntu package in | ||
| use already contains the fix). | ||
|
|
||
| ## Current patches | ||
|
|
||
| - `0001-mm-huge_memory-fix-folio_split-race-condition.patch` — backport of | ||
| upstream `577a1f495fd78d8fb61b67ac3d3b595b01f6fcb0` ("mm/huge_memory: fix a | ||
| folio_split() race condition with folio_try_get()"). Drop once the Ubuntu | ||
| kernel in use (`linux-hwe-6.17` or a newer HWE track) ships this fix. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment says a missing `kernel-patches/` directory is tolerated, but the preceding `COPY kernel-patches /tmp/kernel-patches` will fail if the directory is removed. Either adjust the comment to only claim empty-directory tolerance, or ensure the directory always exists in the repo (even when no patches are carried).