From 87d9cdc10662d2d944349d642700e998ce217b8c Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 14:56:14 +1200
Subject: [PATCH 01/27] update

---
 CHANGELOG.md       |   7 --
 Cargo.toml         |  40 +++----
 README-zh_CN.md    |  51 ---------
 README.md          | 102 +++++++++++------
 benches/decode.rs  | 114 +++++++++++++++++++
 benches/foo.rs     |   1 -
 docs/design.md     | 154 +++++++++++++++++++++++++
 examples/decode.rs |  74 ++++++++++++
 examples/foo.rs    |   1 -
 src/backend.rs     | 137 ++++++++++++++++++++++
 src/decoder.rs     | 278 +++++++++++++++++++++++++++++++++++++++++++++
 src/error.rs       |  37 ++++++
 src/ffi.rs         |  70 ++++++++++++
 src/lib.rs         |  26 ++++-
 tests/decode.rs    |  68 +++++++++++
 tests/foo.rs       |   1 -
 tests/hw_smoke.rs  |  64 +++++++++++
 17 files changed, 1106 insertions(+), 119 deletions(-)
 delete mode 100644 CHANGELOG.md
 delete mode 100644 README-zh_CN.md
 create mode 100644 benches/decode.rs
 delete mode 100644 benches/foo.rs
 create mode 100644 docs/design.md
 create mode 100644 examples/decode.rs
 delete mode 100644 examples/foo.rs
 create mode 100644 src/backend.rs
 create mode 100644 src/decoder.rs
 create mode 100644 src/error.rs
 create mode 100644 src/ffi.rs
 create mode 100644 tests/decode.rs
 delete mode 100644 tests/foo.rs
 create mode 100644 tests/hw_smoke.rs
diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index bd7a668..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# UNRELEASED
-
-# 0.1.2 (January 6th, 2022)
-
-FEATURES
-
-
diff --git a/Cargo.toml b/Cargo.toml
index ff7fe91..8e4ea79 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,35 +1,37 @@
 [package]
-name = "template-rs"
+name = "hwdecode"
 version = "0.0.0"
 edition = "2021"
-repository = "https://github.com/al8n/template-rs"
-homepage = "https://github.com/al8n/template-rs"
-documentation = "https://docs.rs/template-rs"
-description = "A template for creating Rust open-source repo on GitHub"
+rust-version = "1.95"
+description = "Cross-platform hardware-accelerated video decoder built on top of ffmpeg-next, with auto-probe and software fallback."
+repository = "https://github.com/findit-ai/hwdecode"
+homepage = "https://github.com/findit-ai/hwdecode"
+documentation = "https://docs.rs/hwdecode"
 license = "MIT OR Apache-2.0"
-rust-version = "1.73"
-
-[[bench]]
-path = "benches/foo.rs"
-name = "foo"
-harness = false
-
-[features]
-default = ["std"]
-alloc = []
-std = []
 
 [dependencies]
+ffmpeg-next = { version = "8.1", default-features = false, features = ["codec", "format"] }
+thiserror = "2"
+tracing = "0.1"
+libc = "0.2"
 
 [dev-dependencies]
 criterion = "0.8"
-tempfile = "3"
+
+[[example]]
+name = "decode"
+path = "examples/decode.rs"
+
+[[bench]]
+name = "decode"
+path = "benches/decode.rs"
+harness = false
 
 [profile.bench]
 opt-level = 3
 debug = false
 codegen-units = 1
-lto = 'thin'
+lto = "thin"
 incremental = false
 debug-assertions = false
 overflow-checks = false
@@ -41,8 +43,6 @@ rustdoc-args = ["--cfg", "docsrs"]
 
 [lints.rust]
 rust_2018_idioms = "warn"
-single_use_lifetimes = "warn"
 unexpected_cfgs = { level = "warn", check-cfg = [
-  'cfg(all_tests)',
   'cfg(tarpaulin)',
 ] }
diff --git a/README-zh_CN.md b/README-zh_CN.md
deleted file mode 100644
index 7a07f4d..0000000
--- a/README-zh_CN.md
+++ /dev/null
@@ -1,51 +0,0 @@
-<div align="center">
-<h1>template-rs</h1>
-</div>
-<div align="center">
-
-开源Rust代码库GitHub模版
-
-[<img alt="github" src="https://img.shields.io/badge/github-al8n/template--rs-8da0cb?style=for-the-badge&logo=Github" height="22">][Github-url]
-<img alt="LoC" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fal8n%2F327b2a8aef9003246e45c6e47fe63937%2Fraw%2Ftemplate-rs" height="22">
-[<img alt="Build" src="https://img.shields.io/github/actions/workflow/status/al8n/template-rs/ci.yml?logo=Github-Actions&style=for-the-badge" height="22">][CI-url]
-[<img alt="codecov" src="https://img.shields.io/codecov/c/gh/al8n/template-rs?style=for-the-badge&token=6R3QFWRWHL&logo=codecov" height="22">][codecov-url]
-
-[<img alt="docs.rs" src="https://img.shields.io/badge/docs.rs-template--rs-66c2a5?style=for-the-badge&labelColor=555555&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K" height="20">][doc-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/v/template-rs?style=for-the-badge&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iaXNvLTg4NTktMSI/Pg0KPCEtLSBHZW5lcmF0b3I6IEFkb2JlIElsbHVzdHJhdG9yIDE5LjAuMCwgU1ZHIEV4cG9ydCBQbHVnLUluIC4gU1ZHIFZlcnNpb246IDYuMDAgQnVpbGQgMCkgIC0tPg0KPHN2ZyB2ZXJzaW9uPSIxLjEiIGlkPSJMYXllcl8xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB4PSIwcHgiIHk9IjBweCINCgkgdmlld0JveD0iMCAwIDUxMiA1MTIiIHhtbDpzcGFjZT0icHJlc2VydmUiPg0KPGc+DQoJPGc+DQoJCTxwYXRoIGQ9Ik0yNTYsMEwzMS41MjgsMTEyLjIzNnYyODcuNTI4TDI1Niw1MTJsMjI0LjQ3Mi0xMTIuMjM2VjExMi4yMzZMMjU2LDB6IE0yMzQuMjc3LDQ1Mi41NjRMNzQuOTc0LDM3Mi45MTNWMTYwLjgxDQoJCQlsMTU5LjMwMyw3OS42NTFWNDUyLjU2NHogTTEwMS44MjYsMTI1LjY2MkwyNTYsNDguNTc2bDE1NC4xNzQsNzcuMDg3TDI1NiwyMDIuNzQ5TDEwMS44MjYsMTI1LjY2MnogTTQzNy4wMjYsMzcyLjkxMw0KCQkJbC0xNTkuMzAzLDc5LjY1MVYyNDAuNDYxbDE1OS4zMDMtNzkuNjUxVjM3Mi45MTN6IiBmaWxsPSIjRkZGIi8+DQoJPC9nPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPC9zdmc+DQo=" height="22">][crates-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/d/template-rs?color=critical&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBzdGFuZGFsb25lPSJubyI/PjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+PHN2ZyB0PSIxNjQ1MTE3MzMyOTU5IiBjbGFzcz0iaWNvbiIgdmlld0JveD0iMCAwIDEwMjQgMTAyNCIgdmVyc2lvbj0iMS4xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHAtaWQ9IjM0MjEiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkzIiB3aWR0aD0iNDgiIGhlaWdodD0iNDgiIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIj48ZGVmcz48c3R5bGUgdHlwZT0idGV4dC9jc3MiPjwvc3R5bGU+PC9kZWZzPjxwYXRoIGQ9Ik00NjkuMzEyIDU3MC4yNHYtMjU2aDg1LjM3NnYyNTZoMTI4TDUxMiA3NTYuMjg4IDM0MS4zMTIgNTcwLjI0aDEyOHpNMTAyNCA2NDAuMTI4QzEwMjQgNzgyLjkxMiA5MTkuODcyIDg5NiA3ODcuNjQ4IDg5NmgtNTEyQzEyMy45MDQgODk2IDAgNzYxLjYgMCA1OTcuNTA0IDAgNDUxLjk2OCA5NC42NTYgMzMxLjUyIDIyNi40MzIgMzAyLjk3NiAyODQuMTYgMTk1LjQ1NiAzOTEuODA4IDEyOCA1MTIgMTI4YzE1Mi4zMiAwIDI4Mi4xMTIgMTA4LjQxNiAzMjMuMzkyIDI2MS4xMkM5NDEuODg4IDQxMy40NCAxMDI0IDUxOS4wNCAxMDI0IDY0MC4xOTJ6IG0tMjU5LjItMjA1LjMxMmMtMjQuNDQ4LTEyOS4wMjQtMTI4Ljg5Ni0yMjIuNzItMjUyLjgtMjIyLjcyLTk3LjI4IDAtMTgzLjA0IDU3LjM0NC0yMjQuNjQgMTQ3LjQ1NmwtOS4yOCAyMC4yMjQtMjAuOTI4IDIuOTQ0Yy0xMDMuMzYgMTQuNC0xNzguMzY4IDEwNC4zMi0xNzguMzY4IDIxNC43MiAwIDExNy45NTIgODguODMyIDIxNC40IDE5Ni45MjggMjE0LjRoNTEyYzg4LjMyIDAgMTU3LjUwNC03NS4xMzYgMTU3LjUwNC0xNzEuNzEyIDAtODguMDY0LTY1LjkyLTE2NC45MjgtMTQ0Ljk2LTE3MS43NzZsLTI5LjUwNC0yLjU2LTUuODg4LTMwLjk3NnoiIGZpbGw9IiNmZmZmZmYiIHAtaWQ9IjM0MjIiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkwIiBjbGFzcz0iIj48L3BhdGg+PC9zdmc+&style=for-the-badge" height="22">][crates-url]
-<img alt="license" src="https://img.shields.io/badge/License-Apache%202.0/MIT-blue.svg?style=for-the-badge&fontColor=white&logoColor=f5c076&logo=data:image/svg+xml;base64,PCFET0NUWVBFIHN2ZyBQVUJMSUMgIi0vL1czQy8vRFREIFNWRyAxLjEvL0VOIiAiaHR0cDovL3d3dy53My5vcmcvR3JhcGhpY3MvU1ZHLzEuMS9EVEQvc3ZnMTEuZHRkIj4KDTwhLS0gVXBsb2FkZWQgdG86IFNWRyBSZXBvLCB3d3cuc3ZncmVwby5jb20sIFRyYW5zZm9ybWVkIGJ5OiBTVkcgUmVwbyBNaXhlciBUb29scyAtLT4KPHN2ZyBmaWxsPSIjZmZmZmZmIiBoZWlnaHQ9IjgwMHB4IiB3aWR0aD0iODAwcHgiIHZlcnNpb249IjEuMSIgaWQ9IkNhcGFfMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayIgdmlld0JveD0iMCAwIDI3Ni43MTUgMjc2LjcxNSIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSIgc3Ryb2tlPSIjZmZmZmZmIj4KDTxnIGlkPSJTVkdSZXBvX2JnQ2FycmllciIgc3Ryb2tlLXdpZHRoPSIwIi8+Cg08ZyBpZD0iU1ZHUmVwb190cmFjZXJDYXJyaWVyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiLz4KDTxnIGlkPSJTVkdSZXBvX2ljb25DYXJyaWVyIj4gPGc+IDxwYXRoIGQ9Ik0xMzguMzU3LDBDNjIuMDY2LDAsMCw2Mi4wNjYsMCwxMzguMzU3czYyLjA2NiwxMzguMzU3LDEzOC4zNTcsMTM4LjM1N3MxMzguMzU3LTYyLjA2NiwxMzguMzU3LTEzOC4zNTcgUzIxNC42NDgsMCwxMzguMzU3LDB6IE0xMzguMzU3LDI1OC43MTVDNzEuOTkyLDI1OC43MTUsMTgsMjA0LjcyMywxOCwxMzguMzU3UzcxLjk5MiwxOCwxMzguMzU3LDE4IHMxMjAuMzU3LDUzLjk5MiwxMjAuMzU3LDEyMC4zNTdTMjA0LjcyMywyNTguNzE1LDEzOC4zNTcsMjU4LjcxNXoiLz4gPHBhdGggZD0iTTE5NC43OTgsMTYwLjkwM2MtNC4xODgtMi42NzctOS43NTMtMS40NTQtMTIuNDMyLDIuNzMyYy04LjY5NCwxMy41OTMtMjMuNTAzLDIxLjcwOC0zOS42MTQsMjEuNzA4IGMtMjUuOTA4LDAtNDYuOTg1LTIxLjA3OC00Ni45ODUtNDYuOTg2czIxLjA3Ny00Ni45ODYsNDYuOTg1LTQ2Ljk4NmMxNS42MzMsMCwzMC4yLDcuNzQ3LDM4Ljk2OCwyMC43MjMgYzIuNzgyLDQuMTE3LDguMzc1LDUuMjAxLDEyLjQ5NiwyLjQxOGM0LjExOC0yLjc4Miw1LjIwMS04LjM3NywyLjQxOC0xMi40OTZjLTEyLjExOC0xNy45MzctMzIuMjYyLTI4LjY0NS01My44ODItMjguNjQ1IGMtMzUuODMzLDAtNjQuOTg1LDI5LjE1Mi02NC45ODUsNjQuOTg2czI5LjE1Miw2NC45ODYsNjQuOTg1LDY0Ljk4NmMyMi4yODEsMCw0Mi43NTktMTEuMjE4LDU0Ljc3OC0zMC4wMDkgQzIwMC4yMDgsMTY5LjE0NywxOTguOTg1LDE2My41ODIsMTk0Ljc5OCwxNjAuOTAzeiIvPiA8L2c+IDwvZz4KDTwvc3ZnPg==" height="22">
-
-[English][en-url] | 简体中文
-
-</div>
-
-## Installation
-
-```toml
-[dependencies]
-template_rs = "0.1"
-```
-
-## Features
-
-- [x] 更快的创建GitHub开源Rust代码库
-
-#### License
-
-`Template-rs` is under the terms of both the MIT license and the
-Apache License (Version 2.0).
-
-See [LICENSE-APACHE](LICENSE-APACHE), [LICENSE-MIT](LICENSE-MIT) for details.
-
-Copyright (c) 2021 Al Liu.
-
-[Github-url]: https://github.com/al8n/template-rs/
-[CI-url]: https://github.com/al8n/template/actions/workflows/template.yml
-[doc-url]: https://docs.rs/template-rs
-[crates-url]: https://crates.io/crates/template-rs
-[codecov-url]: https://app.codecov.io/gh/al8n/template-rs/
-[license-url]: https://opensource.org/licenses/Apache-2.0
-[rustc-url]: https://github.com/rust-lang/rust/blob/master/RELEASES.md
-[license-apache-url]: https://opensource.org/licenses/Apache-2.0
-[license-mit-url]: https://opensource.org/licenses/MIT
-[en-url]: https://github.com/al8n/template-rs/tree/main/README.md
diff --git a/README.md b/README.md
index 1af27e2..bcfb058 100644
--- a/README.md
+++ b/README.md
@@ -1,46 +1,84 @@
-<div align="center">
-<h1>template-rs</h1>
-</div>
-<div align="center">
+# hwdecode
 
-A template for creating Rust open-source GitHub repo.
+Cross-platform hardware-accelerated video decoder for Rust, built on top of
+[`ffmpeg-next`](https://crates.io/crates/ffmpeg-next).
 
-[<img alt="github" src="https://img.shields.io/badge/github-al8n/template--rs-8da0cb?style=for-the-badge&logo=Github" height="22">][Github-url]
-<img alt="LoC" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fal8n%2F327b2a8aef9003246e45c6e47fe63937%2Fraw%2Ftemplate-rs" height="22">
-[<img alt="Build" src="https://img.shields.io/github/actions/workflow/status/al8n/template-rs/ci.yml?logo=Github-Actions&style=for-the-badge" height="22">][CI-url]
-[<img alt="codecov" src="https://img.shields.io/codecov/c/gh/al8n/template-rs?style=for-the-badge&token=6R3QFWRWHL&logo=codecov" height="22">][codecov-url]
+`VideoDecoder` mirrors the `send_packet` / `receive_frame` interface of
+`ffmpeg::decoder::Video` and silently picks the best hardware backend for the
+host platform, falling back to software if none open. Output frames are
+CPU-side — for HW backends they are downloaded with `av_hwframe_transfer_data`
+(NV12 for 8-bit, P010 for 10-bit). Pixel-format conversion is intentionally
+out of scope.
 
-[<img alt="docs.rs" src="https://img.shields.io/badge/docs.rs-template--rs-66c2a5?style=for-the-badge&labelColor=555555&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K" height="20">][doc-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/v/template-rs?style=for-the-badge&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iaXNvLTg4NTktMSI/Pg0KPCEtLSBHZW5lcmF0b3I6IEFkb2JlIElsbHVzdHJhdG9yIDE5LjAuMCwgU1ZHIEV4cG9ydCBQbHVnLUluIC4gU1ZHIFZlcnNpb246IDYuMDAgQnVpbGQgMCkgIC0tPg0KPHN2ZyB2ZXJzaW9uPSIxLjEiIGlkPSJMYXllcl8xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB4PSIwcHgiIHk9IjBweCINCgkgdmlld0JveD0iMCAwIDUxMiA1MTIiIHhtbDpzcGFjZT0icHJlc2VydmUiPg0KPGc+DQoJPGc+DQoJCTxwYXRoIGQ9Ik0yNTYsMEwzMS41MjgsMTEyLjIzNnYyODcuNTI4TDI1Niw1MTJsMjI0LjQ3Mi0xMTIuMjM2VjExMi4yMzZMMjU2LDB6IE0yMzQuMjc3LDQ1Mi41NjRMNzQuOTc0LDM3Mi45MTNWMTYwLjgxDQoJCQlsMTU5LjMwMyw3OS42NTFWNDUyLjU2NHogTTEwMS44MjYsMTI1LjY2MkwyNTYsNDguNTc2bDE1NC4xNzQsNzcuMDg3TDI1NiwyMDIuNzQ5TDEwMS44MjYsMTI1LjY2MnogTTQzNy4wMjYsMzcyLjkxMw0KCQkJbC0xNTkuMzAzLDc5LjY1MVYyNDAuNDYxbDE1OS4zMDMtNzkuNjUxVjM3Mi45MTN6IiBmaWxsPSIjRkZGIi8+DQoJPC9nPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPC9zdmc+DQo=" height="22">][crates-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/d/template-rs?color=critical&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBzdGFuZGFsb25lPSJubyI/PjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+PHN2ZyB0PSIxNjQ1MTE3MzMyOTU5IiBjbGFzcz0iaWNvbiIgdmlld0JveD0iMCAwIDEwMjQgMTAyNCIgdmVyc2lvbj0iMS4xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHAtaWQ9IjM0MjEiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkzIiB3aWR0aD0iNDgiIGhlaWdodD0iNDgiIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIj48ZGVmcz48c3R5bGUgdHlwZT0idGV4dC9jc3MiPjwvc3R5bGU+PC9kZWZzPjxwYXRoIGQ9Ik00NjkuMzEyIDU3MC4yNHYtMjU2aDg1LjM3NnYyNTZoMTI4TDUxMiA3NTYuMjg4IDM0MS4zMTIgNTcwLjI0aDEyOHpNMTAyNCA2NDAuMTI4QzEwMjQgNzgyLjkxMiA5MTkuODcyIDg5NiA3ODcuNjQ4IDg5NmgtNTEyQzEyMy45MDQgODk2IDAgNzYxLjYgMCA1OTcuNTA0IDAgNDUxLjk2OCA5NC42NTYgMzMxLjUyIDIyNi40MzIgMzAyLjk3NiAyODQuMTYgMTk1LjQ1NiAzOTEuODA4IDEyOCA1MTIgMTI4YzE1Mi4zMiAwIDI4Mi4xMTIgMTA4LjQxNiAzMjMuMzkyIDI2MS4xMkM5NDEuODg4IDQxMy40NCAxMDI0IDUxOS4wNCAxMDI0IDY0MC4xOTJ6IG0tMjU5LjItMjA1LjMxMmMtMjQuNDQ4LTEyOS4wMjQtMTI4Ljg5Ni0yMjIuNzItMjUyLjgtMjIyLjcyLTk3LjI4IDAtMTgzLjA0IDU3LjM0NC0yMjQuNjQgMTQ3LjQ1NmwtOS4yOCAyMC4yMjQtMjAuOTI4IDIuOTQ0Yy0xMDMuMzYgMTQuNC0xNzguMzY4IDEwNC4zMi0xNzguMzY4IDIxNC43MiAwIDExNy45NTIgODguODMyIDIxNC40IDE5Ni45MjggMjE0LjRoNTEyYzg4LjMyIDAgMTU3LjUwNC03NS4xMzYgMTU3LjUwNC0xNzEuNzEyIDAtODguMDY0LTY1LjkyLTE2NC45MjgtMTQ0Ljk2LTE3MS43NzZsLTI5LjUwNC0yLjU2LTUuODg4LTMwLjk3NnoiIGZpbGw9IiNmZmZmZmYiIHAtaWQ9IjM0MjIiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkwIiBjbGFzcz0iIj48L3BhdGg+PC9zdmc+&style=for-the-badge" height="22">][crates-url]
-<img alt="license" src="https://img.shields.io/badge/License-Apache%202.0/MIT-blue.svg?style=for-the-badge&fontColor=white&logoColor=f5c076&logo=data:image/svg+xml;base64,PCFET0NUWVBFIHN2ZyBQVUJMSUMgIi0vL1czQy8vRFREIFNWRyAxLjEvL0VOIiAiaHR0cDovL3d3dy53My5vcmcvR3JhcGhpY3MvU1ZHLzEuMS9EVEQvc3ZnMTEuZHRkIj4KDTwhLS0gVXBsb2FkZWQgdG86IFNWRyBSZXBvLCB3d3cuc3ZncmVwby5jb20sIFRyYW5zZm9ybWVkIGJ5OiBTVkcgUmVwbyBNaXhlciBUb29scyAtLT4KPHN2ZyBmaWxsPSIjZmZmZmZmIiBoZWlnaHQ9IjgwMHB4IiB3aWR0aD0iODAwcHgiIHZlcnNpb249IjEuMSIgaWQ9IkNhcGFfMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayIgdmlld0JveD0iMCAwIDI3Ni43MTUgMjc2LjcxNSIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSIgc3Ryb2tlPSIjZmZmZmZmIj4KDTxnIGlkPSJTVkdSZXBvX2JnQ2FycmllciIgc3Ryb2tlLXdpZHRoPSIwIi8+Cg08ZyBpZD0iU1ZHUmVwb190cmFjZXJDYXJyaWVyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiLz4KDTxnIGlkPSJTVkdSZXBvX2ljb25DYXJyaWVyIj4gPGc+IDxwYXRoIGQ9Ik0xMzguMzU3LDBDNjIuMDY2LDAsMCw2Mi4wNjYsMCwxMzguMzU3czYyLjA2NiwxMzguMzU3LDEzOC4zNTcsMTM4LjM1N3MxMzguMzU3LTYyLjA2NiwxMzguMzU3LTEzOC4zNTcgUzIxNC42NDgsMCwxMzguMzU3LDB6IE0xMzguMzU3LDI1OC43MTVDNzEuOTkyLDI1OC43MTUsMTgsMjA0LjcyMywxOCwxMzguMzU3UzcxLjk5MiwxOCwxMzguMzU3LDE4IHMxMjAuMzU3LDUzLjk5MiwxMjAuMzU3LDEyMC4zNTdTMjA0LjcyMywyNTguNzE1LDEzOC4zNTcsMjU4LjcxNXoiLz4gPHBhdGggZD0iTTE5NC43OTgsMTYwLjkwM2MtNC4xODgtMi42NzctOS43NTMtMS40NTQtMTIuNDMyLDIuNzMyYy04LjY5NCwxMy41OTMtMjMuNTAzLDIxLjcwOC0zOS42MTQsMjEuNzA4IGMtMjUuOTA4LDAtNDYuOTg1LTIxLjA3OC00Ni45ODUtNDYuOTg2czIxLjA3Ny00Ni45ODYsNDYuOTg1LTQ2Ljk4NmMxNS42MzMsMCwzMC4yLDcuNzQ3LDM4Ljk2OCwyMC43MjMgYzIuNzgyLDQuMTE3LDguMzc1LDUuMjAxLDEyLjQ5NiwyLjQxOGM0LjExOC0yLjc4Miw1LjIwMS04LjM3NywyLjQxOC0xMi40OTZjLTEyLjExOC0xNy45MzctMzIuMjYyLTI4LjY0NS01My44ODItMjguNjQ1IGMtMzUuODMzLDAtNjQuOTg1LDI5LjE1Mi02NC45ODUsNjQuOTg2czI5LjE1Miw2NC45ODYsNjQuOTg1LDY0Ljk4NmMyMi4yODEsMCw0Mi43NTktMTEuMjE4LDU0Ljc3OC0zMC4wMDkgQzIwMC4yMDgsMTY5LjE0NywxOTguOTg1LDE2My41ODIsMTk0Ljc5OCwxNjAuOTAzeiIvPiA8L2c+IDwvZz4KDTwvc3ZnPg==" height="22">
+## Backends
 
-English | [简体中文][zh-cn-url]
+| Target              | Probe order                       |
+| ------------------- | --------------------------------- |
+| macOS / iOS / tvOS  | VideoToolbox → Software           |
+| Linux               | VAAPI → CUDA → Software           |
+| Windows             | D3D11VA → CUDA → Software         |
+| other               | Software                          |
 
-</div>
+## Usage
 
-## Installation
+```rust
+use ffmpeg_next as ffmpeg;
+use ffmpeg::{format, frame, media};
+use hwdecode::VideoDecoder;
 
-```toml
-[dependencies]
-template_rs = "0.1"
+ffmpeg::init()?;
+
+let mut input = format::input(path)?;
+let stream = input.streams().best(media::Type::Video).unwrap();
+let stream_index = stream.index();
+
+let mut decoder = VideoDecoder::open(stream.parameters())?;
+println!("backend = {:?}", decoder.backend());
+
+let mut frame = frame::Video::empty();
+for (s, packet) in input.packets() {
+    if s.index() != stream_index { continue; }
+    decoder.send_packet(&packet)?;
+    while decoder.receive_frame(&mut frame).is_ok() {
+        // frame.format() is NV12 / P010 (HW path) or codec-native (SW path)
+        // ... do something with frame ...
+    }
+}
+decoder.send_eof()?;
+while decoder.receive_frame(&mut frame).is_ok() {
+    // ... drain ...
+}
 ```
 
-## Features
-- [x] Create a Rust open-source repo fast 
+To force a specific backend (no probe, no fallback):
+
+```rust
+use hwdecode::{Backend, VideoDecoder};
+let decoder = VideoDecoder::open_with(parameters, Backend::Software)?;
+```
+
+## Running tests and benches
+
+The integration test and benchmark expect a real video file. Set
+`HWDECODE_SAMPLE_VIDEO` to enable them:
+
+```sh
+HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test
+HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test --test hw_smoke -- --ignored
+HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo bench
+```
 
-#### License
+Without the env var the integration test skips with a notice; unit tests run
+unconditionally.
 
-`template-rs` is under the terms of both the MIT license and the
-Apache License (Version 2.0).
+## Build requirements
 
-See [LICENSE-APACHE](LICENSE-APACHE), [LICENSE-MIT](LICENSE-MIT) for details.
+- A system FFmpeg ≥ 4.x linkable via `pkg-config`. Verify with
+  `ffmpeg -hwaccels` that your build has the backends you expect compiled in
+  (e.g. `videotoolbox` on macOS, `vaapi` / `cuda` on Linux,
+  `d3d11va` / `cuda` on Windows).
+- Rust ≥ 1.95.
 
-Copyright (c) 2021 Al Liu.
+## License
 
-[Github-url]: https://github.com/al8n/template-rs/
-[CI-url]: https://github.com/al8n/template-rs/actions/workflows/ci.yml
-[doc-url]: https://docs.rs/template-rs
-[crates-url]: https://crates.io/crates/template-rs
-[codecov-url]: https://app.codecov.io/gh/al8n/template-rs/
-[zh-cn-url]: https://github.com/al8n/template-rs/tree/main/README-zh_CN.md
+MIT or Apache-2.0, at your option.
diff --git a/benches/decode.rs b/benches/decode.rs
new file mode 100644
index 0000000..be7281d
--- /dev/null
+++ b/benches/decode.rs
@@ -0,0 +1,114 @@
+//! Benchmark comparing software-only decode against the auto-probed
+//! hardware backend on the same input file.
+//!
+//! Set `HWDECODE_SAMPLE_VIDEO` to a video file path. The hardware bench is
+//! skipped (with a notice) when the auto-probe falls back to software.
+//!
+//! ```sh
+//! HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo bench
+//! ```
+
+use std::{path::PathBuf, time::Duration};
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use ffmpeg::{format, frame, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::{Backend, VideoDecoder};
+
+const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
+
+fn sample_path() -> Option<PathBuf> {
+  std::env::var_os(SAMPLE_ENV).map(PathBuf::from) // `None` when the variable is unset; the path itself is not validated here
+}
+
+/// Decode every video frame in `path`, returning the count. `Backend::Software` forces the software decoder; any other `backend` re-runs the auto-probe.
+/// Re-opens the input each call so each iteration measures a full decode pass.
+fn decode_all(path: &PathBuf, backend: Backend) -> Result<usize, hwdecode::Error> {
+  let mut input = format::input(path).map_err(hwdecode::Error::Ffmpeg)?;
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .ok_or(hwdecode::Error::Ffmpeg(ffmpeg::Error::StreamNotFound))?;
+  let stream_index = stream.index();
+
+  let mut decoder = match backend {
+    Backend::Software => VideoDecoder::open_with(stream.parameters(), Backend::Software)?,
+    _ => VideoDecoder::open(stream.parameters())?, // the concrete HW value is ignored; the probe picks the backend
+  };
+
+  let mut frame = frame::Video::empty(); // single frame buffer reused for every receive_frame call
+  let mut count = 0_usize;
+
+  let mut drain = |decoder: &mut VideoDecoder, count: &mut usize| -> Result<(), hwdecode::Error> {
+    loop {
+      match decoder.receive_frame(&mut frame) {
+        Ok(()) => *count += 1,
+        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+          if errno == ffmpeg::error::EAGAIN =>
+        {
+          return Ok(()); // EAGAIN is not a failure: stop draining and let the caller send more input
+        }
+        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Eof)) => return Ok(()), // end of stream also ends the drain cleanly
+        Err(e) => return Err(e), // anything else is a real decode error
+      }
+    }
+  };
+
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue; // packet belongs to another stream (audio, subtitles, ...)
+    }
+    decoder.send_packet(&packet)?;
+    drain(&mut decoder, &mut count)?;
+  }
+  decoder.send_eof()?; // signal end of input, then drain once more for the remaining frames
+  drain(&mut decoder, &mut count)?;
+  Ok(count)
+}
+
+fn bench_decode(c: &mut Criterion) {
+  ffmpeg::init().expect("ffmpeg init");
+
+  let Some(path) = sample_path() else {
+    eprintln!("skipping benches: set {SAMPLE_ENV} to a video file path");
+    return; // no sample configured: register no benchmarks at all
+  };
+
+  // Open (and immediately drop) one auto-probed decoder so we can report the selected backend and decide whether a hardware bench makes sense.
+  let probed_backend = {
+    let input = format::input(&path).expect("open input");
+    let stream = input
+      .streams()
+      .best(media::Type::Video)
+      .expect("video stream");
+    let dec = VideoDecoder::open(stream.parameters()).expect("auto-probe");
+    let b = dec.backend();
+    drop(dec); // drop the probe decoder explicitly before any measurement starts
+    b
+  };
+  eprintln!("auto-probe selected backend: {probed_backend:?}");
+
+  let mut group = c.benchmark_group("decode");
+  group.measurement_time(Duration::from_secs(15));
+  group.sample_size(20);
+
+  group.bench_function("software", |b| {
+    b.iter(|| decode_all(&path, Backend::Software).expect("software decode"))
+  });
+
+  if probed_backend != Backend::Software {
+    group.bench_function("hardware", |b| {
+      b.iter(|| {
+        let n = decode_all(&path, probed_backend).expect("hardware decode"); // non-Software value makes decode_all auto-probe again
+        std::hint::black_box(n);
+      })
+    });
+  } else {
+    eprintln!("skipping hardware bench: auto-probe fell back to Software");
+  }
+
+  group.finish();
+}
+
+criterion_group!(benches, bench_decode);
+criterion_main!(benches);
diff --git a/benches/foo.rs b/benches/foo.rs
deleted file mode 100644
index f328e4d..0000000
--- a/benches/foo.rs
+++ /dev/null
@@ -1 +0,0 @@
-fn main() {}
diff --git a/docs/design.md b/docs/design.md
new file mode 100644
index 0000000..056bb4f
--- /dev/null
+++ b/docs/design.md
@@ -0,0 +1,154 @@
+# hwdecode — design
+
+Cross-platform hardware-accelerated video decoder built on top of `ffmpeg-next` 8.1.
+
+## Goals
+
+- Drop-in replacement for `ffmpeg::decoder::Video` at the call site (`send_packet` / `receive_frame` / `send_eof` / `flush`).
+- Auto-probe the platform's hardware backends and silently fall back to software if none open. Caller never has to think about hwaccel availability.
+- Hand back native-format CPU frames (NV12/P010 from the HW path, codec-native from the SW path). Pixel-format conversion is the caller's responsibility (e.g. via `colconv`).
+- Cross-platform: macOS / iOS / iPadOS / tvOS, Linux (Intel/AMD/NVIDIA), Windows (any GPU + CUDA on NVIDIA).
+
+## Non-goals
+
+- Audio hardware decoding. Out of scope; software AAC/Opus/etc. is fast enough that the complexity isn't justified.
+- Demuxing. Callers open files/streams themselves (e.g. via `findit-demuxer`) and feed packets in.
+- Pixel-format conversion. Done downstream (`colconv`).
+- Encoding.
+
+## Public API
+
+```rust
+pub struct VideoDecoder { /* private */ }
+
+impl VideoDecoder {
+    /// Auto-probe HW backends in platform order; fall back to software.
+    /// On success, `backend()` reports the one that won.
+    pub fn open(parameters: ffmpeg::codec::Parameters) -> Result<Self, Error>;
+
+    /// Force a specific backend. No probe, no fallback.
+    pub fn open_with(parameters: ffmpeg::codec::Parameters, backend: Backend) -> Result<Self, Error>;
+
+    pub fn backend(&self) -> Backend;
+    pub fn width(&self) -> u32;
+    pub fn height(&self) -> u32;
+    pub fn format(&self) -> ffmpeg::format::Pixel;
+    pub fn time_base(&self) -> ffmpeg::Rational;
+    pub fn frame_rate(&self) -> ffmpeg::Rational;
+
+    pub fn send_packet(&mut self, packet: &ffmpeg::Packet) -> Result<(), Error>;
+    pub fn send_eof(&mut self) -> Result<(), Error>;
+
+    /// Receive a CPU-side frame. For HW backends, internally calls
+    /// `av_hwframe_transfer_data` and copies PTS/timing onto the result;
+    /// output format is NV12 (8-bit) or P010 (10-bit). For SW, the frame
+    /// is in the codec's native format.
+    pub fn receive_frame(&mut self, frame: &mut ffmpeg::frame::Video) -> Result<(), Error>;
+
+    pub fn flush(&mut self);
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum Backend {
+    Software,
+    VideoToolbox, // macOS, iOS, iPadOS, tvOS
+    Vaapi,        // Linux (Intel/AMD)
+    Cuda,         // Linux/Windows (NVIDIA)
+    D3d11va,      // Windows
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("ffmpeg error: {0}")]
+    Ffmpeg(#[from] ffmpeg::Error),
+    #[error("no decoder for codec id {0:?}")]
+    NoCodec(ffmpeg::codec::Id),
+    #[error("hardware device init failed for {backend:?}: {source}")]
+    HwDeviceInitFailed { backend: Backend, source: ffmpeg::Error },
+    #[error("all backends failed; attempts: {attempts:?}")]
+    AllBackendsFailed { attempts: Vec<(Backend, ffmpeg::Error)> },
+}
+```
+
+## Behavior
+
+### Probe order
+
+| Target              | Order tried                                  |
+| ------------------- | -------------------------------------------- |
+| macOS, iOS, tvOS    | `[VideoToolbox, Software]`                   |
+| Linux               | `[Vaapi, Cuda, Software]`                    |
+| Windows             | `[D3d11va, Cuda, Software]`                  |
+| Other               | `[Software]`                                 |
+
+A HW backend is a candidate only if **(a)** its `AVHWDeviceType` device can be created via `av_hwdevice_ctx_create`, and **(b)** the codec advertises support via `avcodec_get_hw_config` matching that device type. The first candidate that fully opens wins. Each failure logs `tracing::warn!` with the backend and the underlying error and the loop tries the next.
+
+### Device selection
+
+Always device 0 / system default (`av_hwdevice_ctx_create(.., NULL, ..)`). No env var, no config knob in v1. Add later if the multi-GPU use case appears.
+
+### `get_format` callback
+
+A static `extern "C"` callback. The decoder context's `opaque` field points to a small heap-allocated `CallbackState { wanted: AVPixelFormat }`. The callback walks the offered `pix_fmts` list, returns `wanted` if present, else `AV_PIX_FMT_NONE`. Note that `AV_PIX_FMT_NONE` does not make FFmpeg retry with software: libavcodec treats it as "no format chosen" and the decode call fails with an error. This is the standard pattern from `doc/examples/hw_decode.c`.
+
+### Frame transfer
+
+`receive_frame` always:
+
+1. Reads from the codec into an internal `hw_frame: ffmpeg::frame::Video` (allocated once, reused).
+2. If the frame's format is the HW pix fmt, calls `av_hwframe_transfer_data(out, hw_frame, 0)` into the caller's `&mut frame`. Copies `pts`, `pkt_dts`, `time_base`, `duration` (FFmpeg does not transfer timing).
+3. Otherwise (SW path or decoder fell back mid-stream), clones the frame into the caller's slot.
+
+### Threading
+
+`VideoDecoder: Send + !Sync`. Each instance owns its own `AVCodecContext` and `AVBufferRef*`. Multiple decoders can run on different threads; a single decoder is not concurrent.
+
+### Drop
+
+`Drop` calls `av_buffer_unref(&mut self.hw_device_ref)` if non-null, frees the boxed `CallbackState`, then lets `ffmpeg::decoder::Video`'s own Drop free the codec context.
+
+## Internals
+
+```text
+src/
+├── lib.rs       // re-exports + crate-level docs
+├── error.rs     // Error enum
+├── backend.rs   // Backend enum, probe order, AVHWDeviceType <-> Backend mapping
+├── decoder.rs   // VideoDecoder, open/open_with, send/receive
+└── ffi.rs       // get_format callback, av_hwdevice_ctx_create / transfer wrappers,
+                 // avcodec_get_hw_config probe
+```
+
+No other modules. Keep the surface small.
+
+## Build & dependencies
+
+- `ffmpeg-next = { version = "8.1", default-features = false, features = ["codec", "format"] }`
+- `thiserror = "2"`
+- `tracing = "0.1"`
+- `libc = "0.2"`
+
+No platform-specific Cargo features. `cfg!(target_os = ...)` selects which `AVHWDeviceType` constants we even attempt — the FFI symbols are linked unconditionally via `ffmpeg-sys-next`.
+
+System FFmpeg ≥ 4.x. Verified against the user's macOS Homebrew build (FFmpeg 8.1, VideoToolbox enabled).
+
+## Testing
+
+1. **Unit tests** (`src/backend.rs`, `src/error.rs`) — pure-Rust: probe-order construction per platform, `Backend` ↔ `AVHWDeviceType` mapping, error formatting.
+2. **Integration** (`tests/decode.rs`) — opens a sample H.264 file via `ffmpeg::format::input`, decodes 30 frames through `VideoDecoder::open` (auto-probe), asserts frame count and dimensions. Sample path comes from env var `HWDECODE_SAMPLE_VIDEO`; test is skipped with a clear `eprintln!` if unset.
+3. **HW smoke** (`tests/hw_smoke.rs`, `#[ignore]`) — same decode, but additionally asserts `decoder.backend() != Backend::Software`. CI runs this on platform-matched runners.
+
+Sample-file env var keeps the repo binary-free. Documented in `README.md`.
+
+## Benchmark
+
+`benches/decode.rs` (criterion) — a single `bench_decode` entry point that registers two benchmarks in the `decode` group:
+
+- `software` — `VideoDecoder::open_with(.., Backend::Software)`, decode all frames of the sample, measure wall-clock per full decode pass (the input is re-opened on every iteration).
+- `hardware` — `VideoDecoder::open(..)` (auto-probe). Not registered (a notice is printed instead) if the auto-probe selects `Backend::Software` (no HW available).
+
+Both use the same `HWDECODE_SAMPLE_VIDEO` file. Bench prints which backend the HW run actually used, so results are interpretable across machines.
+
+## Examples
+
+`examples/decode.rs` — opens a path from `argv[1]` with `ffmpeg::format::input`, finds the best video stream, feeds packets through `VideoDecoder`, prints the selected backend and initial dimensions/format once up front, then `(index, pts, width, height, format)` for each frame and a final frame count.
diff --git a/examples/decode.rs b/examples/decode.rs
new file mode 100644
index 0000000..fa28582
--- /dev/null
+++ b/examples/decode.rs
@@ -0,0 +1,74 @@
+//! Decode every video frame in `argv[1]`, printing one line per frame.
+//!
+//! ```sh
+//! cargo run --release --example decode -- /path/to/video.mp4
+//! ```
+
+use ffmpeg::{format, frame, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::VideoDecoder;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+  let path = std::env::args()
+    .nth(1)
+    .ok_or("usage: decode <video-file>")?;
+
+  ffmpeg::init()?;
+
+  let mut input = format::input(&path)?;
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .ok_or("no video stream")?;
+  let stream_index = stream.index();
+
+  let mut decoder = VideoDecoder::open(stream.parameters())?;
+  println!(
+    "backend={:?} {}x{} codec_pix_fmt_initial={:?}",
+    decoder.backend(),
+    decoder.width(),
+    decoder.height(),
+    decoder.format(),
+  );
+
+  let mut frame = frame::Video::empty();
+  let mut count: u64 = 0;
+
+  let drain = |decoder: &mut VideoDecoder, frame: &mut frame::Video, count: &mut u64| loop {
+    match decoder.receive_frame(frame) {
+      Ok(()) => {
+        *count += 1;
+        println!(
+          "frame#{count} pts={:?} {}x{} fmt={:?}",
+          frame.pts(),
+          frame.width(),
+          frame.height(),
+          frame.format(),
+        );
+      }
+      Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+        if errno == ffmpeg::error::EAGAIN =>
+      {
+        break
+      }
+      Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Eof)) => break,
+      Err(e) => {
+        eprintln!("decode error: {e}");
+        break;
+      }
+    }
+  };
+
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue;
+    }
+    decoder.send_packet(&packet)?;
+    drain(&mut decoder, &mut frame, &mut count);
+  }
+  decoder.send_eof()?;
+  drain(&mut decoder, &mut frame, &mut count);
+
+  println!("decoded {count} frames");
+  Ok(())
+}
diff --git a/examples/foo.rs b/examples/foo.rs
deleted file mode 100644
index f328e4d..0000000
--- a/examples/foo.rs
+++ /dev/null
@@ -1 +0,0 @@
-fn main() {}
diff --git a/src/backend.rs b/src/backend.rs
new file mode 100644
index 0000000..cfcd48b
--- /dev/null
+++ b/src/backend.rs
@@ -0,0 +1,137 @@
+use ffmpeg_next::{ffi::AVHWDeviceType, format::Pixel};
+
+/// Decoding backend selected (or forced) for a [`crate::VideoDecoder`].
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum Backend {
+  /// Pure software decode via libavcodec.
+  Software,
+  /// Apple VideoToolbox (macOS, iOS, iPadOS, tvOS, visionOS).
+  VideoToolbox,
+  /// Linux Video Acceleration API (Intel / AMD GPUs).
+  Vaapi,
+  /// NVIDIA NVDEC via CUDA (Linux / Windows on NVIDIA hardware).
+  Cuda,
+  /// Microsoft Direct3D 11 Video Acceleration (Windows).
+  D3d11va,
+}
+
+impl Backend {
+  /// `AVHWDeviceType` corresponding to this backend, or `None` for
+  /// [`Backend::Software`].
+  pub(crate) fn av_hwdevice_type(self) -> Option<AVHWDeviceType> {
+    match self {
+      Self::Software => None,
+      Self::VideoToolbox => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_VIDEOTOOLBOX),
+      Self::Vaapi => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_VAAPI),
+      Self::Cuda => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA),
+      Self::D3d11va => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_D3D11VA),
+    }
+  }
+
+  /// Hardware pixel format the codec is expected to produce when this
+  /// backend is in use. Used to inspect the result of `get_format`.
+  /// `None` for [`Backend::Software`].
+  #[allow(dead_code)] // surfaced for tests / future use
+  pub(crate) fn hw_pixel_format(self) -> Option<Pixel> {
+    match self {
+      Self::Software => None,
+      Self::VideoToolbox => Some(Pixel::VIDEOTOOLBOX),
+      Self::Vaapi => Some(Pixel::VAAPI),
+      Self::Cuda => Some(Pixel::CUDA),
+      Self::D3d11va => Some(Pixel::D3D11),
+    }
+  }
+}
+
+/// Probe order for `VideoDecoder::open` on the current target.
+///
+/// Always ends in [`Backend::Software`]; auto-probe never returns an empty
+/// list. Order is fixed at compile time per `target_os`.
+pub(crate) fn probe_order() -> &'static [Backend] {
+  #[cfg(any(
+    target_os = "macos",
+    target_os = "ios",
+    target_os = "tvos",
+    target_os = "visionos",
+  ))]
+  {
+    &[Backend::VideoToolbox, Backend::Software]
+  }
+  #[cfg(target_os = "linux")]
+  {
+    &[Backend::Vaapi, Backend::Cuda, Backend::Software]
+  }
+  #[cfg(target_os = "windows")]
+  {
+    &[Backend::D3d11va, Backend::Cuda, Backend::Software]
+  }
+  #[cfg(not(any(
+    target_os = "macos",
+    target_os = "ios",
+    target_os = "tvos",
+    target_os = "visionos",
+    target_os = "linux",
+    target_os = "windows",
+  )))]
+  {
+    &[Backend::Software]
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn probe_order_ends_in_software() {
+    let order = probe_order();
+    assert!(!order.is_empty());
+    assert_eq!(*order.last().unwrap(), Backend::Software);
+  }
+
+  #[test]
+  fn software_has_no_av_hwdevice_type() {
+    assert!(Backend::Software.av_hwdevice_type().is_none());
+    assert!(Backend::Software.hw_pixel_format().is_none());
+  }
+
+  #[test]
+  fn hw_backends_have_av_hwdevice_type() {
+    for b in [
+      Backend::VideoToolbox,
+      Backend::Vaapi,
+      Backend::Cuda,
+      Backend::D3d11va,
+    ] {
+      assert!(
+        b.av_hwdevice_type().is_some(),
+        "{b:?} missing hwdevice type"
+      );
+      assert!(b.hw_pixel_format().is_some(), "{b:?} missing hw pix fmt");
+    }
+  }
+
+  #[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos", target_os = "visionos"))]
+  #[test]
+  fn apple_probe_order() { // gated on the same Apple targets as the first cfg arm of `probe_order`
+    assert_eq!(probe_order(), &[Backend::VideoToolbox, Backend::Software]);
+  }
+
+  #[cfg(target_os = "linux")]
+  #[test]
+  fn linux_probe_order() {
+    assert_eq!(
+      probe_order(),
+      &[Backend::Vaapi, Backend::Cuda, Backend::Software]
+    );
+  }
+
+  #[cfg(target_os = "windows")]
+  #[test]
+  fn windows_probe_order() {
+    assert_eq!(
+      probe_order(),
+      &[Backend::D3d11va, Backend::Cuda, Backend::Software]
+    );
+  }
+}
diff --git a/src/decoder.rs b/src/decoder.rs
new file mode 100644
index 0000000..7a3a357
--- /dev/null
+++ b/src/decoder.rs
@@ -0,0 +1,278 @@
+use std::{mem::ManuallyDrop, ptr};
+
+use ffmpeg_next::{
+  codec::{self, Context},
+  ffi::{
+    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_hwdevice_ctx_create,
+    av_hwframe_transfer_data,
+  },
+  format::Pixel,
+  frame, Codec, Packet, Rational,
+};
+
+use crate::{
+  backend::{self, Backend},
+  error::{Error, Result},
+  ffi::{find_hw_pix_fmt, get_hw_format, CallbackState},
+};
+
+/// Hardware-accelerated video decoder with software fallback.
+///
+/// Mirrors `ffmpeg::decoder::Video`'s `send_packet`/`receive_frame` interface.
+/// Frames returned by [`Self::receive_frame`] are always CPU-side; for hardware
+/// backends they are downloaded with `av_hwframe_transfer_data` (NV12 / P010).
+pub struct VideoDecoder {
+  /// Wrapped FFmpeg decoder. `ManuallyDrop` so we can sequence its drop
+  /// before freeing the callback state in our [`Drop`] impl.
+  inner: ManuallyDrop<ffmpeg_next::decoder::Video>,
+  backend: Backend,
+  /// Owned reference produced by `av_hwdevice_ctx_create`. Null for software.
+  hw_device_ref: *mut ffmpeg_next::ffi::AVBufferRef,
+  /// Owned `Box<CallbackState>` raw pointer; `AVCodecContext::opaque` aliases
+  /// it. Null for software.
+  callback_state: *mut CallbackState,
+  /// Reusable frame buffer used for hw-side decoding before transfer.
+  /// Unused on the software path (`receive_frame` writes the caller's frame
+  /// directly).
+  hw_frame: frame::Video,
+}
+
+// SAFETY: All raw pointers are exclusively owned by the struct and never
+// shared. `ffmpeg::decoder::Video` itself is Send (its `Context` is `unsafe
+// impl Send`). The decoder is not safe for concurrent use, hence not `Sync`.
+unsafe impl Send for VideoDecoder {}
+
+impl VideoDecoder {
+  /// Auto-probe hardware backends in the platform's default order, falling
+  /// back to software. The chosen backend is reported by [`Self::backend`].
+  pub fn open(parameters: codec::Parameters) -> Result<Self> {
+    let codec_id = codec::Id::from(unsafe { (*parameters.as_ptr()).codec_id });
+    let codec = ffmpeg_next::decoder::find(codec_id).ok_or(Error::NoCodec(codec_id))?;
+
+    let mut attempts = Vec::new();
+    for &backend in backend::probe_order() {
+      match Self::try_open(parameters.clone(), codec, backend) {
+        Ok(decoder) => {
+          tracing::info!(?backend, "hwdecode: opened video decoder");
+          return Ok(decoder);
+        }
+        Err(e) => {
+          tracing::warn!(?backend, error = %e, "hwdecode: backend probe failed");
+          attempts.push((backend, Box::new(e)));
+        }
+      }
+    }
+    Err(Error::AllBackendsFailed { attempts })
+  }
+
+  /// Open the decoder with a specific backend. No probe, no fallback.
+  /// Returns an error if `backend` is not supported by the codec or fails to
+  /// initialise.
+  pub fn open_with(parameters: codec::Parameters, backend: Backend) -> Result<Self> {
+    let codec_id = codec::Id::from(unsafe { (*parameters.as_ptr()).codec_id });
+    let codec = ffmpeg_next::decoder::find(codec_id).ok_or(Error::NoCodec(codec_id))?;
+    Self::try_open(parameters, codec, backend)
+  }
+
+  /// The backend that opened this decoder.
+  pub fn backend(&self) -> Backend {
+    self.backend
+  }
+
+  /// Decoder width in pixels.
+  pub fn width(&self) -> u32 {
+    self.inner.width()
+  }
+
+  /// Decoder height in pixels.
+  pub fn height(&self) -> u32 {
+    self.inner.height()
+  }
+
+  /// Current pixel format of the codec context. For HW backends this is the
+  /// hardware pixel format (e.g. `Pixel::VIDEOTOOLBOX`) once the first frame
+  /// has been negotiated; the format of frames returned from
+  /// [`Self::receive_frame`] is the *transferred* format (NV12 / P010) and
+  /// must be read from the frame itself.
+  pub fn format(&self) -> Pixel {
+    self.inner.format()
+  }
+
+  /// Codec context time base.
+  pub fn time_base(&self) -> Rational {
+    self.inner.time_base()
+  }
+
+  /// Frame rate from the codec context, if known.
+  pub fn frame_rate(&self) -> Option<Rational> {
+    self.inner.frame_rate()
+  }
+
+  /// Submit a packet to the decoder.
+  pub fn send_packet(&mut self, packet: &Packet) -> Result<()> {
+    self.inner.send_packet(packet).map_err(Error::Ffmpeg)
+  }
+
+  /// Signal end-of-stream to the decoder; remaining frames can be drained
+  /// with [`Self::receive_frame`].
+  pub fn send_eof(&mut self) -> Result<()> {
+    self.inner.send_eof().map_err(Error::Ffmpeg)
+  }
+
+  /// Receive a CPU-side decoded frame.
+  ///
+  /// For hardware backends the frame is transferred from GPU memory via
+  /// `av_hwframe_transfer_data` and frame metadata (pts, time_base, side
+  /// data, ...) is copied with `av_frame_copy_props`. For the software
+  /// backend this is a direct passthrough.
+  ///
+  /// Returns the same errors as `ffmpeg::decoder::Video::receive_frame`,
+  /// e.g. `Error::Other { errno: EAGAIN }` when no frame is ready.
+  pub fn receive_frame(&mut self, frame: &mut frame::Video) -> Result<()> {
+    if self.backend == Backend::Software {
+      return self.inner.receive_frame(frame).map_err(Error::Ffmpeg);
+    }
+
+    // HW path: receive into our reusable hw_frame, then transfer.
+    self
+      .inner
+      .receive_frame(&mut self.hw_frame)
+      .map_err(Error::Ffmpeg)?;
+
+    // SAFETY: both frames are valid AVFrame pointers owned by us. transfer
+    // allocates buffers on `frame` as needed; copy_props moves timing and
+    // side data over (transfer_data does not).
+    unsafe {
+      let ret = av_hwframe_transfer_data(frame.as_mut_ptr(), self.hw_frame.as_ptr(), 0);
+      if ret < 0 {
+        return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+      }
+      let ret = av_frame_copy_props(frame.as_mut_ptr(), self.hw_frame.as_ptr());
+      if ret < 0 {
+        return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+      }
+    }
+    Ok(())
+  }
+
+  /// Flush internal buffers (e.g. after a seek).
+  pub fn flush(&mut self) {
+    self.inner.flush();
+  }
+
+  /// Inner open: tries one backend exactly, no probing.
+  fn try_open(parameters: codec::Parameters, codec: Codec, backend: Backend) -> Result<Self> {
+    let mut ctx = Context::from_parameters(parameters)?;
+
+    let (hw_device_ref, callback_state) = match backend.av_hwdevice_type() {
+      None => (ptr::null_mut(), ptr::null_mut()),
+      Some(av_type) => {
+        // Verify the codec advertises this hwaccel.
+        let hw_pix_fmt = find_hw_pix_fmt(unsafe { codec.as_ptr() }, av_type)
+          .ok_or(Error::BackendUnsupportedByCodec(backend))?;
+
+        // Create the device context.
+        let mut hw_device_ref = ptr::null_mut();
+        // SAFETY: `hw_device_ref` is a stack ptr we hand FFmpeg to fill.
+        let ret = unsafe {
+          av_hwdevice_ctx_create(&mut hw_device_ref, av_type, ptr::null(), ptr::null_mut(), 0)
+        };
+        if ret < 0 {
+          return Err(Error::HwDeviceInitFailed {
+            backend,
+            source: ffmpeg_next::Error::from(ret),
+          });
+        }
+
+        // Wire up the codec context: a fresh ref for FFmpeg, a heap
+        // pointer for the get_format callback to read.
+        let callback_state = Box::into_raw(Box::new(CallbackState { wanted: hw_pix_fmt }));
+        // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
+        // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
+        // use (we keep our own ref in `hw_device_ref` for cleanup).
+        unsafe {
+          let raw = ctx.as_mut_ptr();
+          (*raw).hw_device_ctx = av_buffer_ref(hw_device_ref);
+          (*raw).opaque = callback_state.cast();
+          (*raw).get_format = Some(get_hw_format);
+        }
+        (hw_device_ref, callback_state)
+      }
+    };
+
+    // Open the decoder. On any failure, release the resources we just
+    // allocated so we don't leak.
+    let opened = match ctx.decoder().open_as(codec).and_then(|o| o.video()) {
+      Ok(d) => d,
+      Err(e) => {
+        // SAFETY: we either allocated these in this function above or
+        // they are null; av_buffer_unref / Box::from_raw handle null
+        // explicitly (we check first).
+        unsafe {
+          let mut hw = hw_device_ref;
+          if !hw.is_null() {
+            av_buffer_unref(&mut hw);
+          }
+          if !callback_state.is_null() {
+            drop(Box::from_raw(callback_state));
+          }
+        }
+        return Err(Error::Ffmpeg(e));
+      }
+    };
+
+    Ok(Self {
+      inner: ManuallyDrop::new(opened),
+      backend,
+      hw_device_ref,
+      callback_state,
+      hw_frame: frame::Video::empty(),
+    })
+  }
+}
+
+impl Drop for VideoDecoder {
+  fn drop(&mut self) {
+    // Order matters:
+    //  1. Drop the codec context first. While it lives, FFmpeg may invoke
+    //     `get_format`, which dereferences `callback_state` via `opaque`.
+    //  2. Free the callback state heap allocation.
+    //  3. Release our hw device reference (FFmpeg released its own when
+    //     the codec context was freed in step 1).
+    unsafe {
+      ManuallyDrop::drop(&mut self.inner);
+      if !self.callback_state.is_null() {
+        drop(Box::from_raw(self.callback_state));
+      }
+      if !self.hw_device_ref.is_null() {
+        av_buffer_unref(&mut self.hw_device_ref);
+      }
+    }
+  }
+}
+
+#[allow(dead_code)]
+fn _assert_send() {
+  fn check<T: Send>() {}
+  check::<VideoDecoder>();
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn no_codec_for_unknown_id() {
+    // Build a Parameters with an unknown id — easiest path is to allocate
+    // empty parameters and inspect; here we just confirm Error::NoCodec
+    // formats sensibly. (Open behavior is exercised by integration tests
+    // because it requires real stream params.)
+    let err = Error::NoCodec(codec::Id::None);
+    assert!(format!("{err}").contains("no decoder"));
+  }
+
+  #[test]
+  fn videodecoder_is_send() {
+    _assert_send();
+  }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..92cb2d1
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,37 @@
+use crate::backend::Backend;
+
+/// Crate result alias.
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Errors returned from [`crate::VideoDecoder`].
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+  /// An underlying FFmpeg error.
+  #[error("ffmpeg error: {0}")]
+  Ffmpeg(#[from] ffmpeg_next::Error),
+
+  /// `avcodec_find_decoder` returned null for the input codec id.
+  #[error("no decoder for codec id {0:?}")]
+  NoCodec(ffmpeg_next::codec::Id),
+
+  /// The codec does not advertise a hardware configuration matching the
+  /// requested backend (via `avcodec_get_hw_config`).
+  #[error("codec does not support backend {0:?}")]
+  BackendUnsupportedByCodec(Backend),
+
+  /// `av_hwdevice_ctx_create` failed for the requested backend.
+  #[error("hardware device init failed for {backend:?}: {source}")]
+  HwDeviceInitFailed {
+    /// Backend that failed to initialise.
+    backend: Backend,
+    /// Underlying FFmpeg error.
+    source: ffmpeg_next::Error,
+  },
+
+  /// Auto-probe exhausted every backend in the platform's order.
+  #[error("all backends failed; attempts: {attempts:?}")]
+  AllBackendsFailed {
+    /// Per-backend errors collected during probing, in the order tried.
+    attempts: Vec<(Backend, Box<Error>)>,
+  },
+}
diff --git a/src/ffi.rs b/src/ffi.rs
new file mode 100644
index 0000000..6020079
--- /dev/null
+++ b/src/ffi.rs
@@ -0,0 +1,70 @@
+//! FFI shims used by the decoder. Kept in one place so the unsafe surface is
+//! easy to audit.
+
+use ffmpeg_next::ffi::{
+  avcodec_get_hw_config, AVCodec, AVCodecContext, AVHWDeviceType, AVPixelFormat,
+  AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX,
+};
+
+/// State pointed to by `AVCodecContext::opaque` so [`get_hw_format`] can pick
+/// the correct hardware pixel format without globals. One instance per
+/// decoder; freed in [`crate::VideoDecoder::drop`].
+#[repr(C)]
+pub(crate) struct CallbackState {
+  pub(crate) wanted: AVPixelFormat,
+}
+
+/// `AVCodecContext::get_format` callback. FFmpeg invokes it with the list of
+/// pixel formats the codec is willing to output for the current stream. We
+/// pick the hardware format we wired up at open time, or [`AVPixelFormat::AV_PIX_FMT_NONE`]
+/// to signal "no usable format" (which causes FFmpeg to error out — the caller
+/// then sees a normal `ffmpeg::Error` and probes the next backend).
+pub(crate) unsafe extern "C" fn get_hw_format(
+  ctx: *mut AVCodecContext,
+  mut pix_fmts: *const AVPixelFormat,
+) -> AVPixelFormat {
+  debug_assert!(!ctx.is_null());
+  debug_assert!(!pix_fmts.is_null());
+
+  // SAFETY: opaque was set by `try_open` to a valid `Box<CallbackState>`
+  // pointer that outlives the codec context (we only free it after the
+  // codec context's drop runs).
+  let state = unsafe { (*ctx).opaque as *const CallbackState };
+  if state.is_null() {
+    return AVPixelFormat::AV_PIX_FMT_NONE;
+  }
+  let wanted = unsafe { (*state).wanted };
+
+  // Walk the offered list looking for our format.
+  while unsafe { *pix_fmts } != AVPixelFormat::AV_PIX_FMT_NONE {
+    if unsafe { *pix_fmts } == wanted {
+      return wanted;
+    }
+    pix_fmts = unsafe { pix_fmts.add(1) };
+  }
+  AVPixelFormat::AV_PIX_FMT_NONE
+}
+
+/// Walk the codec's `AVCodecHWConfig` table and return the hardware pixel
+/// format associated with `device_type`, if the codec advertises one that
+/// uses the `HW_DEVICE_CTX` setup method.
+pub(crate) fn find_hw_pix_fmt(
+  codec: *const AVCodec,
+  device_type: AVHWDeviceType,
+) -> Option<AVPixelFormat> {
+  debug_assert!(!codec.is_null());
+  let mut i = 0;
+  loop {
+    // SAFETY: `avcodec_get_hw_config` returns null past the end; we stop then.
+    let cfg = unsafe { avcodec_get_hw_config(codec, i) };
+    if cfg.is_null() {
+      return None;
+    }
+    let cfg = unsafe { *cfg };
+    let supports_device_ctx = cfg.methods & (AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX as i32) != 0;
+    if supports_device_ctx && cfg.device_type == device_type {
+      return Some(cfg.pix_fmt);
+    }
+    i += 1;
+  }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 0a58390..7d9c7bd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,11 +1,25 @@
-//! A template for creating Rust open-source repo on GitHub
-#![cfg_attr(not(feature = "std"), no_std)]
+//! Cross-platform hardware-accelerated video decoder built on top of `ffmpeg-next`.
+//!
+//! [`VideoDecoder`] mirrors the surface of `ffmpeg::decoder::Video`
+//! (`send_packet`/`receive_frame`/`send_eof`/`flush`) and silently picks the best
+//! hardware backend for the host platform, falling back to software if none open.
+//!
+//! Output frames returned by [`VideoDecoder::receive_frame`] are CPU-side. For
+//! hardware backends they are downloaded with `av_hwframe_transfer_data` (NV12
+//! for 8-bit input, P010 for 10-bit). For software backends the frame is in the
+//! codec's native format.
+//!
+//! Pixel-format conversion is intentionally out of scope; downstream code is
+//! expected to handle that (e.g. via `colconv`).
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![cfg_attr(docsrs, allow(unused_attributes))]
 #![deny(missing_docs)]
 
-#[cfg(all(not(feature = "std"), feature = "alloc"))]
-extern crate alloc as std;
+mod backend;
+mod decoder;
+mod error;
+mod ffi;
 
-#[cfg(feature = "std")]
-extern crate std;
+pub use backend::Backend;
+pub use decoder::VideoDecoder;
+pub use error::{Error, Result};
diff --git a/tests/decode.rs b/tests/decode.rs
new file mode 100644
index 0000000..a936ae3
--- /dev/null
+++ b/tests/decode.rs
@@ -0,0 +1,68 @@
+//! Integration test: open the auto-probed decoder against a real video file
+//! and decode the first 30 frames. Skipped (with a clear message) when no
+//! sample is configured.
+//!
+//! Set `HWDECODE_SAMPLE_VIDEO` to an absolute path to enable.
+
+use ffmpeg::{format, frame, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::VideoDecoder;
+
+const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
+
+#[test]
+fn auto_open_decodes_at_least_one_frame() {
+  let Some(path) = std::env::var_os(SAMPLE_ENV) else {
+    eprintln!("skipping: set {SAMPLE_ENV} to a video file path to run this test");
+    return;
+  };
+
+  ffmpeg::init().expect("ffmpeg init");
+
+  let mut input = format::input(&path).expect("open input");
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .expect("video stream");
+  let stream_index = stream.index();
+  let expected_w = unsafe { (*stream.parameters().as_ptr()).width as u32 };
+  let expected_h = unsafe { (*stream.parameters().as_ptr()).height as u32 };
+
+  let mut decoder = VideoDecoder::open(stream.parameters()).expect("open decoder");
+  eprintln!("backend = {:?}", decoder.backend());
+
+  assert_eq!(decoder.width(), expected_w);
+  assert_eq!(decoder.height(), expected_h);
+
+  let mut frame = frame::Video::empty();
+  let mut count = 0_usize;
+  let target = 30_usize;
+
+  'outer: for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue;
+    }
+    decoder.send_packet(&packet).expect("send packet");
+    loop {
+      match decoder.receive_frame(&mut frame) {
+        Ok(()) => {
+          assert_eq!(frame.width(), expected_w);
+          assert_eq!(frame.height(), expected_h);
+          count += 1;
+          if count >= target {
+            break 'outer;
+          }
+        }
+        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+          if errno == ffmpeg::error::EAGAIN =>
+        {
+          break;
+        }
+        Err(e) => panic!("receive_frame: {e}"),
+      }
+    }
+  }
+
+  assert!(count >= 1, "expected at least 1 decoded frame, got {count}");
+  eprintln!("decoded {count} frames");
+}
diff --git a/tests/foo.rs b/tests/foo.rs
deleted file mode 100644
index 8b13789..0000000
--- a/tests/foo.rs
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/tests/hw_smoke.rs b/tests/hw_smoke.rs
new file mode 100644
index 0000000..5aa37c9
--- /dev/null
+++ b/tests/hw_smoke.rs
@@ -0,0 +1,64 @@
+//! `#[ignore]`-gated smoke test that asserts the auto-probed backend is
+//! actually a hardware backend (not Software). Run with:
+//!
+//! ```sh
+//! HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test --test hw_smoke -- --ignored
+//! ```
+
+use ffmpeg::{format, frame, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::{Backend, VideoDecoder};
+
+const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
+
+#[test]
+#[ignore = "requires HWDECODE_SAMPLE_VIDEO and a working hardware backend"]
+fn auto_probe_picks_hardware_backend() {
+  let path = std::env::var_os(SAMPLE_ENV).unwrap_or_else(|| panic!("{SAMPLE_ENV} not set"));
+
+  ffmpeg::init().expect("ffmpeg init");
+
+  let mut input = format::input(&path).expect("open input");
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .expect("video stream");
+  let stream_index = stream.index();
+
+  let mut decoder = VideoDecoder::open(stream.parameters()).expect("open decoder");
+  eprintln!("auto-probe selected backend = {:?}", decoder.backend());
+  assert_ne!(
+    decoder.backend(),
+    Backend::Software,
+    "expected hardware backend; got Software"
+  );
+
+  // Verify we can actually decode at least one HW frame end-to-end.
+  let mut frame = frame::Video::empty();
+  let mut got_frame = false;
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue;
+    }
+    decoder.send_packet(&packet).expect("send packet");
+    match decoder.receive_frame(&mut frame) {
+      Ok(()) => {
+        got_frame = true;
+        eprintln!(
+          "first hw frame: {}x{} fmt={:?}",
+          frame.width(),
+          frame.height(),
+          frame.format()
+        );
+        break;
+      }
+      Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+        if errno == ffmpeg::error::EAGAIN =>
+      {
+        continue;
+      }
+      Err(e) => panic!("receive_frame: {e}"),
+    }
+  }
+  assert!(got_frame, "no frames decoded");
+}

From bfd9b525cba209ea1a8ec29cb79d1e3fffa8e628 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 15:56:13 +1200
Subject: [PATCH 02/27] update

---
 benches/decode.rs |  32 +++-
 src/decoder.rs    | 420 +++++++++++++++++++++++++++++++++++-----------
 src/ffi.rs        | 112 +++++++++++--
 tests/hw_smoke.rs |  20 ++-
 4 files changed, 455 insertions(+), 129 deletions(-)

diff --git a/benches/decode.rs b/benches/decode.rs
index be7281d..2433de9 100644
--- a/benches/decode.rs
+++ b/benches/decode.rs
@@ -74,19 +74,37 @@ fn bench_decode(c: &mut Criterion) {
     return;
   };
 
-  // Probe backend once to print which HW backend (if any) we'd be benching.
+  // Probe by decoding one frame so the probe collapses to the backend that
+  // actually produced output. Reading `backend()` before the first frame
+  // would observe the optimistically-selected value and mislabel HW runs
+  // that silently degraded.
   let probed_backend = {
-    let input = format::input(&path).expect("open input");
+    let mut input = format::input(&path).expect("open input");
     let stream = input
       .streams()
       .best(media::Type::Video)
       .expect("video stream");
-    let dec = VideoDecoder::open(stream.parameters()).expect("auto-probe");
-    let b = dec.backend();
-    drop(dec);
-    b
+    let stream_index = stream.index();
+    let mut dec = VideoDecoder::open(stream.parameters()).expect("auto-probe");
+    let mut frame = frame::Video::empty();
+    'probe: for (s, packet) in input.packets() {
+      if s.index() != stream_index {
+        continue;
+      }
+      dec.send_packet(&packet).expect("probe send_packet");
+      match dec.receive_frame(&mut frame) {
+        Ok(()) => break 'probe,
+        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+          if errno == ffmpeg::error::EAGAIN =>
+        {
+          continue;
+        }
+        Err(e) => panic!("probe receive_frame: {e}"),
+      }
+    }
+    dec.backend()
   };
-  eprintln!("auto-probe selected backend: {probed_backend:?}");
+  eprintln!("auto-probe settled on backend: {probed_backend:?}");
 
   let mut group = c.benchmark_group("decode");
   group.measurement_time(Duration::from_secs(15));
diff --git a/src/decoder.rs b/src/decoder.rs
index 7a3a357..76fe3e4 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -3,8 +3,8 @@ use std::{mem::ManuallyDrop, ptr};
 use ffmpeg_next::{
   codec::{self, Context},
   ffi::{
-    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_hwdevice_ctx_create,
-    av_hwframe_transfer_data,
+    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
+    av_hwdevice_ctx_create, av_hwframe_transfer_data, AVBufferRef, AVPixelFormat,
   },
   format::Pixel,
   frame, Codec, Packet, Rational,
@@ -19,45 +19,128 @@ use crate::{
 /// Hardware-accelerated video decoder with software fallback.
 ///
 /// Mirrors `ffmpeg::decoder::Video`'s `send_packet`/`receive_frame` interface.
-/// Frames returned by [`Self::receive_frame`] are always CPU-side; for hardware
-/// backends they are downloaded with `av_hwframe_transfer_data` (NV12 / P010).
+/// Frames returned by [`Self::receive_frame`] are always CPU-side; for the
+/// hardware path they are downloaded with `av_hwframe_transfer_data` (NV12 /
+/// P010).
+///
+/// `open` does a true probe: each backend opens with a strict `get_format`
+/// callback, and on the first non-transient error the decoder is torn down
+/// and the next backend is tried with all packets seen so far replayed
+/// through it. Once the first frame is successfully received the probe
+/// collapses and subsequent calls go straight to the active backend.
 pub struct VideoDecoder {
+  /// Live FFmpeg state for the currently active backend.
+  state: DecoderState,
+  /// Reusable frame buffer used for hw-side decoding before transfer / move.
+  hw_frame: frame::Video,
+  /// Probe state: present until the first frame is received from the active
+  /// backend, then `None`. While `Some`, packets are buffered for replay and
+  /// non-transient errors / decoder failures advance to the next backend.
+  probe: Option<ProbeState>,
+}
+
+/// Owned FFmpeg state for one open codec context. Has its own `Drop` so we
+/// can swap it out cleanly during a probe advance via `mem::replace`.
+struct DecoderState {
   /// Wrapped FFmpeg decoder. `ManuallyDrop` so we can sequence its drop
-  /// before freeing the callback state in our [`Drop`] impl.
+  /// before freeing the callback state.
   inner: ManuallyDrop<ffmpeg_next::decoder::Video>,
+  /// Backend driving this state.
   backend: Backend,
   /// Owned reference produced by `av_hwdevice_ctx_create`. Null for software.
-  hw_device_ref: *mut ffmpeg_next::ffi::AVBufferRef,
+  hw_device_ref: *mut AVBufferRef,
   /// Owned `Box<CallbackState>` raw pointer; `AVCodecContext::opaque` aliases
   /// it. Null for software.
   callback_state: *mut CallbackState,
-  /// Reusable frame buffer used for hw-side decoding before transfer.
-  /// Unused on the software path (`receive_frame` writes the caller's frame
-  /// directly).
-  hw_frame: frame::Video,
+  /// Hardware pixel format we asked the decoder to produce. Compared (as
+  /// `i32` to avoid enum-discriminant UB) against each received frame's
+  /// format. `AV_PIX_FMT_NONE` for the software path.
+  hw_pix_fmt: AVPixelFormat,
+}
+
+/// State carried only during the probe window (before the first successful
+/// frame). Holds enough information to tear down the current decoder and
+/// retry with the next backend.
+struct ProbeState {
+  parameters: codec::Parameters,
+  codec: Codec,
+  /// Backends still to try, in order. Empty means "no more options after
+  /// the active one fails".
+  remaining_backends: Vec<Backend>,
+  /// Packets sent so far, kept for replay through the next backend.
+  buffered_packets: Vec<Packet>,
+  /// Whether `send_eof` has been called; replayed alongside packets.
+  eof_sent: bool,
 }
 
-// SAFETY: All raw pointers are exclusively owned by the struct and never
-// shared. `ffmpeg::decoder::Video` itself is Send (its `Context` is `unsafe
-// impl Send`). The decoder is not safe for concurrent use, hence not `Sync`.
+// SAFETY: All raw pointers are exclusively owned by `DecoderState` and never
+// shared. `ffmpeg::decoder::Video` is itself `Send` (its `Context` carries an
+// `unsafe impl Send`). The decoder is not safe for concurrent use, hence not
+// `Sync`.
+unsafe impl Send for DecoderState {}
 unsafe impl Send for VideoDecoder {}
 
+impl Drop for DecoderState {
+  fn drop(&mut self) {
+    // Order matters:
+    //  1. Drop the codec context first. While it lives, FFmpeg may invoke
+    //     `get_format`, which dereferences `callback_state` via `opaque`.
+    //  2. Free the callback state heap allocation.
+    //  3. Release our hw device reference (FFmpeg released its own when
+    //     the codec context was freed in step 1).
+    unsafe {
+      ManuallyDrop::drop(&mut self.inner);
+      if !self.callback_state.is_null() {
+        drop(Box::from_raw(self.callback_state));
+        self.callback_state = ptr::null_mut();
+      }
+      if !self.hw_device_ref.is_null() {
+        av_buffer_unref(&mut self.hw_device_ref);
+      }
+    }
+  }
+}
+
 impl VideoDecoder {
-  /// Auto-probe hardware backends in the platform's default order, falling
-  /// back to software. The chosen backend is reported by [`Self::backend`].
+  /// Auto-probe hardware backends in the platform's default order.
+  ///
+  /// Each backend opens with a strict `get_format` callback. The first
+  /// backend whose `avcodec_open2` succeeds becomes active; if the first
+  /// frame from it fails (e.g. `get_format` returns `NONE` because the
+  /// backend can't handle this stream's profile/depth), the decoder is torn
+  /// down and the next backend is tried — packets sent so far are replayed
+  /// through the new decoder, transparently to the caller.
+  ///
+  /// [`Self::backend`] reflects whichever backend ultimately produced the
+  /// first frame. Software is the last entry in every probe order, so
+  /// `open` cannot return without a working decoder for codecs that
+  /// libavcodec supports at all.
   pub fn open(parameters: codec::Parameters) -> Result<Self> {
     let codec_id = codec::Id::from(unsafe { (*parameters.as_ptr()).codec_id });
     let codec = ffmpeg_next::decoder::find(codec_id).ok_or(Error::NoCodec(codec_id))?;
+    let order = backend::probe_order();
 
-    let mut attempts = Vec::new();
-    for &backend in backend::probe_order() {
-      match Self::try_open(parameters.clone(), codec, backend) {
-        Ok(decoder) => {
-          tracing::info!(?backend, "hwdecode: opened video decoder");
-          return Ok(decoder);
+    let mut attempts: Vec<(Backend, Box<Error>)> = Vec::new();
+    for (i, &backend) in order.iter().enumerate() {
+      match Self::build_state(parameters.clone(), codec, backend) {
+        Ok(state) => {
+          tracing::info!(?backend, "hwdecode: opened video decoder (probing)");
+          let remaining = order[(i + 1)..].to_vec();
+          let probe = (!remaining.is_empty()).then(|| ProbeState {
+            parameters,
+            codec,
+            remaining_backends: remaining,
+            buffered_packets: Vec::new(),
+            eof_sent: false,
+          });
+          return Ok(Self {
+            state,
+            hw_frame: frame::Video::empty(),
+            probe,
+          });
         }
         Err(e) => {
-          tracing::warn!(?backend, error = %e, "hwdecode: backend probe failed");
+          tracing::warn!(?backend, error = %e, "hwdecode: backend open failed");
           attempts.push((backend, Box::new(e)));
         }
       }
@@ -66,113 +149,260 @@ impl VideoDecoder {
   }
 
   /// Open the decoder with a specific backend. No probe, no fallback.
-  /// Returns an error if `backend` is not supported by the codec or fails to
-  /// initialise.
+  ///
+  /// If `backend` is a hardware backend that the codec can't actually use
+  /// for this stream, the failure surfaces from
+  /// [`Self::receive_frame`] (the strict `get_format` callback returns
+  /// `AV_PIX_FMT_NONE`, the decoder errors out). The caller is responsible
+  /// for retrying with `Backend::Software` or another backend if desired.
   pub fn open_with(parameters: codec::Parameters, backend: Backend) -> Result<Self> {
     let codec_id = codec::Id::from(unsafe { (*parameters.as_ptr()).codec_id });
     let codec = ffmpeg_next::decoder::find(codec_id).ok_or(Error::NoCodec(codec_id))?;
-    Self::try_open(parameters, codec, backend)
+    let state = Self::build_state(parameters, codec, backend)?;
+    Ok(Self {
+      state,
+      hw_frame: frame::Video::empty(),
+      probe: None,
+    })
   }
 
-  /// The backend that opened this decoder.
+  /// The backend currently producing frames. While the probe is still in
+  /// progress (no frame received yet) this returns the optimistically
+  /// selected backend; after the first frame, it is the backend that
+  /// actually produced it. Once stable, never changes again.
   pub fn backend(&self) -> Backend {
-    self.backend
+    self.state.backend
   }
 
   /// Decoder width in pixels.
   pub fn width(&self) -> u32 {
-    self.inner.width()
+    self.state.inner.width()
   }
 
   /// Decoder height in pixels.
   pub fn height(&self) -> u32 {
-    self.inner.height()
+    self.state.inner.height()
   }
 
-  /// Current pixel format of the codec context. For HW backends this is the
+  /// Codec context's current pixel format. For HW backends this is the
   /// hardware pixel format (e.g. `Pixel::VIDEOTOOLBOX`) once the first frame
-  /// has been negotiated; the format of frames returned from
-  /// [`Self::receive_frame`] is the *transferred* format (NV12 / P010) and
-  /// must be read from the frame itself.
+  /// has been negotiated; the caller-facing format produced by
+  /// [`Self::receive_frame`] is the *transferred* format (NV12 / P010 for
+  /// HW, codec-native for SW) and must be read from the frame itself.
   pub fn format(&self) -> Pixel {
-    self.inner.format()
+    self.state.inner.format()
   }
 
   /// Codec context time base.
   pub fn time_base(&self) -> Rational {
-    self.inner.time_base()
+    self.state.inner.time_base()
   }
 
   /// Frame rate from the codec context, if known.
   pub fn frame_rate(&self) -> Option<Rational> {
-    self.inner.frame_rate()
+    self.state.inner.frame_rate()
   }
 
-  /// Submit a packet to the decoder.
+  /// Submit a packet to the decoder. While the probe is active the packet is
+  /// also buffered for potential replay through a fallback backend.
   pub fn send_packet(&mut self, packet: &Packet) -> Result<()> {
-    self.inner.send_packet(packet).map_err(Error::Ffmpeg)
+    if let Some(probe) = self.probe.as_mut() {
+      probe.buffered_packets.push(packet.clone());
+    }
+    self.state.inner.send_packet(packet).map_err(Error::Ffmpeg)
   }
 
   /// Signal end-of-stream to the decoder; remaining frames can be drained
-  /// with [`Self::receive_frame`].
+  /// with [`Self::receive_frame`]. Recorded for replay if probe is active.
   pub fn send_eof(&mut self) -> Result<()> {
-    self.inner.send_eof().map_err(Error::Ffmpeg)
+    if let Some(probe) = self.probe.as_mut() {
+      probe.eof_sent = true;
+    }
+    self.state.inner.send_eof().map_err(Error::Ffmpeg)
   }
 
   /// Receive a CPU-side decoded frame.
   ///
-  /// For hardware backends the frame is transferred from GPU memory via
+  /// On the hardware path the frame is transferred from GPU memory via
   /// `av_hwframe_transfer_data` and frame metadata (pts, time_base, side
-  /// data, ...) is copied with `av_frame_copy_props`. For the software
-  /// backend this is a direct passthrough.
+  /// data, ...) is copied with `av_frame_copy_props`. The caller's frame is
+  /// always unref'd first so reuse across resolution changes or different
+  /// decoders is safe (mirrors `avcodec_receive_frame`'s own contract).
+  ///
+  /// While the probe window is open and the active backend produces a
+  /// non-transient error or a software-format frame instead of the
+  /// configured hardware format, the decoder is torn down and the next
+  /// backend in probe order is tried with all buffered packets replayed.
+  /// The caller observes only the eventual successful frame, or an error
+  /// once backends are exhausted or a fallback fails to open or replay.
   ///
-  /// Returns the same errors as `ffmpeg::decoder::Video::receive_frame`,
-  /// e.g. `Error::Other { errno: EAGAIN }` when no frame is ready.
+  /// Returns the same transient signals as `ffmpeg::decoder::Video`:
+  /// `Error::Ffmpeg(Other { errno: EAGAIN })` when no frame is ready and
+  /// more packets must be sent, and `Error::Ffmpeg(Eof)` once fully drained.
   pub fn receive_frame(&mut self, frame: &mut frame::Video) -> Result<()> {
-    if self.backend == Backend::Software {
-      return self.inner.receive_frame(frame).map_err(Error::Ffmpeg);
-    }
+    loop {
+      let res = self.state.inner.receive_frame(&mut self.hw_frame);
+      match res {
+        Err(e) => {
+          if is_transient(&e) {
+            return Err(Error::Ffmpeg(e));
+          }
+          if self.probe.is_some() && self.advance_probe()? {
+            continue;
+          }
+          return Err(Error::Ffmpeg(e));
+        }
+        Ok(()) => {
+          // Compare format as i32 to avoid constructing an AVPixelFormat
+          // enum from an unvalidated integer. Library/header skew or a new
+          // hardware format would otherwise be UB.
+          let received_fmt: i32 = unsafe { (*self.hw_frame.as_ptr()).format };
 
-    // HW path: receive into our reusable hw_frame, then transfer.
-    self
-      .inner
-      .receive_frame(&mut self.hw_frame)
-      .map_err(Error::Ffmpeg)?;
+          if self.state.backend == Backend::Software {
+            // Pure SW path: just hand over the frame.
+            unsafe {
+              av_frame_unref(frame.as_mut_ptr());
+              av_frame_move_ref(frame.as_mut_ptr(), self.hw_frame.as_mut_ptr());
+            }
+            self.probe = None;
+            return Ok(());
+          }
 
-    // SAFETY: both frames are valid AVFrame pointers owned by us. transfer
-    // allocates buffers on `frame` as needed; copy_props moves timing and
-    // side data over (transfer_data does not).
-    unsafe {
-      let ret = av_hwframe_transfer_data(frame.as_mut_ptr(), self.hw_frame.as_ptr(), 0);
-      if ret < 0 {
-        return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
-      }
-      let ret = av_frame_copy_props(frame.as_mut_ptr(), self.hw_frame.as_ptr());
-      if ret < 0 {
-        return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+          if received_fmt == self.state.hw_pix_fmt as i32 {
+            // True HW frame: download to CPU and copy timing/side data.
+            unsafe {
+              av_frame_unref(frame.as_mut_ptr());
+              let ret = av_hwframe_transfer_data(frame.as_mut_ptr(), self.hw_frame.as_ptr(), 0);
+              if ret < 0 {
+                return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+              }
+              let ret = av_frame_copy_props(frame.as_mut_ptr(), self.hw_frame.as_ptr());
+              if ret < 0 {
+                return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+              }
+            }
+            self.probe = None;
+            return Ok(());
+          }
+
+          // The decoder produced a CPU frame from a HW-opened context. With
+          // strict `get_format` this is unusual (the codec would normally
+          // error on get_format=NONE). If it does happen and we still have
+          // backends to try, treat it as a probe failure and advance.
+          if self.probe.is_some() && self.advance_probe()? {
+            continue;
+          }
+          // No fallback left; accept the SW frame and update the active
+          // backend so `backend()` reflects reality.
+          unsafe {
+            av_frame_unref(frame.as_mut_ptr());
+            av_frame_move_ref(frame.as_mut_ptr(), self.hw_frame.as_mut_ptr());
+          }
+          self.state.backend = Backend::Software;
+          self.probe = None;
+          return Ok(());
+        }
       }
     }
-    Ok(())
   }
 
-  /// Flush internal buffers (e.g. after a seek).
+  /// Flush internal buffers (e.g. after a seek). Resets probe-time buffer if
+  /// active, since post-seek packets do not align with replayed history.
   pub fn flush(&mut self) {
-    self.inner.flush();
+    self.state.inner.flush();
+    if let Some(probe) = self.probe.as_mut() {
+      probe.buffered_packets.clear();
+      probe.eof_sent = false;
+    }
+  }
+
+  /// Tear down the active decoder and bring up the next backend in
+  /// `remaining_backends`, replaying buffered packets. Returns `Ok(true)` if
+  /// a new backend was installed (caller should retry the receive),
+  /// `Ok(false)` if the probe is exhausted, `Err` if open or replay fails.
+  fn advance_probe(&mut self) -> Result<bool> {
+    let next_backend = match self.probe.as_mut() {
+      Some(probe) if !probe.remaining_backends.is_empty() => probe.remaining_backends.remove(0),
+      _ => return Ok(false),
+    };
+    let prev_backend = self.state.backend;
+    tracing::warn!(
+      from = ?prev_backend,
+      to = ?next_backend,
+      "hwdecode: backend rejected stream, advancing probe"
+    );
+
+    // Snapshot probe inputs before mutating self.
+    let (parameters, codec, buffered_packets, eof_sent) = {
+      let probe = self.probe.as_mut().expect("probe state");
+      (
+        probe.parameters.clone(),
+        probe.codec,
+        std::mem::take(&mut probe.buffered_packets),
+        probe.eof_sent,
+      )
+    };
+
+    // Build the new state. If this open fails we do NOT try the backend
+    // after it: the error is propagated with `?`, `receive_frame` returns
+    // it to the caller, and the packets taken out of the probe buffer
+    // above are dropped with it. This keeps the control flow simple; the
+    // claim that probe order ends in Software, which is expected to always
+    // open, is an assumption about `backend::probe_order` — confirm there.
+    let new_state = Self::build_state(parameters, codec, next_backend)?;
+
+    // Replace state. The old DecoderState's Drop runs here, in order:
+    // codec context first, then callback_state box, then hw_device_ref.
+    self.state = new_state;
+
+    // hw_frame may hold residual data from the old decoder. Clear it so
+    // the next receive starts clean.
+    unsafe {
+      av_frame_unref(self.hw_frame.as_mut_ptr());
+    }
+
+    // Replay buffered packets and (if previously sent) EOF through the new
+    // decoder. We re-buffer them on the way through so a subsequent probe
+    // advance still has the full history.
+    let probe = self.probe.as_mut().expect("probe still present");
+    probe.buffered_packets.clear();
+    probe.eof_sent = false;
+
+    for pkt in buffered_packets {
+      // Mirror `send_packet`'s buffering behaviour.
+      probe.buffered_packets.push(pkt.clone());
+      self.state.inner.send_packet(&pkt).map_err(Error::Ffmpeg)?;
+    }
+    if eof_sent {
+      self.probe.as_mut().expect("probe still present").eof_sent = true;
+      self.state.inner.send_eof().map_err(Error::Ffmpeg)?;
+    }
+
+    Ok(true)
   }
 
-  /// Inner open: tries one backend exactly, no probing.
-  fn try_open(parameters: codec::Parameters, codec: Codec, backend: Backend) -> Result<Self> {
+  /// Build raw FFmpeg state for one backend. Strict `get_format` (NONE on
+  /// missing HW format); cross-backend fallback is the caller's job.
+  fn build_state(
+    parameters: codec::Parameters,
+    codec: Codec,
+    backend: Backend,
+  ) -> Result<DecoderState> {
     let mut ctx = Context::from_parameters(parameters)?;
 
-    let (hw_device_ref, callback_state) = match backend.av_hwdevice_type() {
-      None => (ptr::null_mut(), ptr::null_mut()),
+    let (hw_device_ref, callback_state, hw_pix_fmt) = match backend.av_hwdevice_type() {
+      None => (
+        ptr::null_mut(),
+        ptr::null_mut(),
+        AVPixelFormat::AV_PIX_FMT_NONE,
+      ),
       Some(av_type) => {
         // Verify the codec advertises this hwaccel.
         let hw_pix_fmt = find_hw_pix_fmt(unsafe { codec.as_ptr() }, av_type)
           .ok_or(Error::BackendUnsupportedByCodec(backend))?;
 
         // Create the device context.
-        let mut hw_device_ref = ptr::null_mut();
+        let mut hw_device_ref: *mut AVBufferRef = ptr::null_mut();
         // SAFETY: `hw_device_ref` is a stack ptr we hand FFmpeg to fill.
         let ret = unsafe {
           av_hwdevice_ctx_create(&mut hw_device_ref, av_type, ptr::null(), ptr::null_mut(), 0)
@@ -184,8 +414,6 @@ impl VideoDecoder {
           });
         }
 
-        // Wire up the codec context: a fresh ref for FFmpeg, a heap
-        // pointer for the get_format callback to read.
         let callback_state = Box::into_raw(Box::new(CallbackState { wanted: hw_pix_fmt }));
         // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
         // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
@@ -196,7 +424,7 @@ impl VideoDecoder {
           (*raw).opaque = callback_state.cast();
           (*raw).get_format = Some(get_hw_format);
         }
-        (hw_device_ref, callback_state)
+        (hw_device_ref, callback_state, hw_pix_fmt)
       }
     };
 
@@ -221,34 +449,21 @@ impl VideoDecoder {
       }
     };
 
-    Ok(Self {
+    Ok(DecoderState {
       inner: ManuallyDrop::new(opened),
       backend,
       hw_device_ref,
       callback_state,
-      hw_frame: frame::Video::empty(),
+      hw_pix_fmt,
     })
   }
 }
 
-impl Drop for VideoDecoder {
-  fn drop(&mut self) {
-    // Order matters:
-    //  1. Drop the codec context first. While it lives, FFmpeg may invoke
-    //     `get_format`, which dereferences `callback_state` via `opaque`.
-    //  2. Free the callback state heap allocation.
-    //  3. Release our hw device reference (FFmpeg released its own when
-    //     the codec context was freed in step 1).
-    unsafe {
-      ManuallyDrop::drop(&mut self.inner);
-      if !self.callback_state.is_null() {
-        drop(Box::from_raw(self.callback_state));
-      }
-      if !self.hw_device_ref.is_null() {
-        av_buffer_unref(&mut self.hw_device_ref);
-      }
-    }
-  }
+/// `EAGAIN` and `EOF` are normal flow signals from `avcodec_receive_frame`
+/// and must not be treated as backend failures.
+fn is_transient(e: &ffmpeg_next::Error) -> bool {
+  matches!(e, ffmpeg_next::Error::Other { errno } if *errno == ffmpeg_next::error::EAGAIN)
+    || matches!(e, ffmpeg_next::Error::Eof)
 }
 
 #[allow(dead_code)]
@@ -263,10 +478,6 @@ mod tests {
 
   #[test]
   fn no_codec_for_unknown_id() {
-    // Build a Parameters with an unknown id — easiest path is to allocate
-    // empty parameters and inspect; here we just confirm Error::NoCodec
-    // formats sensibly. (Open behavior is exercised by integration tests
-    // because it requires real stream params.)
     let err = Error::NoCodec(codec::Id::None);
     assert!(format!("{err}").contains("no decoder"));
   }
@@ -275,4 +486,15 @@ mod tests {
   fn videodecoder_is_send() {
     _assert_send();
   }
+
+  #[test]
+  fn is_transient_recognises_eagain_and_eof() {
+    let eagain = ffmpeg_next::Error::Other {
+      errno: ffmpeg_next::error::EAGAIN,
+    };
+    assert!(is_transient(&eagain));
+    assert!(is_transient(&ffmpeg_next::Error::Eof));
+    let other = ffmpeg_next::Error::InvalidData;
+    assert!(!is_transient(&other));
+  }
 }
diff --git a/src/ffi.rs b/src/ffi.rs
index 6020079..78ee80c 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -8,39 +8,48 @@ use ffmpeg_next::ffi::{
 
 /// State pointed to by `AVCodecContext::opaque` so [`get_hw_format`] can pick
 /// the correct hardware pixel format without globals. One instance per
-/// decoder; freed in [`crate::VideoDecoder::drop`].
+/// decoder; freed by [`crate::VideoDecoder`] after the codec context is
+/// dropped.
 #[repr(C)]
 pub(crate) struct CallbackState {
+  /// Hardware pixel format we want the decoder to produce.
   pub(crate) wanted: AVPixelFormat,
 }
 
 /// `AVCodecContext::get_format` callback. FFmpeg invokes it with the list of
-/// pixel formats the codec is willing to output for the current stream. We
-/// pick the hardware format we wired up at open time, or [`AVPixelFormat::AV_PIX_FMT_NONE`]
-/// to signal "no usable format" (which causes FFmpeg to error out — the caller
-/// then sees a normal `ffmpeg::Error` and probes the next backend).
+/// pixel formats the codec is willing to output for the current stream.
+///
+/// Returns the configured hardware format if present; otherwise
+/// [`AVPixelFormat::AV_PIX_FMT_NONE`], which causes the decoder to fail. The
+/// failure surfaces as a normal `Error::Ffmpeg` from
+/// [`crate::VideoDecoder::receive_frame`]; for `VideoDecoder::open` callers
+/// the probe loop tears down and retries with the next backend (replaying
+/// buffered packets), so software fallback happens at the decoder level
+/// rather than silently in-context.
 pub(crate) unsafe extern "C" fn get_hw_format(
   ctx: *mut AVCodecContext,
-  mut pix_fmts: *const AVPixelFormat,
+  pix_fmts: *const AVPixelFormat,
 ) -> AVPixelFormat {
   debug_assert!(!ctx.is_null());
   debug_assert!(!pix_fmts.is_null());
 
   // SAFETY: opaque was set by `try_open` to a valid `Box<CallbackState>`
   // pointer that outlives the codec context (we only free it after the
-  // codec context's drop runs).
+  // codec context's drop runs). When opaque is null we treat the call as
+  // strict — a stray invocation cannot silently downgrade.
   let state = unsafe { (*ctx).opaque as *const CallbackState };
-  if state.is_null() {
-    return AVPixelFormat::AV_PIX_FMT_NONE;
-  }
-  let wanted = unsafe { (*state).wanted };
+  let wanted = if state.is_null() {
+    AVPixelFormat::AV_PIX_FMT_NONE
+  } else {
+    unsafe { (*state).wanted }
+  };
 
-  // Walk the offered list looking for our format.
-  while unsafe { *pix_fmts } != AVPixelFormat::AV_PIX_FMT_NONE {
-    if unsafe { *pix_fmts } == wanted {
+  let mut p = pix_fmts;
+  while unsafe { *p } != AVPixelFormat::AV_PIX_FMT_NONE {
+    if unsafe { *p } == wanted {
       return wanted;
     }
-    pix_fmts = unsafe { pix_fmts.add(1) };
+    p = unsafe { p.add(1) };
   }
   AVPixelFormat::AV_PIX_FMT_NONE
 }
@@ -68,3 +77,76 @@ pub(crate) fn find_hw_pix_fmt(
     i += 1;
   }
 }
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+  use std::ptr;
+
+  // The callback derefs `(*ctx).opaque`, so we need a real-looking
+  // AVCodecContext. We construct a zeroed one (the callback only reads opaque).
+  struct FakeCtx(*mut AVCodecContext);
+  impl FakeCtx {
+    fn new(state: *mut CallbackState) -> Self {
+      let boxed: Box<AVCodecContext> = unsafe { Box::new(std::mem::zeroed()) };
+      let raw = Box::into_raw(boxed);
+      unsafe { (*raw).opaque = state.cast() };
+      Self(raw)
+    }
+  }
+  impl Drop for FakeCtx {
+    fn drop(&mut self) {
+      unsafe { drop(Box::from_raw(self.0)) };
+    }
+  }
+
+  fn run(state: &CallbackState, mut offered: Vec<AVPixelFormat>) -> AVPixelFormat {
+    offered.push(AVPixelFormat::AV_PIX_FMT_NONE);
+    let ctx = FakeCtx::new(state as *const _ as *mut _);
+    unsafe { get_hw_format(ctx.0, offered.as_ptr()) }
+  }
+
+  #[test]
+  fn returns_wanted_hw_format_when_offered() {
+    let state = CallbackState {
+      wanted: AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
+    };
+    let got = run(
+      &state,
+      vec![
+        AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
+        AVPixelFormat::AV_PIX_FMT_NV12,
+      ],
+    );
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
+  }
+
+  #[test]
+  fn returns_none_when_wanted_absent() {
+    let state = CallbackState {
+      wanted: AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
+    };
+    let got = run(
+      &state,
+      vec![
+        AVPixelFormat::AV_PIX_FMT_NV12,
+        AVPixelFormat::AV_PIX_FMT_YUV420P,
+      ],
+    );
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
+  }
+
+  #[test]
+  fn null_opaque_is_treated_as_strict() {
+    let boxed: Box<AVCodecContext> = unsafe { Box::new(std::mem::zeroed()) };
+    let ctx_raw = Box::into_raw(boxed);
+    unsafe { (*ctx_raw).opaque = ptr::null_mut() };
+    let offered = [
+      AVPixelFormat::AV_PIX_FMT_NV12,
+      AVPixelFormat::AV_PIX_FMT_NONE,
+    ];
+    let got = unsafe { get_hw_format(ctx_raw, offered.as_ptr()) };
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
+    unsafe { drop(Box::from_raw(ctx_raw)) };
+  }
+}
diff --git a/tests/hw_smoke.rs b/tests/hw_smoke.rs
index 5aa37c9..3084faf 100644
--- a/tests/hw_smoke.rs
+++ b/tests/hw_smoke.rs
@@ -26,14 +26,12 @@ fn auto_probe_picks_hardware_backend() {
   let stream_index = stream.index();
 
   let mut decoder = VideoDecoder::open(stream.parameters()).expect("open decoder");
-  eprintln!("auto-probe selected backend = {:?}", decoder.backend());
-  assert_ne!(
-    decoder.backend(),
-    Backend::Software,
-    "expected hardware backend; got Software"
-  );
+  eprintln!("auto-probe optimistic backend = {:?}", decoder.backend());
 
-  // Verify we can actually decode at least one HW frame end-to-end.
+  // Decode at least one frame so the probe collapses, then check the
+  // backend that actually produced it. Checking `decoder.backend()` before
+  // any frame has been received would observe the optimistic pre-probe
+  // value and could false-pass when a HW backend silently degrades.
   let mut frame = frame::Video::empty();
   let mut got_frame = false;
   for (s, packet) in input.packets() {
@@ -45,7 +43,8 @@ fn auto_probe_picks_hardware_backend() {
       Ok(()) => {
         got_frame = true;
         eprintln!(
-          "first hw frame: {}x{} fmt={:?}",
+          "first frame: backend={:?} {}x{} fmt={:?}",
+          decoder.backend(),
           frame.width(),
           frame.height(),
           frame.format()
@@ -61,4 +60,9 @@ fn auto_probe_picks_hardware_backend() {
     }
   }
   assert!(got_frame, "no frames decoded");
+  assert_ne!(
+    decoder.backend(),
+    Backend::Software,
+    "expected hardware backend after first frame; got Software"
+  );
 }

From 9dc8542d5de68f57fd5e5f11184380802bfd12e6 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 16:23:16 +1200
Subject: [PATCH 03/27] update

---
 benches/decode.rs  |   8 +-
 examples/decode.rs |  20 ++--
 src/decoder.rs     | 284 ++++++++++++++++++++++++++-------------------
 src/frame.rs       | 109 +++++++++++++++++
 src/lib.rs         |   2 +
 tests/decode.rs    |  10 +-
 tests/hw_smoke.rs  |  10 +-
 7 files changed, 302 insertions(+), 141 deletions(-)
 create mode 100644 src/frame.rs

diff --git a/benches/decode.rs b/benches/decode.rs
index 2433de9..82d0ba9 100644
--- a/benches/decode.rs
+++ b/benches/decode.rs
@@ -11,9 +11,9 @@
 use std::{path::PathBuf, time::Duration};
 
 use criterion::{criterion_group, criterion_main, Criterion};
-use ffmpeg::{format, frame, media};
+use ffmpeg::{format, media};
 use ffmpeg_next as ffmpeg;
-use hwdecode::{Backend, VideoDecoder};
+use hwdecode::{Backend, Frame, VideoDecoder};
 
 const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
 
@@ -36,7 +36,7 @@ fn decode_all(path: &PathBuf, backend: Backend) -> Result<usize, hwdecode::Error
     _ => VideoDecoder::open(stream.parameters())?,
   };
 
-  let mut frame = frame::Video::empty();
+  let mut frame = Frame::empty();
   let mut count = 0_usize;
 
   let mut drain = |decoder: &mut VideoDecoder, count: &mut usize| -> Result<(), hwdecode::Error> {
@@ -86,7 +86,7 @@ fn bench_decode(c: &mut Criterion) {
       .expect("video stream");
     let stream_index = stream.index();
     let mut dec = VideoDecoder::open(stream.parameters()).expect("auto-probe");
-    let mut frame = frame::Video::empty();
+    let mut frame = Frame::empty();
     'probe: for (s, packet) in input.packets() {
       if s.index() != stream_index {
         continue;
diff --git a/examples/decode.rs b/examples/decode.rs
index fa28582..69763bf 100644
--- a/examples/decode.rs
+++ b/examples/decode.rs
@@ -4,9 +4,9 @@
 //! cargo run --release --example decode -- /path/to/video.mp4
 //! ```
 
-use ffmpeg::{format, frame, media};
+use ffmpeg::{format, media};
 use ffmpeg_next as ffmpeg;
-use hwdecode::VideoDecoder;
+use hwdecode::{Frame, VideoDecoder};
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
   let path = std::env::args()
@@ -24,26 +24,25 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 
   let mut decoder = VideoDecoder::open(stream.parameters())?;
   println!(
-    "backend={:?} {}x{} codec_pix_fmt_initial={:?}",
+    "open: backend={:?} {}x{}",
     decoder.backend(),
     decoder.width(),
     decoder.height(),
-    decoder.format(),
   );
 
-  let mut frame = frame::Video::empty();
+  let mut frame = Frame::empty();
   let mut count: u64 = 0;
 
-  let drain = |decoder: &mut VideoDecoder, frame: &mut frame::Video, count: &mut u64| loop {
+  let drain = |decoder: &mut VideoDecoder, frame: &mut Frame, count: &mut u64| loop {
     match decoder.receive_frame(frame) {
       Ok(()) => {
         *count += 1;
         println!(
-          "frame#{count} pts={:?} {}x{} fmt={:?}",
+          "frame#{count} pts={:?} {}x{} pix_fmt={}",
           frame.pts(),
           frame.width(),
           frame.height(),
-          frame.format(),
+          frame.pix_fmt(),
         );
       }
       Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
@@ -69,6 +68,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
   decoder.send_eof()?;
   drain(&mut decoder, &mut frame, &mut count);
 
-  println!("decoded {count} frames");
+  println!(
+    "decoded {count} frames; final backend={:?}",
+    decoder.backend()
+  );
   Ok(())
 }
diff --git a/src/decoder.rs b/src/decoder.rs
index 76fe3e4..a776697 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -6,7 +6,6 @@ use ffmpeg_next::{
     av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
     av_hwdevice_ctx_create, av_hwframe_transfer_data, AVBufferRef, AVPixelFormat,
   },
-  format::Pixel,
   frame, Codec, Packet, Rational,
 };
 
@@ -14,24 +13,30 @@ use crate::{
   backend::{self, Backend},
   error::{Error, Result},
   ffi::{find_hw_pix_fmt, get_hw_format, CallbackState},
+  frame::Frame,
 };
 
 /// Hardware-accelerated video decoder with software fallback.
 ///
 /// Mirrors `ffmpeg::decoder::Video`'s `send_packet`/`receive_frame` interface.
-/// Frames returned by [`Self::receive_frame`] are always CPU-side; for the
-/// hardware path they are downloaded with `av_hwframe_transfer_data` (NV12 /
-/// P010).
+/// Decoded frames are returned through [`crate::Frame`], a CPU-side wrapper
+/// whose accessors avoid the `AVPixelFormat`-enum UB that an unvalidated read
+/// of FFmpeg's raw integer pixel formats can trigger.
 ///
 /// `open` does a true probe: each backend opens with a strict `get_format`
-/// callback, and on the first non-transient error the decoder is torn down
-/// and the next backend is tried with all packets seen so far replayed
-/// through it. Once the first frame is successfully received the probe
-/// collapses and subsequent calls go straight to the active backend.
+/// callback. On the first non-transient error from a backend the decoder is
+/// torn down and the next backend in probe order is tried, with all packets
+/// seen so far replayed through it. The advance is *transactional* — the
+/// candidate backend must successfully build and accept the replayed packets
+/// before it replaces the active decoder, so a failing backend mid-order
+/// does not strand the caller without history. Once the first frame
+/// is delivered the probe collapses and subsequent calls go straight to the
+/// active backend.
 pub struct VideoDecoder {
   /// Live FFmpeg state for the currently active backend.
   state: DecoderState,
   /// Reusable frame buffer used for hw-side decoding before transfer / move.
+  /// Internal use only — never handed to callers.
   hw_frame: frame::Video,
   /// Probe state: present until the first frame is received from the active
   /// backend, then `None`. While `Some`, packets are buffered for replay and
@@ -67,7 +72,9 @@ struct ProbeState {
   /// Backends still to try, in order. Empty means "no more options after
   /// the active one fails".
   remaining_backends: Vec<Backend>,
-  /// Packets sent so far, kept for replay through the next backend.
+  /// Packets sent so far, kept for replay through any candidate backend.
+  /// Preserved across failed candidates — only cleared when the probe
+  /// collapses on a successful first frame.
   buffered_packets: Vec<Packet>,
   /// Whether `send_eof` has been called; replayed alongside packets.
   eof_sent: bool,
@@ -105,16 +112,18 @@ impl VideoDecoder {
   /// Auto-probe hardware backends in the platform's default order.
   ///
   /// Each backend opens with a strict `get_format` callback. The first
-  /// backend whose `avcodec_open2` succeeds becomes active; if the first
-  /// frame from it fails (e.g. `get_format` returns `NONE` because the
-  /// backend can't handle this stream's profile/depth), the decoder is torn
-  /// down and the next backend is tried — packets sent so far are replayed
-  /// through the new decoder, transparently to the caller.
+  /// backend whose `avcodec_open2` succeeds becomes active; if its first
+  /// frame is unusable (decode error, transfer failure, or a CPU-format
+  /// frame from a HW context) the decoder is torn down and the next backend
+  /// is tried — packets sent so far are replayed through the new decoder
+  /// transparently. The probe advance is transactional: the next backend
+  /// must build *and* accept the replayed history before it replaces the
+  /// active decoder, so a misbehaving middle backend cannot strand the caller.
   ///
   /// [`Self::backend`] reflects whichever backend ultimately produced the
   /// first frame. Software is the last entry in every probe order, so
-  /// `open` cannot return without a working decoder for codecs that
-  /// libavcodec supports at all.
+  /// `open` cannot return without a working decoder for any codec libavcodec
+  /// supports.
   pub fn open(parameters: codec::Parameters) -> Result<Self> {
     let codec_id = codec::Id::from(unsafe { (*parameters.as_ptr()).codec_id });
     let codec = ffmpeg_next::decoder::find(codec_id).ok_or(Error::NoCodec(codec_id))?;
@@ -184,15 +193,6 @@ impl VideoDecoder {
     self.state.inner.height()
   }
 
-  /// Codec context's current pixel format. For HW backends this is the
-  /// hardware pixel format (e.g. `Pixel::VIDEOTOOLBOX`) once the first frame
-  /// has been negotiated; the caller-facing format produced by
-  /// [`Self::receive_frame`] is the *transferred* format (NV12 / P010 for
-  /// HW, codec-native for SW) and must be read from the frame itself.
-  pub fn format(&self) -> Pixel {
-    self.state.inner.format()
-  }
-
   /// Codec context time base.
   pub fn time_base(&self) -> Rational {
     self.state.inner.time_base()
@@ -223,23 +223,22 @@ impl VideoDecoder {
 
   /// Receive a CPU-side decoded frame.
   ///
-  /// On the hardware path the frame is transferred from GPU memory via
-  /// `av_hwframe_transfer_data` and frame metadata (pts, time_base, side
-  /// data, ...) is copied with `av_frame_copy_props`. The caller's frame is
-  /// always unref'd first so reuse across resolution changes or different
-  /// decoders is safe (mirrors `avcodec_receive_frame`'s own contract).
+  /// On the hardware path the frame is downloaded with
+  /// `av_hwframe_transfer_data` and metadata is copied via
+  /// `av_frame_copy_props`. The caller's frame is always unref'd first, so
+  /// reuse across resolution changes or different decoders is safe.
   ///
-  /// While the probe window is open and the active backend produces a
-  /// non-transient error or a software-format frame instead of the
-  /// configured hardware format, the decoder is torn down and the next
-  /// backend in probe order is tried with all buffered packets replayed.
+  /// While the probe window is open, *any* non-transient failure (decode
+  /// error, transfer error, copy_props error, or a CPU-format frame from a
+  /// HW-opened context) tears down the current decoder and advances to the
+  /// next backend in probe order, replaying buffered packets through it.
   /// The caller observes only the eventual successful frame (or, if every
   /// backend has been exhausted, the underlying error).
   ///
   /// Returns the same transient signals as `ffmpeg::decoder::Video`:
   /// `Error::Ffmpeg(Other { errno: EAGAIN })` when no frame is ready and
   /// more packets must be sent, and `Error::Ffmpeg(Eof)` once fully drained.
-  pub fn receive_frame(&mut self, frame: &mut frame::Video) -> Result<()> {
+  pub fn receive_frame(&mut self, frame: &mut Frame) -> Result<()> {
     loop {
       let res = self.state.inner.receive_frame(&mut self.hw_frame);
       match res {
@@ -253,50 +252,56 @@ impl VideoDecoder {
           return Err(Error::Ffmpeg(e));
         }
         Ok(()) => {
-          // Compare format as i32 to avoid constructing an AVPixelFormat
-          // enum from an unvalidated integer. Library/header skew or a new
-          // hardware format would otherwise be UB.
+          // Read AVFrame.format as i32 — avoid constructing an
+          // AVPixelFormat enum from a raw integer (UB on library/header skew).
           let received_fmt: i32 = unsafe { (*self.hw_frame.as_ptr()).format };
 
           if self.state.backend == Backend::Software {
-            // Pure SW path: just hand over the frame.
             unsafe {
-              av_frame_unref(frame.as_mut_ptr());
-              av_frame_move_ref(frame.as_mut_ptr(), self.hw_frame.as_mut_ptr());
+              av_frame_unref(frame.as_inner_mut().as_mut_ptr());
+              av_frame_move_ref(
+                frame.as_inner_mut().as_mut_ptr(),
+                self.hw_frame.as_mut_ptr(),
+              );
             }
             self.probe = None;
             return Ok(());
           }
 
           if received_fmt == self.state.hw_pix_fmt as i32 {
-            // True HW frame: download to CPU and copy timing/side data.
-            unsafe {
-              av_frame_unref(frame.as_mut_ptr());
-              let ret = av_hwframe_transfer_data(frame.as_mut_ptr(), self.hw_frame.as_ptr(), 0);
-              if ret < 0 {
-                return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+            // True HW frame: try to download to CPU.
+            let transfer_result = unsafe { transfer_hw_frame(frame, &mut self.hw_frame) };
+            match transfer_result {
+              Ok(()) => {
+                self.probe = None;
+                return Ok(());
               }
-              let ret = av_frame_copy_props(frame.as_mut_ptr(), self.hw_frame.as_ptr());
-              if ret < 0 {
-                return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+              Err(e) => {
+                // Transfer failures during the probe window are also
+                // backend-level failures — try the next backend.
+                if self.probe.is_some() && self.advance_probe()? {
+                  unsafe { av_frame_unref(frame.as_inner_mut().as_mut_ptr()) };
+                  continue;
+                }
+                return Err(Error::Ffmpeg(e));
               }
             }
-            self.probe = None;
-            return Ok(());
           }
 
-          // The decoder produced a CPU frame from a HW-opened context. With
-          // strict `get_format` this is unusual (the codec would normally
-          // error on get_format=NONE). If it does happen and we still have
-          // backends to try, treat it as a probe failure and advance.
+          // Decoder produced a CPU frame from a HW-opened context. With
+          // strict `get_format` this only happens if the codec ignores it
+          // (uncommon). Treat as a probe failure if we still have backends.
           if self.probe.is_some() && self.advance_probe()? {
             continue;
           }
           // No fallback left; accept the SW frame and update the active
           // backend so `backend()` reflects reality.
           unsafe {
-            av_frame_unref(frame.as_mut_ptr());
-            av_frame_move_ref(frame.as_mut_ptr(), self.hw_frame.as_mut_ptr());
+            av_frame_unref(frame.as_inner_mut().as_mut_ptr());
+            av_frame_move_ref(
+              frame.as_inner_mut().as_mut_ptr(),
+              self.hw_frame.as_mut_ptr(),
+            );
           }
           self.state.backend = Backend::Software;
           self.probe = None;
@@ -316,69 +321,92 @@ impl VideoDecoder {
     }
   }
 
-  /// Tear down the active decoder and bring up the next backend in
-  /// `remaining_backends`, replaying buffered packets. Returns `true` if a
-  /// new backend was successfully installed (caller should retry the
-  /// receive); `false` if the probe is exhausted.
+  /// Try the next backend in `remaining_backends`. Transactional: a
+  /// candidate must build and accept the replayed history before it replaces
+  /// the active decoder; the packet buffer is never consumed here. Candidates
+  /// that fail either step are popped (with `tracing::warn!`) and the loop
+  /// moves on. Any replay `send_packet` error counts as a rejection. Returns:
+  /// - `Ok(true)` when a candidate is installed and replay completed.
+  /// - `Ok(false)` when the probe is exhausted (no more backends to try).
+  /// - `Err(_)` is never produced by the current body — build and replay
+  ///   failures are absorbed and logged — but callers still apply `?` to
+  ///   the returned `Result`.
   fn advance_probe(&mut self) -> Result<bool> {
-    let next_backend = match self.probe.as_mut() {
-      Some(probe) if !probe.remaining_backends.is_empty() => probe.remaining_backends.remove(0),
-      _ => return Ok(false),
-    };
-    let prev_backend = self.state.backend;
-    tracing::warn!(
-      from = ?prev_backend,
-      to = ?next_backend,
-      "hwdecode: backend rejected stream, advancing probe"
-    );
-
-    // Snapshot probe inputs before mutating self.
-    let (parameters, codec, buffered_packets, eof_sent) = {
-      let probe = self.probe.as_mut().expect("probe state");
-      (
-        probe.parameters.clone(),
-        probe.codec,
-        std::mem::take(&mut probe.buffered_packets),
-        probe.eof_sent,
-      )
-    };
-
-    // Build the new state. If this open fails, we fall through to advancing
-    // again — which is what the caller's loop will do once it sees the next
-    // probe iteration also fail. To keep semantics simple, propagate the
-    // open error directly: the caller's loop will see it as the decode
-    // error and return upward; in practice probe order ends in Software
-    // which always opens.
-    let new_state = Self::build_state(parameters, codec, next_backend)?;
-
-    // Replace state. The old DecoderState's Drop runs here, in order:
-    // codec context first, then callback_state box, then hw_device_ref.
-    self.state = new_state;
-
-    // hw_frame may hold residual data from the old decoder. Clear it so
-    // the next receive starts clean.
-    unsafe {
-      av_frame_unref(self.hw_frame.as_mut_ptr());
-    }
+    loop {
+      // Snapshot inputs without mutating probe state.
+      let (next_backend, parameters, codec) = match self.probe.as_ref() {
+        Some(probe) if !probe.remaining_backends.is_empty() => (
+          probe.remaining_backends[0],
+          probe.parameters.clone(),
+          probe.codec,
+        ),
+        _ => return Ok(false),
+      };
+
+      let prev_backend = self.state.backend;
+      tracing::warn!(from = ?prev_backend, to = ?next_backend, "hwdecode: advancing probe");
+
+      // Build candidate. On failure, pop and continue without touching the
+      // packet buffer.
+      let mut candidate_state = match Self::build_state(parameters, codec, next_backend) {
+        Ok(s) => s,
+        Err(e) => {
+          tracing::warn!(?next_backend, error = %e, "hwdecode: candidate build failed");
+          self
+            .probe
+            .as_mut()
+            .expect("probe state present")
+            .remaining_backends
+            .remove(0);
+          continue;
+        }
+      };
+
+      // Replay buffered history through the candidate WITHOUT installing it.
+      // We borrow the buffer immutably; if replay fails the candidate's Drop
+      // releases the FFmpeg state and the buffer is preserved for the next
+      // attempt.
+      let replay_result: std::result::Result<(), ffmpeg_next::Error> = {
+        let probe = self.probe.as_ref().expect("probe state present");
+        let mut r: std::result::Result<(), ffmpeg_next::Error> = Ok(());
+        for pkt in &probe.buffered_packets {
+          if let Err(e) = candidate_state.inner.send_packet(pkt) {
+            r = Err(e);
+            break;
+          }
+        }
+        if r.is_ok() && probe.eof_sent {
+          if let Err(e) = candidate_state.inner.send_eof() {
+            r = Err(e);
+          }
+        }
+        r
+      };
+
+      if let Err(e) = replay_result {
+        tracing::warn!(?next_backend, error = %e, "hwdecode: candidate replay failed");
+        // Drop candidate explicitly so its FFI cleanup runs now.
+        drop(candidate_state);
+        self
+          .probe
+          .as_mut()
+          .expect("probe state present")
+          .remaining_backends
+          .remove(0);
+        continue;
+      }
 
-    // Replay buffered packets and (if previously sent) EOF through the new
-    // decoder. We re-buffer them on the way through so a subsequent probe
-    // advance still has the full history.
-    let probe = self.probe.as_mut().expect("probe still present");
-    probe.buffered_packets.clear();
-    probe.eof_sent = false;
-
-    for pkt in buffered_packets {
-      // Mirror `send_packet`'s buffering behaviour.
-      probe.buffered_packets.push(pkt.clone());
-      self.state.inner.send_packet(&pkt).map_err(Error::Ffmpeg)?;
-    }
-    if eof_sent {
-      self.probe.as_mut().expect("probe still present").eof_sent = true;
-      self.state.inner.send_eof().map_err(Error::Ffmpeg)?;
+      // Commit: install the candidate, clear residual hw_frame, pop backend.
+      self.state = candidate_state;
+      unsafe { av_frame_unref(self.hw_frame.as_mut_ptr()) };
+      self
+        .probe
+        .as_mut()
+        .expect("probe state present")
+        .remaining_backends
+        .remove(0);
+      return Ok(true);
     }
-
-    Ok(true)
   }
 
   /// Build raw FFmpeg state for one backend. Strict `get_format` (NONE on
@@ -459,6 +487,26 @@ impl VideoDecoder {
   }
 }
 
+/// Download a HW frame into a CPU [`Frame`]. Always unrefs the destination
+/// first so reuse across resolution changes is safe.
+unsafe fn transfer_hw_frame(
+  dst: &mut Frame,
+  src: &mut frame::Video,
+) -> std::result::Result<(), ffmpeg_next::Error> {
+  unsafe {
+    av_frame_unref(dst.as_inner_mut().as_mut_ptr());
+    let ret = av_hwframe_transfer_data(dst.as_inner_mut().as_mut_ptr(), src.as_ptr(), 0);
+    if ret < 0 {
+      return Err(ffmpeg_next::Error::from(ret));
+    }
+    let ret = av_frame_copy_props(dst.as_inner_mut().as_mut_ptr(), src.as_ptr());
+    if ret < 0 {
+      return Err(ffmpeg_next::Error::from(ret));
+    }
+  }
+  Ok(())
+}
+
 /// `EAGAIN` and `EOF` are normal flow signals from `avcodec_receive_frame`
 /// and must not be treated as backend failures.
 fn is_transient(e: &ffmpeg_next::Error) -> bool {
diff --git a/src/frame.rs b/src/frame.rs
new file mode 100644
index 0000000..f221b68
--- /dev/null
+++ b/src/frame.rs
@@ -0,0 +1,109 @@
+//! CPU-side decoded video frame.
+//!
+//! Wraps `ffmpeg_next::frame::Video` so callers cannot reach the upstream
+//! `format()` accessor, which constructs an `AVPixelFormat` enum from the
+//! raw integer FFmpeg writes into `AVFrame.format`. That conversion is UB
+//! when the value isn't in the bindgen-generated enum (library/header skew,
+//! a new pixel format added upstream, etc.). The wrapper exposes
+//! [`Frame::pix_fmt`] which reads the field as a plain `i32` — sound for any
+//! value FFmpeg can produce — and accessors are limited to fields whose
+//! reads do not invoke the same hazard.
+//!
+//! Compare formats against integer constants taken from the FFI layer, e.g.
+//!
+//! ```ignore
+//! use ffmpeg_next::ffi::AVPixelFormat;
+//! if frame.pix_fmt() == AVPixelFormat::AV_PIX_FMT_NV12 as i32 { ... }
+//! ```
+
+use ffmpeg_next::frame;
+
+/// CPU-side decoded video frame produced by [`crate::VideoDecoder`].
+pub struct Frame {
+  inner: frame::Video,
+}
+
+impl Frame {
+  /// Construct an empty frame, suitable as the destination passed to
+  /// [`crate::VideoDecoder::receive_frame`].
+  pub fn empty() -> Self {
+    Self {
+      inner: frame::Video::empty(),
+    }
+  }
+
+  /// Width in pixels.
+  pub fn width(&self) -> u32 {
+    self.inner.width()
+  }
+
+  /// Height in pixels.
+  pub fn height(&self) -> u32 {
+    self.inner.height()
+  }
+
+  /// Pixel format, returned as the raw `i32` value FFmpeg wrote to
+  /// `AVFrame.format`. Sound regardless of the linked FFmpeg version —
+  /// no `AVPixelFormat` enum is constructed.
+  ///
+  /// Compare against integer constants from `ffmpeg_next::ffi`, e.g.
+  /// `frame.pix_fmt() == AVPixelFormat::AV_PIX_FMT_NV12 as i32`.
+  pub fn pix_fmt(&self) -> i32 {
+    // SAFETY: `AVFrame.format` is bound as `c_int`; reading it yields a
+    // plain integer with no validity invariants.
+    unsafe { (*self.inner.as_ptr()).format }
+  }
+
+  /// Presentation timestamp in stream time base, or `None` if the frame
+  /// carries `AV_NOPTS_VALUE`.
+  pub fn pts(&self) -> Option<i64> {
+    self.inner.pts()
+  }
+
+  /// Number of populated planes (e.g. 3 for `YUV420P`, 2 for `NV12`).
+  pub fn planes(&self) -> usize {
+    self.inner.planes()
+  }
+
+  /// Bytes per row for `plane`. Panics if `plane >= planes()`.
+  pub fn stride(&self, plane: usize) -> usize {
+    self.inner.stride(plane)
+  }
+
+  /// Pixel data for `plane`. Panics if `plane >= planes()`.
+  pub fn data(&self, plane: usize) -> &[u8] {
+    self.inner.data(plane)
+  }
+
+  /// Crate-internal: hand the wrapped frame to FFmpeg / our decoder code.
+  pub(crate) fn as_inner_mut(&mut self) -> &mut frame::Video {
+    &mut self.inner
+  }
+}
+
+impl Default for Frame {
+  fn default() -> Self {
+    Self::empty()
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn empty_frame_has_zero_dimensions_and_no_pts() {
+    let f = Frame::empty();
+    assert_eq!(f.width(), 0);
+    assert_eq!(f.height(), 0);
+    assert_eq!(f.pts(), None);
+    // AVFrame.format defaults to -1 (AV_PIX_FMT_NONE) for an empty frame.
+    assert_eq!(f.pix_fmt(), -1);
+  }
+
+  #[test]
+  fn frame_is_send() {
+    fn check<T: Send>() {}
+    check::<Frame>();
+  }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 7d9c7bd..69a0660 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,7 +19,9 @@ mod backend;
 mod decoder;
 mod error;
 mod ffi;
+mod frame;
 
 pub use backend::Backend;
 pub use decoder::VideoDecoder;
 pub use error::{Error, Result};
+pub use frame::Frame;
diff --git a/tests/decode.rs b/tests/decode.rs
index a936ae3..bc15f30 100644
--- a/tests/decode.rs
+++ b/tests/decode.rs
@@ -4,9 +4,9 @@
 //!
 //! Set `HWDECODE_SAMPLE_VIDEO` to an absolute path to enable.
 
-use ffmpeg::{format, frame, media};
+use ffmpeg::{format, media};
 use ffmpeg_next as ffmpeg;
-use hwdecode::VideoDecoder;
+use hwdecode::{Frame, VideoDecoder};
 
 const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
 
@@ -29,12 +29,12 @@ fn auto_open_decodes_at_least_one_frame() {
   let expected_h = unsafe { (*stream.parameters().as_ptr()).height as u32 };
 
   let mut decoder = VideoDecoder::open(stream.parameters()).expect("open decoder");
-  eprintln!("backend = {:?}", decoder.backend());
+  eprintln!("optimistic backend = {:?}", decoder.backend());
 
   assert_eq!(decoder.width(), expected_w);
   assert_eq!(decoder.height(), expected_h);
 
-  let mut frame = frame::Video::empty();
+  let mut frame = Frame::empty();
   let mut count = 0_usize;
   let target = 30_usize;
 
@@ -64,5 +64,5 @@ fn auto_open_decodes_at_least_one_frame() {
   }
 
   assert!(count >= 1, "expected at least 1 decoded frame, got {count}");
-  eprintln!("decoded {count} frames");
+  eprintln!("decoded {count} frames via backend {:?}", decoder.backend());
 }
diff --git a/tests/hw_smoke.rs b/tests/hw_smoke.rs
index 3084faf..e734533 100644
--- a/tests/hw_smoke.rs
+++ b/tests/hw_smoke.rs
@@ -5,9 +5,9 @@
 //! HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test --test hw_smoke -- --ignored
 //! ```
 
-use ffmpeg::{format, frame, media};
+use ffmpeg::{format, media};
 use ffmpeg_next as ffmpeg;
-use hwdecode::{Backend, VideoDecoder};
+use hwdecode::{Backend, Frame, VideoDecoder};
 
 const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
 
@@ -32,7 +32,7 @@ fn auto_probe_picks_hardware_backend() {
   // backend that actually produced it. Checking `decoder.backend()` before
   // any frame has been received would observe the optimistic pre-probe
   // value and could false-pass when a HW backend silently degrades.
-  let mut frame = frame::Video::empty();
+  let mut frame = Frame::empty();
   let mut got_frame = false;
   for (s, packet) in input.packets() {
     if s.index() != stream_index {
@@ -43,11 +43,11 @@ fn auto_probe_picks_hardware_backend() {
       Ok(()) => {
         got_frame = true;
         eprintln!(
-          "first frame: backend={:?} {}x{} fmt={:?}",
+          "first frame: backend={:?} {}x{} pix_fmt={}",
           decoder.backend(),
           frame.width(),
           frame.height(),
-          frame.format()
+          frame.pix_fmt()
         );
         break;
       }

From 979a2bfbe4a3ad87d1bfd1dbcc52d517c07a40bf Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 16:54:03 +1200
Subject: [PATCH 04/27] update

---
 benches/decode.rs  | 113 +++++++++++++++++++++----------
 examples/decode.rs |  16 ++++-
 src/backend.rs     |  90 ++++++++++---------------
 src/decoder.rs     | 164 +++++++++++++++++----------------------------
 src/error.rs       |   7 +-
 src/lib.rs         |   1 +
 src/pix_fmt.rs     | 113 +++++++++++++++++++++++++++++++
 tests/decode.rs    |  12 +++-
 tests/hw_smoke.rs  |   9 +--
 9 files changed, 319 insertions(+), 206 deletions(-)
 create mode 100644 src/pix_fmt.rs

diff --git a/benches/decode.rs b/benches/decode.rs
index 82d0ba9..5f53a66 100644
--- a/benches/decode.rs
+++ b/benches/decode.rs
@@ -1,8 +1,9 @@
-//! Benchmark comparing software-only decode against the auto-probed
+//! Benchmark comparing software-only decode (via `ffmpeg-next` directly,
+//! since `hwdecode` is hardware-only) against `hwdecode`'s auto-probed
 //! hardware backend on the same input file.
 //!
 //! Set `HWDECODE_SAMPLE_VIDEO` to a video file path. The hardware bench is
-//! skipped (with a notice) when the auto-probe falls back to software.
+//! skipped (with a notice) when no hardware backend is available on the host.
 //!
 //! ```sh
 //! HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo bench
@@ -11,9 +12,9 @@
 use std::{path::PathBuf, time::Duration};
 
 use criterion::{criterion_group, criterion_main, Criterion};
-use ffmpeg::{format, media};
+use ffmpeg::{codec::Context as CodecContext, format, frame, media};
 use ffmpeg_next as ffmpeg;
-use hwdecode::{Backend, Frame, VideoDecoder};
+use hwdecode::{Frame, VideoDecoder};
 
 const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
 
@@ -21,9 +22,8 @@ fn sample_path() -> Option<PathBuf> {
   std::env::var_os(SAMPLE_ENV).map(PathBuf::from)
 }
 
-/// Decode every video frame in the file using `decoder`, returning the count.
-/// Re-opens the input each call so each iteration measures a full decode pass.
-fn decode_all(path: &PathBuf, backend: Backend) -> Result<usize, hwdecode::Error> {
+/// Decode every frame using `hwdecode`'s auto-probed hardware backend.
+fn decode_all_hw(path: &PathBuf) -> Result<usize, hwdecode::Error> {
   let mut input = format::input(path).map_err(hwdecode::Error::Ffmpeg)?;
   let stream = input
     .streams()
@@ -31,11 +31,7 @@ fn decode_all(path: &PathBuf, backend: Backend) -> Result<usize, hwdecode::Error
     .ok_or(hwdecode::Error::Ffmpeg(ffmpeg::Error::StreamNotFound))?;
   let stream_index = stream.index();
 
-  let mut decoder = match backend {
-    Backend::Software => VideoDecoder::open_with(stream.parameters(), Backend::Software)?,
-    _ => VideoDecoder::open(stream.parameters())?,
-  };
-
+  let mut decoder = VideoDecoder::open(stream.parameters())?;
   let mut frame = Frame::empty();
   let mut count = 0_usize;
 
@@ -66,6 +62,46 @@ fn decode_all(path: &PathBuf, backend: Backend) -> Result<usize, hwdecode::Error
   Ok(count)
 }
 
+/// Decode every frame using a plain software `ffmpeg-next` decoder. Used as
+/// the SW baseline since `hwdecode` no longer exposes a software backend.
+fn decode_all_sw(path: &PathBuf) -> Result<usize, ffmpeg::Error> {
+  let mut input = format::input(path)?;
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .ok_or(ffmpeg::Error::StreamNotFound)?;
+  let stream_index = stream.index();
+  let mut decoder = CodecContext::from_parameters(stream.parameters())?
+    .decoder()
+    .video()?;
+
+  let mut frame = frame::Video::empty();
+  let mut count = 0_usize;
+
+  let mut drain =
+    |decoder: &mut ffmpeg::decoder::Video, count: &mut usize| -> Result<(), ffmpeg::Error> {
+      loop {
+        match decoder.receive_frame(&mut frame) {
+          Ok(()) => *count += 1,
+          Err(ffmpeg::Error::Other { errno }) if errno == ffmpeg::error::EAGAIN => return Ok(()),
+          Err(ffmpeg::Error::Eof) => return Ok(()),
+          Err(e) => return Err(e),
+        }
+      }
+    };
+
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue;
+    }
+    decoder.send_packet(&packet)?;
+    drain(&mut decoder, &mut count)?;
+  }
+  decoder.send_eof()?;
+  drain(&mut decoder, &mut count)?;
+  Ok(count)
+}
+
 fn bench_decode(c: &mut Criterion) {
   ffmpeg::init().expect("ffmpeg init");
 
@@ -75,9 +111,8 @@ fn bench_decode(c: &mut Criterion) {
   };
 
   // Probe by decoding one frame so the probe collapses to the backend that
-  // actually produced output. Reading `backend()` before the first frame
-  // would observe the optimistically-selected value and mislabel HW runs
-  // that silently degraded.
+  // actually produced output. None means no HW backend is available — we
+  // skip the HW arm and bench SW only.
   let probed_backend = {
     let mut input = format::input(&path).expect("open input");
     let stream = input
@@ -85,44 +120,50 @@ fn bench_decode(c: &mut Criterion) {
       .best(media::Type::Video)
       .expect("video stream");
     let stream_index = stream.index();
-    let mut dec = VideoDecoder::open(stream.parameters()).expect("auto-probe");
-    let mut frame = Frame::empty();
-    'probe: for (s, packet) in input.packets() {
-      if s.index() != stream_index {
-        continue;
-      }
-      dec.send_packet(&packet).expect("probe send_packet");
-      match dec.receive_frame(&mut frame) {
-        Ok(()) => break 'probe,
-        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
-          if errno == ffmpeg::error::EAGAIN =>
-        {
-          continue;
+    match VideoDecoder::open(stream.parameters()) {
+      Ok(mut dec) => {
+        let mut frame = Frame::empty();
+        'probe: for (s, packet) in input.packets() {
+          if s.index() != stream_index {
+            continue;
+          }
+          dec.send_packet(&packet).expect("probe send_packet");
+          match dec.receive_frame(&mut frame) {
+            Ok(()) => break 'probe,
+            Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+              if errno == ffmpeg::error::EAGAIN =>
+            {
+              continue;
+            }
+            Err(e) => panic!("probe receive_frame: {e}"),
+          }
         }
-        Err(e) => panic!("probe receive_frame: {e}"),
+        Some(dec.backend())
       }
+      Err(hwdecode::Error::AllBackendsFailed { .. }) => None,
+      Err(e) => panic!("hwdecode probe: {e}"),
     }
-    dec.backend()
   };
-  eprintln!("auto-probe settled on backend: {probed_backend:?}");
+  match probed_backend {
+    Some(b) => eprintln!("auto-probe settled on backend: {b:?}"),
+    None => eprintln!("no hardware backend available — hardware bench will be skipped"),
+  }
 
   let mut group = c.benchmark_group("decode");
   group.measurement_time(Duration::from_secs(15));
   group.sample_size(20);
 
   group.bench_function("software", |b| {
-    b.iter(|| decode_all(&path, Backend::Software).expect("software decode"))
+    b.iter(|| decode_all_sw(&path).expect("software decode"))
   });
 
-  if probed_backend != Backend::Software {
+  if probed_backend.is_some() {
     group.bench_function("hardware", |b| {
       b.iter(|| {
-        let n = decode_all(&path, probed_backend).expect("hardware decode");
+        let n = decode_all_hw(&path).expect("hardware decode");
         std::hint::black_box(n);
       })
     });
-  } else {
-    eprintln!("skipping hardware bench: auto-probe fell back to Software");
   }
 
   group.finish();
diff --git a/examples/decode.rs b/examples/decode.rs
index 69763bf..a1439d7 100644
--- a/examples/decode.rs
+++ b/examples/decode.rs
@@ -22,7 +22,21 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     .ok_or("no video stream")?;
   let stream_index = stream.index();
 
-  let mut decoder = VideoDecoder::open(stream.parameters())?;
+  let mut decoder = match VideoDecoder::open(stream.parameters()) {
+    Ok(d) => d,
+    Err(hwdecode::Error::AllBackendsFailed { attempts }) => {
+      eprintln!(
+        "no hardware backend available; tried {} backend(s):",
+        attempts.len()
+      );
+      for (b, e) in &attempts {
+        eprintln!("  {b:?}: {e}");
+      }
+      eprintln!("(callers handle software fallback themselves — see ffmpeg::decoder::Video)");
+      return Ok(());
+    }
+    Err(e) => return Err(e.into()),
+  };
   println!(
     "open: backend={:?} {}x{}",
     decoder.backend(),
diff --git a/src/backend.rs b/src/backend.rs
index cfcd48b..bce8699 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -1,11 +1,15 @@
 use ffmpeg_next::{ffi::AVHWDeviceType, format::Pixel};
 
-/// Decoding backend selected (or forced) for a [`crate::VideoDecoder`].
+/// Hardware decoding backend.
+///
+/// `hwdecode` only manages **hardware** decoders — software fallback is
+/// out of scope. If no backend in [`probe_order`] for the current platform
+/// can decode a stream, [`crate::VideoDecoder::open`] returns
+/// [`crate::Error::AllBackendsFailed`] and the caller decides how to fall
+/// back (e.g. by opening an `ffmpeg::decoder::Video` directly).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 pub enum Backend {
-  /// Pure software decode via libavcodec.
-  Software,
-  /// Apple VideoToolbox (macOS, iOS, iPadOS, tvOS).
+  /// Apple VideoToolbox (macOS, iOS, iPadOS, tvOS, visionOS).
   VideoToolbox,
   /// Linux Video Acceleration API (Intel / AMD GPUs).
   Vaapi,
@@ -16,37 +20,33 @@ pub enum Backend {
 }
 
 impl Backend {
-  /// `AVHWDeviceType` corresponding to this backend, or `None` for
-  /// [`Backend::Software`].
-  pub(crate) fn av_hwdevice_type(self) -> Option<AVHWDeviceType> {
+  /// `AVHWDeviceType` corresponding to this backend.
+  pub(crate) fn av_hwdevice_type(self) -> AVHWDeviceType {
     match self {
-      Self::Software => None,
-      Self::VideoToolbox => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_VIDEOTOOLBOX),
-      Self::Vaapi => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_VAAPI),
-      Self::Cuda => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA),
-      Self::D3d11va => Some(AVHWDeviceType::AV_HWDEVICE_TYPE_D3D11VA),
+      Self::VideoToolbox => AVHWDeviceType::AV_HWDEVICE_TYPE_VIDEOTOOLBOX,
+      Self::Vaapi => AVHWDeviceType::AV_HWDEVICE_TYPE_VAAPI,
+      Self::Cuda => AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA,
+      Self::D3d11va => AVHWDeviceType::AV_HWDEVICE_TYPE_D3D11VA,
     }
   }
 
   /// Hardware pixel format the codec is expected to produce when this
-  /// backend is in use. Used to inspect the result of `get_format`.
-  /// `None` for [`Backend::Software`].
+  /// backend is in use. (The post-`av_hwframe_transfer_data` CPU format is
+  /// typically `NV12` or `P010LE`; this is the *pre-transfer* sentinel.)
   #[allow(dead_code)] // surfaced for tests / future use
-  pub(crate) fn hw_pixel_format(self) -> Option<Pixel> {
+  pub(crate) fn hw_pixel_format(self) -> Pixel {
     match self {
-      Self::Software => None,
-      Self::VideoToolbox => Some(Pixel::VIDEOTOOLBOX),
-      Self::Vaapi => Some(Pixel::VAAPI),
-      Self::Cuda => Some(Pixel::CUDA),
-      Self::D3d11va => Some(Pixel::D3D11),
+      Self::VideoToolbox => Pixel::VIDEOTOOLBOX,
+      Self::Vaapi => Pixel::VAAPI,
+      Self::Cuda => Pixel::CUDA,
+      Self::D3d11va => Pixel::D3D11,
     }
   }
 }
 
-/// Probe order for `VideoDecoder::open` on the current target.
-///
-/// Always ends in [`Backend::Software`]; auto-probe never returns an empty
-/// list. Order is fixed at compile time per `target_os`.
+/// Probe order for `VideoDecoder::open` on the current target. Hardware
+/// backends only, in preference order. Empty for platforms with no known
+/// HW backend; on those `open()` returns `AllBackendsFailed` immediately.
 pub(crate) fn probe_order() -> &'static [Backend] {
   #[cfg(any(
     target_os = "macos",
@@ -55,15 +55,15 @@ pub(crate) fn probe_order() -> &'static [Backend] {
     target_os = "visionos",
   ))]
   {
-    &[Backend::VideoToolbox, Backend::Software]
+    &[Backend::VideoToolbox]
   }
   #[cfg(target_os = "linux")]
   {
-    &[Backend::Vaapi, Backend::Cuda, Backend::Software]
+    &[Backend::Vaapi, Backend::Cuda]
   }
   #[cfg(target_os = "windows")]
   {
-    &[Backend::D3d11va, Backend::Cuda, Backend::Software]
+    &[Backend::D3d11va, Backend::Cuda]
   }
   #[cfg(not(any(
     target_os = "macos",
@@ -74,7 +74,7 @@ pub(crate) fn probe_order() -> &'static [Backend] {
     target_os = "windows",
   )))]
   {
-    &[Backend::Software]
+    &[]
   }
 }
 
@@ -83,55 +83,33 @@ mod tests {
   use super::*;
 
   #[test]
-  fn probe_order_ends_in_software() {
-    let order = probe_order();
-    assert!(!order.is_empty());
-    assert_eq!(*order.last().unwrap(), Backend::Software);
-  }
-
-  #[test]
-  fn software_has_no_av_hwdevice_type() {
-    assert!(Backend::Software.av_hwdevice_type().is_none());
-    assert!(Backend::Software.hw_pixel_format().is_none());
-  }
-
-  #[test]
-  fn hw_backends_have_av_hwdevice_type() {
+  fn all_backends_have_hwdevice_type_and_pix_fmt() {
     for b in [
       Backend::VideoToolbox,
       Backend::Vaapi,
       Backend::Cuda,
       Backend::D3d11va,
     ] {
-      assert!(
-        b.av_hwdevice_type().is_some(),
-        "{b:?} missing hwdevice type"
-      );
-      assert!(b.hw_pixel_format().is_some(), "{b:?} missing hw pix fmt");
+      let _ = b.av_hwdevice_type();
+      let _ = b.hw_pixel_format();
     }
   }
 
   #[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
   #[test]
   fn apple_probe_order() {
-    assert_eq!(probe_order(), &[Backend::VideoToolbox, Backend::Software]);
+    assert_eq!(probe_order(), &[Backend::VideoToolbox]);
   }
 
   #[cfg(target_os = "linux")]
   #[test]
   fn linux_probe_order() {
-    assert_eq!(
-      probe_order(),
-      &[Backend::Vaapi, Backend::Cuda, Backend::Software]
-    );
+    assert_eq!(probe_order(), &[Backend::Vaapi, Backend::Cuda]);
   }
 
   #[cfg(target_os = "windows")]
   #[test]
   fn windows_probe_order() {
-    assert_eq!(
-      probe_order(),
-      &[Backend::D3d11va, Backend::Cuda, Backend::Software]
-    );
+    assert_eq!(probe_order(), &[Backend::D3d11va, Backend::Cuda]);
   }
 }
diff --git a/src/decoder.rs b/src/decoder.rs
index a776697..09f5c67 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -3,8 +3,8 @@ use std::{mem::ManuallyDrop, ptr};
 use ffmpeg_next::{
   codec::{self, Context},
   ffi::{
-    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
-    av_hwdevice_ctx_create, av_hwframe_transfer_data, AVBufferRef, AVPixelFormat,
+    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_unref, av_hwdevice_ctx_create,
+    av_hwframe_transfer_data, AVBufferRef,
   },
   frame, Codec, Packet, Rational,
 };
@@ -52,15 +52,11 @@ struct DecoderState {
   inner: ManuallyDrop<ffmpeg_next::decoder::Video>,
   /// Backend driving this state.
   backend: Backend,
-  /// Owned reference produced by `av_hwdevice_ctx_create`. Null for software.
+  /// Owned reference produced by `av_hwdevice_ctx_create`.
   hw_device_ref: *mut AVBufferRef,
-  /// Owned `Box<CallbackState>` raw pointer; `AVCodecContext::opaque` aliases
-  /// it. Null for software.
+  /// Owned `Box<CallbackState>` raw pointer; `AVCodecContext::opaque`
+  /// aliases it.
   callback_state: *mut CallbackState,
-  /// Hardware pixel format we asked the decoder to produce. Compared (as
-  /// `i32` to avoid enum-discriminant UB) against each received frame's
-  /// format. `AV_PIX_FMT_NONE` for the software path.
-  hw_pix_fmt: AVPixelFormat,
 }
 
 /// State carried only during the probe window (before the first successful
@@ -223,17 +219,22 @@ impl VideoDecoder {
 
   /// Receive a CPU-side decoded frame.
   ///
-  /// On the hardware path the frame is downloaded with
-  /// `av_hwframe_transfer_data` and metadata is copied via
-  /// `av_frame_copy_props`. The caller's frame is always unref'd first, so
-  /// reuse across resolution changes or different decoders is safe.
+  /// The frame is downloaded with `av_hwframe_transfer_data` and metadata
+  /// is copied via `av_frame_copy_props`. The caller's frame is always
+  /// unref'd first, so reuse across resolution changes or different
+  /// decoders is safe.
   ///
   /// While the probe window is open, *any* non-transient failure (decode
   /// error, transfer error, copy_props error, or a CPU-format frame from a
   /// HW-opened context) tears down the current decoder and advances to the
-  /// next backend in probe order, replaying buffered packets through it.
-  /// The caller observes only the eventual successful frame (or, if every
-  /// backend has been exhausted, the underlying error).
+  /// next hardware backend in probe order, replaying buffered packets
+  /// through it. The caller observes only the eventual successful frame
+  /// (or, if every backend has been exhausted, the underlying error).
+  ///
+  /// This crate is hardware-only: there is no software fallback inside the
+  /// decoder. If every backend is exhausted, the failure surfaces as the
+  /// last decode or transfer error (a CPU-side frame from a HW context fails
+  /// the transfer with `EINVAL`). Callers handle software fallback themselves.
   ///
   /// Returns the same transient signals as `ffmpeg::decoder::Video`:
   /// `Error::Ffmpeg(Other { errno: EAGAIN })` when no frame is ready and
@@ -252,60 +253,25 @@ impl VideoDecoder {
           return Err(Error::Ffmpeg(e));
         }
         Ok(()) => {
-          // Read AVFrame.format as i32 — avoid constructing an
-          // AVPixelFormat enum from a raw integer (UB on library/header skew).
-          let received_fmt: i32 = unsafe { (*self.hw_frame.as_ptr()).format };
-
-          if self.state.backend == Backend::Software {
-            unsafe {
-              av_frame_unref(frame.as_inner_mut().as_mut_ptr());
-              av_frame_move_ref(
-                frame.as_inner_mut().as_mut_ptr(),
-                self.hw_frame.as_mut_ptr(),
-              );
+          // Always attempt the HW→CPU transfer. With strict `get_format`,
+          // libavcodec can only deliver frames in the wired-up HW format
+          // (or fail). If a misbehaving codec ever hands us a CPU-side
+          // frame anyway, `av_hwframe_transfer_data` returns AVERROR(EINVAL)
+          // (neither src nor dst has an AVHWFramesContext attached) and we
+          // route through the same error path below.
+          match unsafe { transfer_hw_frame(frame, &mut self.hw_frame) } {
+            Ok(()) => {
+              self.probe = None;
+              return Ok(());
             }
-            self.probe = None;
-            return Ok(());
-          }
-
-          if received_fmt == self.state.hw_pix_fmt as i32 {
-            // True HW frame: try to download to CPU.
-            let transfer_result = unsafe { transfer_hw_frame(frame, &mut self.hw_frame) };
-            match transfer_result {
-              Ok(()) => {
-                self.probe = None;
-                return Ok(());
-              }
-              Err(e) => {
-                // Transfer failures during the probe window are also
-                // backend-level failures — try the next backend.
-                if self.probe.is_some() && self.advance_probe()? {
-                  unsafe { av_frame_unref(frame.as_inner_mut().as_mut_ptr()) };
-                  continue;
-                }
-                return Err(Error::Ffmpeg(e));
+            Err(e) => {
+              if self.probe.is_some() && self.advance_probe()? {
+                unsafe { av_frame_unref(frame.as_inner_mut().as_mut_ptr()) };
+                continue;
               }
+              return Err(Error::Ffmpeg(e));
             }
           }
-
-          // Decoder produced a CPU frame from a HW-opened context. With
-          // strict `get_format` this only happens if the codec ignores it
-          // (uncommon). Treat as a probe failure if we still have backends.
-          if self.probe.is_some() && self.advance_probe()? {
-            continue;
-          }
-          // No fallback left; accept the SW frame and update the active
-          // backend so `backend()` reflects reality.
-          unsafe {
-            av_frame_unref(frame.as_inner_mut().as_mut_ptr());
-            av_frame_move_ref(
-              frame.as_inner_mut().as_mut_ptr(),
-              self.hw_frame.as_mut_ptr(),
-            );
-          }
-          self.state.backend = Backend::Software;
-          self.probe = None;
-          return Ok(());
         }
       }
     }
@@ -409,52 +375,43 @@ impl VideoDecoder {
     }
   }
 
-  /// Build raw FFmpeg state for one backend. Strict `get_format` (NONE on
-  /// missing HW format); cross-backend fallback is the caller's job.
+  /// Build raw FFmpeg state for one hardware backend. Strict `get_format`
+  /// (NONE on missing HW format); cross-backend fallback is the caller's job.
   fn build_state(
     parameters: codec::Parameters,
     codec: Codec,
     backend: Backend,
   ) -> Result<DecoderState> {
     let mut ctx = Context::from_parameters(parameters)?;
+    let av_type = backend.av_hwdevice_type();
 
-    let (hw_device_ref, callback_state, hw_pix_fmt) = match backend.av_hwdevice_type() {
-      None => (
-        ptr::null_mut(),
-        ptr::null_mut(),
-        AVPixelFormat::AV_PIX_FMT_NONE,
-      ),
-      Some(av_type) => {
-        // Verify the codec advertises this hwaccel.
-        let hw_pix_fmt = find_hw_pix_fmt(unsafe { codec.as_ptr() }, av_type)
-          .ok_or(Error::BackendUnsupportedByCodec(backend))?;
-
-        // Create the device context.
-        let mut hw_device_ref: *mut AVBufferRef = ptr::null_mut();
-        // SAFETY: `hw_device_ref` is a stack ptr we hand FFmpeg to fill.
-        let ret = unsafe {
-          av_hwdevice_ctx_create(&mut hw_device_ref, av_type, ptr::null(), ptr::null_mut(), 0)
-        };
-        if ret < 0 {
-          return Err(Error::HwDeviceInitFailed {
-            backend,
-            source: ffmpeg_next::Error::from(ret),
-          });
-        }
+    // Verify the codec advertises this hwaccel.
+    let hw_pix_fmt = find_hw_pix_fmt(unsafe { codec.as_ptr() }, av_type)
+      .ok_or(Error::BackendUnsupportedByCodec(backend))?;
 
-        let callback_state = Box::into_raw(Box::new(CallbackState { wanted: hw_pix_fmt }));
-        // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
-        // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
-        // use (we keep our own ref in `hw_device_ref` for cleanup).
-        unsafe {
-          let raw = ctx.as_mut_ptr();
-          (*raw).hw_device_ctx = av_buffer_ref(hw_device_ref);
-          (*raw).opaque = callback_state.cast();
-          (*raw).get_format = Some(get_hw_format);
-        }
-        (hw_device_ref, callback_state, hw_pix_fmt)
-      }
+    // Create the device context.
+    let mut hw_device_ref: *mut AVBufferRef = ptr::null_mut();
+    // SAFETY: `hw_device_ref` is a stack ptr we hand FFmpeg to fill.
+    let ret = unsafe {
+      av_hwdevice_ctx_create(&mut hw_device_ref, av_type, ptr::null(), ptr::null_mut(), 0)
     };
+    if ret < 0 {
+      return Err(Error::HwDeviceInitFailed {
+        backend,
+        source: ffmpeg_next::Error::from(ret),
+      });
+    }
+
+    let callback_state = Box::into_raw(Box::new(CallbackState { wanted: hw_pix_fmt }));
+    // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
+    // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
+    // use (we keep our own ref in `hw_device_ref` for cleanup).
+    unsafe {
+      let raw = ctx.as_mut_ptr();
+      (*raw).hw_device_ctx = av_buffer_ref(hw_device_ref);
+      (*raw).opaque = callback_state.cast();
+      (*raw).get_format = Some(get_hw_format);
+    }
 
     // Open the decoder. On any failure, release the resources we just
     // allocated so we don't leak.
@@ -482,7 +439,6 @@ impl VideoDecoder {
       backend,
       hw_device_ref,
       callback_state,
-      hw_pix_fmt,
     })
   }
 }
diff --git a/src/error.rs b/src/error.rs
index 92cb2d1..ef5373c 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -28,8 +28,11 @@ pub enum Error {
     source: ffmpeg_next::Error,
   },
 
-  /// Auto-probe exhausted every backend in the platform's order.
-  #[error("all backends failed; attempts: {attempts:?}")]
+  /// Auto-probe exhausted every backend in the platform's order. Empty
+  /// `attempts` means the platform has no hardware backends listed in
+  /// [`crate::Backend`] for the current `target_os` — callers must
+  /// fall back to a software decoder of their choice.
+  #[error("all hardware backends failed; attempts: {attempts:?}")]
   AllBackendsFailed {
     /// Per-backend errors collected during probing, in the order tried.
     attempts: Vec<(Backend, Box<Error>)>,
diff --git a/src/lib.rs b/src/lib.rs
index 69a0660..e6c12ce 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -20,6 +20,7 @@ mod decoder;
 mod error;
 mod ffi;
 mod frame;
+pub mod pix_fmt;
 
 pub use backend::Backend;
 pub use decoder::VideoDecoder;
diff --git a/src/pix_fmt.rs b/src/pix_fmt.rs
new file mode 100644
index 0000000..f3c594e
--- /dev/null
+++ b/src/pix_fmt.rs
@@ -0,0 +1,113 @@
+//! Stable `i32` constants for the pixel formats produced by `hwdecode`'s
+//! hardware decoders after `av_hwframe_transfer_data`.
+//!
+//! `Frame::pix_fmt()` returns the raw integer FFmpeg wrote to `AVFrame.format`
+//! (as a plain `i32` to avoid the enum-construction UB that an unvalidated
+//! cast would invoke). This module names the constants relevant to dispatch
+//! after a successful hardware decode.
+//!
+//! Because `hwdecode` is hardware-only, the formats listed here cover what
+//! the supported HW backends actually produce — the **NV** family (semi-
+//! planar 8-bit) and the **P0xx / P2xx / P4xx** family (semi-planar 10/12/16
+//! bit). VideoToolbox, VAAPI, NVDEC, and D3D11VA all download into one of
+//! these.
+//!
+//! Software-decoder output formats (`YUV420P`, `YUV422P`, `RGB24`, etc.) are
+//! intentionally **not** listed: callers handle software fallback outside
+//! this crate, and dispatch tables for those formats belong with the SW
+//! pipeline.
+//!
+//! For values not listed here, write `AVPixelFormat::AV_PIX_FMT_X as i32`
+//! directly — that's exactly the cast we use to define these constants.
+//!
+//! ```ignore
+//! use hwdecode::{pix_fmt, Frame};
+//! match frame.pix_fmt() {
+//!     pix_fmt::NV12   => /* 8-bit 4:2:0  → colconv::frame::Nv12Frame  */,
+//!     pix_fmt::P010LE => /* 10-bit 4:2:0 → colconv::frame::PnFrame<10> */,
+//!     other           => unimplemented!("pix_fmt {other}"),
+//! }
+//! ```
+
+use ffmpeg_next::ffi::AVPixelFormat;
+
+// --- semi-planar YUV (NV*) — 8-bit hardware download outputs ----------------
+
+/// 4:2:0, 8-bit, Y plane + interleaved Cb/Cr (`AV_PIX_FMT_NV12`). The
+/// dominant 8-bit HW download format on every supported backend.
+pub const NV12: i32 = AVPixelFormat::AV_PIX_FMT_NV12 as i32;
+/// 4:2:0, 8-bit, Y plane + interleaved Cr/Cb (`AV_PIX_FMT_NV21`).
+pub const NV21: i32 = AVPixelFormat::AV_PIX_FMT_NV21 as i32;
+/// 4:2:2, 8-bit, Y plane + interleaved Cb/Cr (`AV_PIX_FMT_NV16`).
+pub const NV16: i32 = AVPixelFormat::AV_PIX_FMT_NV16 as i32;
+/// 4:4:4, 8-bit, Y plane + interleaved Cb/Cr (`AV_PIX_FMT_NV24`).
+pub const NV24: i32 = AVPixelFormat::AV_PIX_FMT_NV24 as i32;
+
+// --- semi-planar YUV (P0xx) — 4:2:0 high-bit-depth HW downloads -------------
+
+/// 4:2:0, 10-bit, semi-planar little-endian (`AV_PIX_FMT_P010LE`). The
+/// dominant 10-bit HW download format.
+pub const P010LE: i32 = AVPixelFormat::AV_PIX_FMT_P010LE as i32;
+/// 4:2:0, 10-bit, semi-planar big-endian (`AV_PIX_FMT_P010BE`).
+pub const P010BE: i32 = AVPixelFormat::AV_PIX_FMT_P010BE as i32;
+/// 4:2:0, 12-bit, semi-planar little-endian (`AV_PIX_FMT_P012LE`).
+pub const P012LE: i32 = AVPixelFormat::AV_PIX_FMT_P012LE as i32;
+/// 4:2:0, 16-bit, semi-planar little-endian (`AV_PIX_FMT_P016LE`).
+pub const P016LE: i32 = AVPixelFormat::AV_PIX_FMT_P016LE as i32;
+
+// --- semi-planar YUV (P2xx) — 4:2:2 high-bit-depth HW downloads -------------
+
+/// 4:2:2, 10-bit, semi-planar little-endian (`AV_PIX_FMT_P210LE`).
+pub const P210LE: i32 = AVPixelFormat::AV_PIX_FMT_P210LE as i32;
+/// 4:2:2, 12-bit, semi-planar little-endian (`AV_PIX_FMT_P212LE`, FFmpeg 6.1+).
+pub const P212LE: i32 = AVPixelFormat::AV_PIX_FMT_P212LE as i32;
+/// 4:2:2, 16-bit, semi-planar little-endian (`AV_PIX_FMT_P216LE`).
+pub const P216LE: i32 = AVPixelFormat::AV_PIX_FMT_P216LE as i32;
+
+// --- semi-planar YUV (P4xx) — 4:4:4 high-bit-depth HW downloads -------------
+
+/// 4:4:4, 10-bit, semi-planar little-endian (`AV_PIX_FMT_P410LE`).
+pub const P410LE: i32 = AVPixelFormat::AV_PIX_FMT_P410LE as i32;
+/// 4:4:4, 12-bit, semi-planar little-endian (`AV_PIX_FMT_P412LE`, FFmpeg 6.1+).
+pub const P412LE: i32 = AVPixelFormat::AV_PIX_FMT_P412LE as i32;
+/// 4:4:4, 16-bit, semi-planar little-endian (`AV_PIX_FMT_P416LE`).
+pub const P416LE: i32 = AVPixelFormat::AV_PIX_FMT_P416LE as i32;
+
+// --- sentinel ---------------------------------------------------------------
+
+/// Sentinel value FFmpeg writes to `AVFrame.format` for an unset frame
+/// (`AV_PIX_FMT_NONE`). A frame from [`crate::Frame::empty`] reports this
+/// until it is filled by a decoder.
+pub const NONE: i32 = AVPixelFormat::AV_PIX_FMT_NONE as i32;
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  /// Sanity check. The variant asserts only restate the definitions above;
+  /// `NONE == -1` is the one assert that pins an actual FFmpeg ABI value.
+  #[test]
+  fn constants_match_bindings() {
+    assert_eq!(NV12, AVPixelFormat::AV_PIX_FMT_NV12 as i32);
+    assert_eq!(P010LE, AVPixelFormat::AV_PIX_FMT_P010LE as i32);
+    assert_eq!(P416LE, AVPixelFormat::AV_PIX_FMT_P416LE as i32);
+    assert_eq!(NONE, -1, "AV_PIX_FMT_NONE must be -1 (FFmpeg ABI sentinel)");
+  }
+
+  #[test]
+  fn match_dispatch_compiles() {
+    fn classify(v: i32) -> &'static str {
+      match v {
+        NV12 => "nv12",
+        NV21 => "nv21",
+        P010LE => "p010le",
+        P210LE => "p210le",
+        P410LE => "p410le",
+        _ => "other",
+      }
+    }
+    assert_eq!(classify(NV12), "nv12");
+    assert_eq!(classify(P010LE), "p010le");
+    assert_eq!(classify(NONE), "other");
+  }
+}
diff --git a/tests/decode.rs b/tests/decode.rs
index bc15f30..10a8bcb 100644
--- a/tests/decode.rs
+++ b/tests/decode.rs
@@ -28,7 +28,17 @@ fn auto_open_decodes_at_least_one_frame() {
   let expected_w = unsafe { (*stream.parameters().as_ptr()).width as u32 };
   let expected_h = unsafe { (*stream.parameters().as_ptr()).height as u32 };
 
-  let mut decoder = VideoDecoder::open(stream.parameters()).expect("open decoder");
+  let mut decoder = match VideoDecoder::open(stream.parameters()) {
+    Ok(d) => d,
+    Err(hwdecode::Error::AllBackendsFailed { attempts }) => {
+      eprintln!(
+        "skipping: no hardware backend available ({} attempts)",
+        attempts.len()
+      );
+      return;
+    }
+    Err(e) => panic!("open decoder: {e}"),
+  };
   eprintln!("optimistic backend = {:?}", decoder.backend());
 
   assert_eq!(decoder.width(), expected_w);
diff --git a/tests/hw_smoke.rs b/tests/hw_smoke.rs
index e734533..6e11765 100644
--- a/tests/hw_smoke.rs
+++ b/tests/hw_smoke.rs
@@ -7,7 +7,7 @@
 
 use ffmpeg::{format, media};
 use ffmpeg_next as ffmpeg;
-use hwdecode::{Backend, Frame, VideoDecoder};
+use hwdecode::{Frame, VideoDecoder};
 
 const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
 
@@ -60,9 +60,6 @@ fn auto_probe_picks_hardware_backend() {
     }
   }
   assert!(got_frame, "no frames decoded");
-  assert_ne!(
-    decoder.backend(),
-    Backend::Software,
-    "expected hardware backend after first frame; got Software"
-  );
+  // hwdecode is hardware-only — `backend()` after a successful first frame
+  // is by construction one of the HW variants. Logged above for visibility.
 }

From 189b6ba0161e29af5ae976593973d7c7fdd0c3fc Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 17:15:59 +1200
Subject: [PATCH 05/27] update

---
 src/backend.rs |  17 +++---
 src/decoder.rs |  38 +++++++++---
 src/ffi.rs     | 156 +++++++++++++++++++++++++++++++++--------------
 src/frame.rs   | 162 ++++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 284 insertions(+), 89 deletions(-)

diff --git a/src/backend.rs b/src/backend.rs
index bce8699..00cf82e 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -1,4 +1,4 @@
-use ffmpeg_next::{ffi::AVHWDeviceType, format::Pixel};
+use ffmpeg_next::ffi::{AVHWDeviceType, AVPixelFormat};
 
 /// Hardware decoding backend.
 ///
@@ -33,13 +33,16 @@ impl Backend {
   /// Hardware pixel format the codec is expected to produce when this
   /// backend is in use. (The post-`av_hwframe_transfer_data` CPU format is
   /// typically `NV12` or `P010LE`; this is the *pre-transfer* sentinel.)
-  #[allow(dead_code)] // surfaced for tests / future use
-  pub(crate) fn hw_pixel_format(self) -> Pixel {
+  ///
+  /// Returns an `AVPixelFormat` value constructed from a hardcoded constant
+  /// in our bindings — never reads an enum value supplied by FFmpeg, so
+  /// no enum-discriminant UB risk.
+  pub(crate) fn hw_pixel_format(self) -> AVPixelFormat {
     match self {
-      Self::VideoToolbox => Pixel::VIDEOTOOLBOX,
-      Self::Vaapi => Pixel::VAAPI,
-      Self::Cuda => Pixel::CUDA,
-      Self::D3d11va => Pixel::D3D11,
+      Self::VideoToolbox => AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
+      Self::Vaapi => AVPixelFormat::AV_PIX_FMT_VAAPI,
+      Self::Cuda => AVPixelFormat::AV_PIX_FMT_CUDA,
+      Self::D3d11va => AVPixelFormat::AV_PIX_FMT_D3D11,
     }
   }
 }
diff --git a/src/decoder.rs b/src/decoder.rs
index 09f5c67..d6cffe3 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -12,7 +12,7 @@ use ffmpeg_next::{
 use crate::{
   backend::{self, Backend},
   error::{Error, Result},
-  ffi::{find_hw_pix_fmt, get_hw_format, CallbackState},
+  ffi::{codec_supports_hwaccel, get_hw_format, CallbackState},
   frame::Frame,
 };
 
@@ -199,22 +199,32 @@ impl VideoDecoder {
     self.state.inner.frame_rate()
   }
 
-  /// Submit a packet to the decoder. While the probe is active the packet is
-  /// also buffered for potential replay through a fallback backend.
+  /// Submit a packet to the decoder. On success — and only on success —
+  /// the packet is buffered for potential replay through a fallback backend
+  /// while the probe is active. A failed send (including EAGAIN) does not
+  /// mutate replay state, so a later probe advance only replays history
+  /// FFmpeg actually accepted.
   pub fn send_packet(&mut self, packet: &Packet) -> Result<()> {
+    self
+      .state
+      .inner
+      .send_packet(packet)
+      .map_err(Error::Ffmpeg)?;
     if let Some(probe) = self.probe.as_mut() {
       probe.buffered_packets.push(packet.clone());
     }
-    self.state.inner.send_packet(packet).map_err(Error::Ffmpeg)
+    Ok(())
   }
 
   /// Signal end-of-stream to the decoder; remaining frames can be drained
-  /// with [`Self::receive_frame`]. Recorded for replay if probe is active.
+  /// with [`Self::receive_frame`]. Recorded for replay only if the underlying
+  /// `send_eof` succeeds.
   pub fn send_eof(&mut self) -> Result<()> {
+    self.state.inner.send_eof().map_err(Error::Ffmpeg)?;
     if let Some(probe) = self.probe.as_mut() {
       probe.eof_sent = true;
     }
-    self.state.inner.send_eof().map_err(Error::Ffmpeg)
+    Ok(())
   }
 
   /// Receive a CPU-side decoded frame.
@@ -385,9 +395,14 @@ impl VideoDecoder {
     let mut ctx = Context::from_parameters(parameters)?;
     let av_type = backend.av_hwdevice_type();
 
-    // Verify the codec advertises this hwaccel.
-    let hw_pix_fmt = find_hw_pix_fmt(unsafe { codec.as_ptr() }, av_type)
-      .ok_or(Error::BackendUnsupportedByCodec(backend))?;
+    // Verify the codec advertises this hwaccel. We do *not* read the
+    // codec's advertised pix_fmt — we use the hardcoded constant from
+    // `Backend::hw_pixel_format` so no FFmpeg-supplied enum value is ever
+    // interpreted as `AVPixelFormat`.
+    if !codec_supports_hwaccel(unsafe { codec.as_ptr() }, av_type) {
+      return Err(Error::BackendUnsupportedByCodec(backend));
+    }
+    let hw_pix_fmt = backend.hw_pixel_format();
 
     // Create the device context.
     let mut hw_device_ref: *mut AVBufferRef = ptr::null_mut();
@@ -402,7 +417,10 @@ impl VideoDecoder {
       });
     }
 
-    let callback_state = Box::into_raw(Box::new(CallbackState { wanted: hw_pix_fmt }));
+    let callback_state = Box::into_raw(Box::new(CallbackState {
+      wanted: hw_pix_fmt,
+      wanted_int: hw_pix_fmt as i32,
+    }));
     // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
     // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
     // use (we keep our own ref in `hw_device_ref` for cleanup).
diff --git a/src/ffi.rs b/src/ffi.rs
index 78ee80c..794d474 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -1,5 +1,15 @@
 //! FFI shims used by the decoder. Kept in one place so the unsafe surface is
 //! easy to audit.
+//!
+//! All reads of `AVPixelFormat` / `AVHWDeviceType` values returned by FFmpeg
+//! at runtime go through `ptr::read::<i32>` after a pointer cast, never
+//! through the bindgen-generated Rust enum. The enums are `#[repr(i32)]`
+//! and constructing them from a value not in the listed discriminants is
+//! undefined behavior — exactly the situation header/library skew creates.
+//! See the doc comments on individual functions for what is read as raw
+//! integer vs. constructed from a known constant.
+
+use std::ptr;
 
 use ffmpeg_next::ffi::{
   avcodec_get_hw_config, AVCodec, AVCodecContext, AVHWDeviceType, AVPixelFormat,
@@ -10,22 +20,29 @@ use ffmpeg_next::ffi::{
 /// the correct hardware pixel format without globals. One instance per
 /// decoder; freed by [`crate::VideoDecoder`] after the codec context is
 /// dropped.
+///
+/// `wanted` is set from a hardcoded `AVPixelFormat` constant in our bindings
+/// (via `Backend::hw_pixel_format`), so it is always a valid enum value. We
+/// also store its raw `i32` so the callback can compare against the offered
+/// list without going through enum reads.
 #[repr(C)]
 pub(crate) struct CallbackState {
-  /// Hardware pixel format we want the decoder to produce.
+  /// Hardware pixel format we want the decoder to produce. Constructed
+  /// from a known constant; safe to use as the callback's return value.
   pub(crate) wanted: AVPixelFormat,
+  /// Same value as `wanted` cast to `i32`, cached so the callback's
+  /// pix_fmts walk doesn't have to convert per iteration.
+  pub(crate) wanted_int: i32,
 }
 
 /// `AVCodecContext::get_format` callback. FFmpeg invokes it with the list of
 /// pixel formats the codec is willing to output for the current stream.
 ///
-/// Returns the configured hardware format if present; otherwise
-/// [`AVPixelFormat::AV_PIX_FMT_NONE`], which causes the decoder to fail. The
-/// failure surfaces as a normal `Error::Ffmpeg` from
-/// [`crate::VideoDecoder::receive_frame`]; for `VideoDecoder::open` callers
-/// the probe loop tears down and retries with the next backend (replaying
-/// buffered packets), so software fallback happens at the decoder level
-/// rather than silently in-context.
+/// The offered list is walked as `*const i32` (cast from `*const AVPixelFormat`)
+/// to avoid constructing the bindgen enum from values that may not be in our
+/// build's discriminant set. The return value is either `wanted` (a known
+/// constant) or `AV_PIX_FMT_NONE` (also a known constant) — both safe to
+/// produce as `AVPixelFormat`.
 pub(crate) unsafe extern "C" fn get_hw_format(
   ctx: *mut AVCodecContext,
   pix_fmts: *const AVPixelFormat,
@@ -38,41 +55,68 @@ pub(crate) unsafe extern "C" fn get_hw_format(
   // codec context's drop runs). When opaque is null we treat the call as
   // strict — a stray invocation cannot silently downgrade.
   let state = unsafe { (*ctx).opaque as *const CallbackState };
-  let wanted = if state.is_null() {
-    AVPixelFormat::AV_PIX_FMT_NONE
+  let (wanted, wanted_int) = if state.is_null() {
+    (
+      AVPixelFormat::AV_PIX_FMT_NONE,
+      AVPixelFormat::AV_PIX_FMT_NONE as i32,
+    )
   } else {
-    unsafe { (*state).wanted }
+    unsafe { ((*state).wanted, (*state).wanted_int) }
   };
 
-  let mut p = pix_fmts;
-  while unsafe { *p } != AVPixelFormat::AV_PIX_FMT_NONE {
-    if unsafe { *p } == wanted {
+  // Walk the offered list as i32. The pointer cast is sound because
+  // `AVPixelFormat` is `#[repr(i32)]` (same size and alignment as i32).
+  // Reading as i32 cannot be UB regardless of the value FFmpeg wrote.
+  let mut p = pix_fmts as *const i32;
+  let none_int = AVPixelFormat::AV_PIX_FMT_NONE as i32;
+  loop {
+    // SAFETY: FFmpeg guarantees the list is terminated by AV_PIX_FMT_NONE.
+    // We bail at the sentinel; reads up to and including it are in-bounds.
+    let v = unsafe { ptr::read(p) };
+    if v == none_int {
+      return AVPixelFormat::AV_PIX_FMT_NONE;
+    }
+    if v == wanted_int {
       return wanted;
     }
     p = unsafe { p.add(1) };
   }
-  AVPixelFormat::AV_PIX_FMT_NONE
 }
 
-/// Walk the codec's `AVCodecHWConfig` table and return the hardware pixel
-/// format associated with `device_type`, if the codec advertises one that
-/// uses the `HW_DEVICE_CTX` setup method.
-pub(crate) fn find_hw_pix_fmt(
-  codec: *const AVCodec,
-  device_type: AVHWDeviceType,
-) -> Option<AVPixelFormat> {
+/// Walk the codec's `AVCodecHWConfig` table and return whether the codec
+/// advertises support for `device_type` via the `HW_DEVICE_CTX` setup method.
+///
+/// We do not return the codec's advertised `pix_fmt` — we know it already
+/// from [`crate::backend::Backend::hw_pixel_format`] (a hardcoded constant
+/// from our bindings). All reads from the FFmpeg-supplied `AVCodecHWConfig`
+/// are performed as raw integers via `addr_of!` + `ptr::read::<i32>` to
+/// avoid copying or interpreting enum-typed fields whose runtime values
+/// might not match our build's discriminant set.
+pub(crate) fn codec_supports_hwaccel(codec: *const AVCodec, device_type: AVHWDeviceType) -> bool {
   debug_assert!(!codec.is_null());
+  let device_type_int = device_type as i32;
   let mut i = 0;
   loop {
     // SAFETY: `avcodec_get_hw_config` returns null past the end; we stop then.
     let cfg = unsafe { avcodec_get_hw_config(codec, i) };
     if cfg.is_null() {
-      return None;
+      return false;
     }
-    let cfg = unsafe { *cfg };
-    let supports_device_ctx = cfg.methods & (AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX as i32) != 0;
-    if supports_device_ctx && cfg.device_type == device_type {
-      return Some(cfg.pix_fmt);
+    // Read each field as raw integer rather than copying the whole struct
+    // (which would interpret `pix_fmt` and `device_type` as their enum types).
+    // SAFETY: `cfg` is non-null and points to a valid `AVCodecHWConfig` for
+    // the lifetime of the call; `addr_of!` projects to a sized field; the
+    // `*const i32` cast is sound because `methods` is `c_int` (i32) and
+    // `device_type` is `AVHWDeviceType` (`#[repr(u32)]`, but FFmpeg's
+    // assigned values fit in i32 and the runtime layout is i32-sized).
+    let methods: i32 = unsafe { ptr::read(ptr::addr_of!((*cfg).methods)) };
+    let cfg_device_type_int: i32 =
+      unsafe { ptr::read(ptr::addr_of!((*cfg).device_type) as *const i32) };
+
+    if methods & (AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX as i32) != 0
+      && cfg_device_type_int == device_type_int
+    {
+      return true;
     }
     i += 1;
   }
@@ -81,7 +125,6 @@ pub(crate) fn find_hw_pix_fmt(
 #[cfg(test)]
 mod tests {
   use super::*;
-  use std::ptr;
 
   // The callback derefs `(*ctx).opaque`, so we need a real-looking
   // AVCodecContext. We construct a zeroed one (the callback only reads opaque).
@@ -100,22 +143,32 @@ mod tests {
     }
   }
 
-  fn run(state: &CallbackState, mut offered: Vec<AVPixelFormat>) -> AVPixelFormat {
-    offered.push(AVPixelFormat::AV_PIX_FMT_NONE);
+  fn make_state(wanted: AVPixelFormat) -> CallbackState {
+    CallbackState {
+      wanted,
+      wanted_int: wanted as i32,
+    }
+  }
+
+  fn run(state: &CallbackState, mut offered: Vec<i32>) -> AVPixelFormat {
+    // Build the offered list as raw i32, terminated by AV_PIX_FMT_NONE.
+    offered.push(AVPixelFormat::AV_PIX_FMT_NONE as i32);
     let ctx = FakeCtx::new(state as *const _ as *mut _);
-    unsafe { get_hw_format(ctx.0, offered.as_ptr()) }
+    // SAFETY: we cast the i32 buffer pointer to *const AVPixelFormat
+    // because that's the function's declared signature. The callback only
+    // ever reads through *const i32 internally, so this transit through
+    // *const AVPixelFormat is purely a type system formality.
+    unsafe { get_hw_format(ctx.0, offered.as_ptr() as *const AVPixelFormat) }
   }
 
   #[test]
   fn returns_wanted_hw_format_when_offered() {
-    let state = CallbackState {
-      wanted: AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
-    };
+    let state = make_state(AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
     let got = run(
       &state,
       vec![
-        AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
-        AVPixelFormat::AV_PIX_FMT_NV12,
+        AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX as i32,
+        AVPixelFormat::AV_PIX_FMT_NV12 as i32,
       ],
     );
     assert_eq!(got, AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
@@ -123,14 +176,12 @@ mod tests {
 
   #[test]
   fn returns_none_when_wanted_absent() {
-    let state = CallbackState {
-      wanted: AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
-    };
+    let state = make_state(AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
     let got = run(
       &state,
       vec![
-        AVPixelFormat::AV_PIX_FMT_NV12,
-        AVPixelFormat::AV_PIX_FMT_YUV420P,
+        AVPixelFormat::AV_PIX_FMT_NV12 as i32,
+        AVPixelFormat::AV_PIX_FMT_YUV420P as i32,
       ],
     );
     assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
@@ -142,11 +193,28 @@ mod tests {
     let ctx_raw = Box::into_raw(boxed);
     unsafe { (*ctx_raw).opaque = ptr::null_mut() };
     let offered = [
-      AVPixelFormat::AV_PIX_FMT_NV12,
-      AVPixelFormat::AV_PIX_FMT_NONE,
+      AVPixelFormat::AV_PIX_FMT_NV12 as i32,
+      AVPixelFormat::AV_PIX_FMT_NONE as i32,
     ];
-    let got = unsafe { get_hw_format(ctx_raw, offered.as_ptr()) };
+    let got = unsafe { get_hw_format(ctx_raw, offered.as_ptr() as *const AVPixelFormat) };
     assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
     unsafe { drop(Box::from_raw(ctx_raw)) };
   }
+
+  #[test]
+  fn unknown_offered_value_is_skipped_without_ub() {
+    // Simulate a header-skewed FFmpeg that offers a pixel-format value we
+    // don't have a binding constant for (e.g. some future format). The
+    // callback walks the list as i32 — no enum is constructed from that
+    // value, so this read is sound.
+    let state = make_state(AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
+    let got = run(
+      &state,
+      vec![
+        99_999_i32, // imaginary unknown
+        AVPixelFormat::AV_PIX_FMT_NV12 as i32,
+      ],
+    );
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
+  }
 }
diff --git a/src/frame.rs b/src/frame.rs
index f221b68..d6ceca6 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -1,23 +1,27 @@
 //! CPU-side decoded video frame.
 //!
-//! Wraps `ffmpeg_next::frame::Video` so callers cannot reach the upstream
-//! `format()` accessor, which constructs an `AVPixelFormat` enum from the
-//! raw integer FFmpeg writes into `AVFrame.format`. That conversion is UB
-//! when the value isn't in the bindgen-generated enum (library/header skew,
-//! a new pixel format added upstream, etc.). The wrapper exposes
-//! [`Frame::pix_fmt`] which reads the field as a plain `i32` — sound for any
-//! value FFmpeg can produce — and accessors are limited to fields whose
-//! reads do not invoke the same hazard.
+//! Wraps `ffmpeg_next::frame::Video`. Accessors read raw `AVFrame` fields
+//! (`format`, `linesize`, `data`, `width`, `height`) directly; `pts` uses
+//! ffmpeg-next's enum-free `pts()`. None go through `Video::format()` /
+//! `plane_height()` / `plane_width()` / `data()` — those construct `AVPixelFormat` from the
+//! frame's raw `format` integer via `transmute`, which is undefined behavior
+//! when the value isn't in the build's bindgen-generated discriminant set
+//! (the exact failure mode this crate is designed to survive).
 //!
-//! Compare formats against integer constants taken from the FFI layer, e.g.
+//! Plane lengths for [`Frame::data`] are computed from a hardcoded chroma-
+//! subsampling table keyed on the safe `pix_fmt()` integer, covering only
+//! the formats `hwdecode` produces (the NV* and P0xx/P2xx/P4xx families
+//! after `av_hwframe_transfer_data`). For any other format, [`Frame::data`]
+//! returns `None` rather than guessing at a slice length.
 //!
-//! ```ignore
-//! use ffmpeg_next::ffi::AVPixelFormat;
-//! if frame.pix_fmt() == AVPixelFormat::AV_PIX_FMT_NV12 as i32 { ... }
-//! ```
+//! Compare formats against integer constants in [`crate::pix_fmt`].
+
+use std::slice;
 
 use ffmpeg_next::frame;
 
+use crate::pix_fmt;
+
 /// CPU-side decoded video frame produced by [`crate::VideoDecoder`].
 pub struct Frame {
   inner: frame::Video,
@@ -34,45 +38,93 @@ impl Frame {
 
   /// Width in pixels.
   pub fn width(&self) -> u32 {
-    self.inner.width()
+    // SAFETY: AVFrame.width is c_int; safe to read regardless of value.
+    unsafe { (*self.inner.as_ptr()).width as u32 }
   }
 
   /// Height in pixels.
   pub fn height(&self) -> u32 {
-    self.inner.height()
+    // SAFETY: AVFrame.height is c_int.
+    unsafe { (*self.inner.as_ptr()).height as u32 }
   }
 
   /// Pixel format, returned as the raw `i32` value FFmpeg wrote to
   /// `AVFrame.format`. Sound regardless of the linked FFmpeg version —
   /// no `AVPixelFormat` enum is constructed.
   ///
-  /// Compare against integer constants from `ffmpeg_next::ffi`, e.g.
-  /// `frame.pix_fmt() == AVPixelFormat::AV_PIX_FMT_NV12 as i32`.
+  /// Compare against constants in [`crate::pix_fmt`].
   pub fn pix_fmt(&self) -> i32 {
-    // SAFETY: `AVFrame.format` is bound as `c_int`; reading it yields a
-    // plain integer with no validity invariants.
+    // SAFETY: AVFrame.format is bound as c_int.
     unsafe { (*self.inner.as_ptr()).format }
   }
 
-  /// Presentation timestamp in stream time base, or `None` if the frame
-  /// carries `AV_NOPTS_VALUE`.
+  /// Presentation timestamp in stream time base, or `None` for
+  /// `AV_NOPTS_VALUE`.
   pub fn pts(&self) -> Option<i64> {
+    // ffmpeg-next's Frame::pts performs no enum conversion; safe to use.
     self.inner.pts()
   }
 
-  /// Number of populated planes (e.g. 3 for `YUV420P`, 2 for `NV12`).
+  /// Number of populated planes (1 for packed formats, 2 for NV12/P010,
+  /// 3 for planar YUV, etc.). Computed by scanning `linesize` for the
+  /// first zero entry — no enum reads.
   pub fn planes(&self) -> usize {
-    self.inner.planes()
+    // SAFETY: AVFrame.linesize is `[c_int; 8]`; reads are sound.
+    unsafe {
+      let linesize = &(*self.inner.as_ptr()).linesize;
+      for (i, ls) in linesize.iter().enumerate() {
+        if *ls == 0 {
+          return i;
+        }
+      }
+      linesize.len()
+    }
   }
 
-  /// Bytes per row for `plane`. Panics if `plane >= planes()`.
+  /// Bytes per row for `plane`. Reads `AVFrame.linesize[plane]` directly.
+  /// Panics if `plane >= planes()`.
   pub fn stride(&self, plane: usize) -> usize {
-    self.inner.stride(plane)
+    let n = self.planes();
+    assert!(
+      plane < n,
+      "stride: plane {plane} out of bounds (planes={n})"
+    );
+    // SAFETY: bounds-checked above; linesize is `[c_int; 8]`.
+    unsafe { (*self.inner.as_ptr()).linesize[plane] as usize }
   }
 
-  /// Pixel data for `plane`. Panics if `plane >= planes()`.
-  pub fn data(&self, plane: usize) -> &[u8] {
-    self.inner.data(plane)
+  /// Pixel data for `plane`.
+  ///
+  /// Returns `None` when the frame's pixel format is not one of the
+  /// hardware-output formats listed in [`crate::pix_fmt`] — we cannot
+  /// safely compute the plane size for an unknown layout. Returns `None`
+  /// for an out-of-bounds plane index, a null data pointer, or an empty
+  /// frame.
+  ///
+  /// Currently supported (post-`av_hwframe_transfer_data`):
+  /// - 4:2:0 semi-planar 8-bit: `NV12`, `NV21`
+  /// - 4:2:2 semi-planar 8-bit: `NV16`
+  /// - 4:4:4 semi-planar 8-bit: `NV24`
+  /// - 4:2:0 semi-planar 10/12/16-bit: `P010LE`/`P010BE`/`P012LE`/`P016LE`
+  /// - 4:2:2 semi-planar 10/12/16-bit: `P210LE`/`P212LE`/`P216LE`
+  /// - 4:4:4 semi-planar 10/12/16-bit: `P410LE`/`P412LE`/`P416LE`
+  pub fn data(&self, plane: usize) -> Option<&[u8]> {
+    if plane >= self.planes() {
+      return None;
+    }
+    let stride = self.stride(plane);
+    let plane_height = plane_height_for(self.pix_fmt(), plane, self.height() as usize)?;
+    let len = stride.checked_mul(plane_height)?;
+    // SAFETY: bounds-checked plane index above. We trust FFmpeg to populate
+    // `data[plane]` validly when `linesize[plane]` is non-zero (which we
+    // verified via `planes()`); null-check guards against edge cases.
+    unsafe {
+      let ptr = (*self.inner.as_ptr()).data[plane];
+      if ptr.is_null() {
+        return None;
+      }
+      Some(slice::from_raw_parts(ptr, len))
+    }
   }
 
   /// Crate-internal: hand the wrapped frame to FFmpeg / our decoder code.
@@ -87,6 +139,37 @@ impl Default for Frame {
   }
 }
 
+/// Number of rows in `plane` for a frame of `frame_height` and the given
+/// pixel format. `None` for formats not in the supported HW-output set.
+fn plane_height_for(pix_fmt_int: i32, plane: usize, frame_height: usize) -> Option<usize> {
+  match pix_fmt_int {
+    // 4:2:0 semi-planar — Y full height, chroma half height.
+    pix_fmt::NV12
+    | pix_fmt::NV21
+    | pix_fmt::P010LE
+    | pix_fmt::P010BE
+    | pix_fmt::P012LE
+    | pix_fmt::P016LE => match plane {
+      0 => Some(frame_height),
+      1 => Some(frame_height.div_ceil(2)),
+      _ => None,
+    },
+    // 4:2:2 / 4:4:4 semi-planar — both planes full height.
+    pix_fmt::NV16
+    | pix_fmt::NV24
+    | pix_fmt::P210LE
+    | pix_fmt::P212LE
+    | pix_fmt::P216LE
+    | pix_fmt::P410LE
+    | pix_fmt::P412LE
+    | pix_fmt::P416LE => match plane {
+      0 | 1 => Some(frame_height),
+      _ => None,
+    },
+    _ => None,
+  }
+}
+
 #[cfg(test)]
 mod tests {
   use super::*;
@@ -99,6 +182,15 @@ mod tests {
     assert_eq!(f.pts(), None);
     // AVFrame.format defaults to -1 (AV_PIX_FMT_NONE) for an empty frame.
     assert_eq!(f.pix_fmt(), -1);
+    // No active planes for an empty frame (all linesize entries are 0).
+    assert_eq!(f.planes(), 0);
+  }
+
+  #[test]
+  fn data_returns_none_for_unknown_format() {
+    let f = Frame::empty();
+    // pix_fmt is NONE (-1), not in the supported set.
+    assert!(f.data(0).is_none());
   }
 
   #[test]
@@ -106,4 +198,18 @@ mod tests {
     fn check<T: Send>() {}
     check::<Frame>();
   }
+
+  #[test]
+  fn plane_height_table_covers_supported_formats() {
+    // Spot-check the chroma subsampling table.
+    assert_eq!(plane_height_for(pix_fmt::NV12, 0, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::NV12, 1, 1080), Some(540));
+    assert_eq!(plane_height_for(pix_fmt::NV12, 1, 1081), Some(541));
+    assert_eq!(plane_height_for(pix_fmt::P010LE, 1, 1080), Some(540));
+    assert_eq!(plane_height_for(pix_fmt::NV16, 1, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::NV24, 1, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::P416LE, 1, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::NONE, 0, 1080), None);
+    assert_eq!(plane_height_for(pix_fmt::NV12, 2, 1080), None);
+  }
 }

From fa30939968282e84a3752df080c96e9346307a88 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 18:09:51 +1200
Subject: [PATCH 06/27] update

---
 src/decoder.rs | 261 +++++++++++++++++++++++++++++++++++++++++--------
 src/error.rs   |   9 +-
 src/frame.rs   |  99 ++++++++++++++++---
 3 files changed, 315 insertions(+), 54 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index d6cffe3..fcaea44 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -1,14 +1,27 @@
-use std::{mem::ManuallyDrop, ptr};
+use std::{collections::VecDeque, mem::ManuallyDrop, ptr};
 
 use ffmpeg_next::{
   codec::{self, Context},
   ffi::{
-    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_unref, av_hwdevice_ctx_create,
-    av_hwframe_transfer_data, AVBufferRef,
+    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
+    av_hwdevice_ctx_create, av_hwframe_transfer_data, AVBufferRef, AVCodec,
   },
   frame, Codec, Packet, Rational,
 };
 
+/// Local FFI shim: `avcodec_find_decoder` declared with `c_int` instead of
+/// the bindgen `AVCodecID` enum. Constructing `AVCodecID` from a runtime
+/// integer that isn't in our build's discriminant set is UB; calling the
+/// C function with a raw int avoids that boundary entirely. Both Rust
+/// declarations resolve to the same C symbol at link time.
+mod c_shims {
+  use super::AVCodec;
+  use libc::c_int;
+  extern "C" {
+    pub fn avcodec_find_decoder(id: c_int) -> *const AVCodec;
+  }
+}
+
 use crate::{
   backend::{self, Backend},
   error::{Error, Result},
@@ -42,6 +55,13 @@ pub struct VideoDecoder {
   /// backend, then `None`. While `Some`, packets are buffered for replay and
   /// non-transient errors / decoder failures advance to the next backend.
   probe: Option<ProbeState>,
+  /// CPU-side frames produced by a candidate decoder during probe replay
+  /// (when its internal queue filled and we had to drain output before the
+  /// next `send_packet`). Already transferred from the candidate's
+  /// `AVHWFramesContext` to a CPU frame, so they remain valid after the
+  /// candidate state is committed. [`Self::receive_frame`] dequeues these
+  /// FIFO before reading from `state.inner`.
+  pending_frames: VecDeque<frame::Video>,
 }
 
 /// Owned FFmpeg state for one open codec context. Has its own `Drop` so we
@@ -121,8 +141,7 @@ impl VideoDecoder {
   /// `open` cannot return without a working decoder for any codec libavcodec
   /// supports.
   pub fn open(parameters: codec::Parameters) -> Result<Self> {
-    let codec_id = codec::Id::from(unsafe { (*parameters.as_ptr()).codec_id });
-    let codec = ffmpeg_next::decoder::find(codec_id).ok_or(Error::NoCodec(codec_id))?;
+    let codec = find_decoder(&parameters)?;
     let order = backend::probe_order();
 
     let mut attempts: Vec<(Backend, Box<Error>)> = Vec::new();
@@ -142,6 +161,7 @@ impl VideoDecoder {
             state,
             hw_frame: frame::Video::empty(),
             probe,
+            pending_frames: VecDeque::new(),
           });
         }
         Err(e) => {
@@ -161,13 +181,13 @@ impl VideoDecoder {
   /// `AV_PIX_FMT_NONE`, the decoder errors out). The caller is responsible
   /// for retrying with `Backend::Software` or another backend if desired.
   pub fn open_with(parameters: codec::Parameters, backend: Backend) -> Result<Self> {
-    let codec_id = codec::Id::from(unsafe { (*parameters.as_ptr()).codec_id });
-    let codec = ffmpeg_next::decoder::find(codec_id).ok_or(Error::NoCodec(codec_id))?;
+    let codec = find_decoder(&parameters)?;
     let state = Self::build_state(parameters, codec, backend)?;
     Ok(Self {
       state,
       hw_frame: frame::Video::empty(),
       probe: None,
+      pending_frames: VecDeque::new(),
     })
   }
 
@@ -199,32 +219,64 @@ impl VideoDecoder {
     self.state.inner.frame_rate()
   }
 
-  /// Submit a packet to the decoder. On success — and only on success —
-  /// the packet is buffered for potential replay through a fallback backend
-  /// while the probe is active. A failed send (including EAGAIN) does not
-  /// mutate replay state, so a later probe advance only replays history
-  /// FFmpeg actually accepted.
+  /// Submit a packet to the decoder.
+  ///
+  /// On success — and only on success — the packet is buffered for potential
+  /// replay through a fallback backend while the probe is active. EAGAIN
+  /// (decoder needs `receive_frame` to drain output first) propagates as
+  /// normal backpressure; the caller drains then retries.
+  ///
+  /// While the probe is active, a non-transient error (e.g. the active HW
+  /// backend rejecting this stream's geometry on first packet) advances the
+  /// probe to the next candidate and retries the packet there. The caller
+  /// observes only the eventual success or, if the probe is exhausted, the
+  /// final error.
   pub fn send_packet(&mut self, packet: &Packet) -> Result<()> {
-    self
-      .state
-      .inner
-      .send_packet(packet)
-      .map_err(Error::Ffmpeg)?;
-    if let Some(probe) = self.probe.as_mut() {
-      probe.buffered_packets.push(packet.clone());
+    loop {
+      match self.state.inner.send_packet(packet) {
+        Ok(()) => {
+          if let Some(probe) = self.probe.as_mut() {
+            probe.buffered_packets.push(packet.clone());
+          }
+          return Ok(());
+        }
+        Err(e) if is_transient(&e) => {
+          // Normal backpressure / EOF — pass through unchanged.
+          return Err(Error::Ffmpeg(e));
+        }
+        Err(e) => {
+          if self.probe.is_some() && self.advance_probe()? {
+            continue;
+          }
+          return Err(Error::Ffmpeg(e));
+        }
+      }
     }
-    Ok(())
   }
 
-  /// Signal end-of-stream to the decoder; remaining frames can be drained
-  /// with [`Self::receive_frame`]. Recorded for replay only if the underlying
-  /// `send_eof` succeeds.
+  /// Signal end-of-stream to the decoder.
+  ///
+  /// Recorded for replay only if the underlying `send_eof` succeeds. While
+  /// the probe is active, non-transient errors trigger probe advance and
+  /// retry, matching `send_packet`'s behaviour.
   pub fn send_eof(&mut self) -> Result<()> {
-    self.state.inner.send_eof().map_err(Error::Ffmpeg)?;
-    if let Some(probe) = self.probe.as_mut() {
-      probe.eof_sent = true;
+    loop {
+      match self.state.inner.send_eof() {
+        Ok(()) => {
+          if let Some(probe) = self.probe.as_mut() {
+            probe.eof_sent = true;
+          }
+          return Ok(());
+        }
+        Err(e) if is_transient(&e) => return Err(Error::Ffmpeg(e)),
+        Err(e) => {
+          if self.probe.is_some() && self.advance_probe()? {
+            continue;
+          }
+          return Err(Error::Ffmpeg(e));
+        }
+      }
     }
-    Ok(())
   }
 
   /// Receive a CPU-side decoded frame.
@@ -238,18 +290,25 @@ impl VideoDecoder {
   /// error, transfer error, copy_props error, or a CPU-format frame from a
   /// HW-opened context) tears down the current decoder and advances to the
   /// next hardware backend in probe order, replaying buffered packets
-  /// through it. The caller observes only the eventual successful frame
-  /// (or, if every backend has been exhausted, the underlying error).
+  /// through it. Frames the candidate produced during replay (drained when
+  /// `send_packet` returned EAGAIN) are queued and delivered FIFO via this
+  /// method, so the caller never loses initial frames after a fallback.
   ///
   /// This crate is hardware-only: there is no software fallback inside the
   /// decoder. If every backend is exhausted, the failure surfaces as the
-  /// last decoder error (or [`Error::HwBackendProducedSwFrame`] for the
-  /// degraded-CPU-frame case). Callers handle software fallback themselves.
+  /// last decoder error. Callers handle software fallback themselves.
   ///
   /// Returns the same transient signals as `ffmpeg::decoder::Video`:
   /// `Error::Ffmpeg(Other { errno: EAGAIN })` when no frame is ready and
   /// more packets must be sent, and `Error::Ffmpeg(Eof)` once fully drained.
   pub fn receive_frame(&mut self, frame: &mut Frame) -> Result<()> {
+    // Pre-drain frames queued during probe replay. They are already CPU-side
+    // (transferred at drain time, when the candidate's HW context was alive)
+    // so we just move them into the caller's slot.
+    if self.try_pop_pending(frame) {
+      return Ok(());
+    }
+
     loop {
       let res = self.state.inner.receive_frame(&mut self.hw_frame);
       match res {
@@ -258,6 +317,11 @@ impl VideoDecoder {
             return Err(Error::Ffmpeg(e));
           }
           if self.probe.is_some() && self.advance_probe()? {
+            // Probe advance may have populated `pending_frames`; deliver
+            // one of those before reading more from the new candidate.
+            if self.try_pop_pending(frame) {
+              return Ok(());
+            }
             continue;
           }
           return Err(Error::Ffmpeg(e));
@@ -277,6 +341,9 @@ impl VideoDecoder {
             Err(e) => {
               if self.probe.is_some() && self.advance_probe()? {
                 unsafe { av_frame_unref(frame.as_inner_mut().as_mut_ptr()) };
+                if self.try_pop_pending(frame) {
+                  return Ok(());
+                }
                 continue;
               }
               return Err(Error::Ffmpeg(e));
@@ -287,6 +354,24 @@ impl VideoDecoder {
     }
   }
 
+  /// Pop one queued frame (produced by a candidate decoder during probe
+  /// replay) into the caller's slot. Returns `true` when a frame was
+  /// delivered, `false` when the queue was empty.
+  fn try_pop_pending(&mut self, frame: &mut Frame) -> bool {
+    let Some(mut buffered) = self.pending_frames.pop_front() else {
+      return false;
+    };
+    // SAFETY: `buffered` is a CPU-side AVFrame we previously transferred
+    // and pushed into the queue; both pointers are valid.
+    unsafe {
+      av_frame_unref(frame.as_inner_mut().as_mut_ptr());
+      av_frame_move_ref(frame.as_inner_mut().as_mut_ptr(), buffered.as_mut_ptr());
+    }
+    // Probe semantics: delivering a frame collapses the probe.
+    self.probe = None;
+    true
+  }
+
   /// Flush internal buffers (e.g. after a seek). Resets probe-time buffer if
   /// active, since post-seek packets do not align with replayed history.
   pub fn flush(&mut self) {
@@ -342,13 +427,39 @@ impl VideoDecoder {
       // We borrow the buffer immutably; if replay fails the candidate's Drop
       // releases the FFmpeg state and the buffer is preserved for the next
       // attempt.
+      //
+      // EAGAIN handling: `avcodec_send_packet` may return EAGAIN when its
+      // internal queue is full and the user is expected to drain output
+      // first (B-frame buffering, candidate-specific queue depth, etc.).
+      // This is normal flow — we drain frames out of the candidate, transfer
+      // each one to a CPU frame, and stash them in `local_pending`. After
+      // commit they move to `self.pending_frames` and are delivered FIFO
+      // by `receive_frame`, so the caller never loses initial frames.
+      let mut local_pending: VecDeque<frame::Video> = VecDeque::new();
       let replay_result: std::result::Result<(), ffmpeg_next::Error> = {
         let probe = self.probe.as_ref().expect("probe state present");
+        let mut hw_buf = frame::Video::empty();
         let mut r: std::result::Result<(), ffmpeg_next::Error> = Ok(());
-        for pkt in &probe.buffered_packets {
-          if let Err(e) = candidate_state.inner.send_packet(pkt) {
-            r = Err(e);
-            break;
+
+        'replay: for pkt in &probe.buffered_packets {
+          loop {
+            match candidate_state.inner.send_packet(pkt) {
+              Ok(()) => break,
+              Err(e) if is_eagain(&e) => {
+                // Drain candidate output (transferring + queueing each frame)
+                // and retry the same packet.
+                if let Err(de) =
+                  drain_into_pending(&mut candidate_state.inner, &mut hw_buf, &mut local_pending)
+                {
+                  r = Err(de);
+                  break 'replay;
+                }
+              }
+              Err(e) => {
+                r = Err(e);
+                break 'replay;
+              }
+            }
           }
         }
         if r.is_ok() && probe.eof_sent {
@@ -361,8 +472,11 @@ impl VideoDecoder {
 
       if let Err(e) = replay_result {
         tracing::warn!(?next_backend, error = %e, "hwdecode: candidate replay failed");
-        // Drop candidate explicitly so its FFI cleanup runs now.
+        // Drop candidate explicitly so its FFI cleanup runs now. Discard any
+        // frames we drained from this candidate — they're tied to a decoder
+        // we're throwing away.
         drop(candidate_state);
+        drop(local_pending);
         self
           .probe
           .as_mut()
@@ -372,9 +486,11 @@ impl VideoDecoder {
         continue;
       }
 
-      // Commit: install the candidate, clear residual hw_frame, pop backend.
+      // Commit: install the candidate, clear residual hw_frame, queue the
+      // drained frames for the caller, and pop the now-active backend.
       self.state = candidate_state;
       unsafe { av_frame_unref(self.hw_frame.as_mut_ptr()) };
+      self.pending_frames.append(&mut local_pending);
       self
         .probe
         .as_mut()
@@ -484,8 +600,75 @@ unsafe fn transfer_hw_frame(
 /// `EAGAIN` and `EOF` are normal flow signals from `avcodec_receive_frame`
 /// and must not be treated as backend failures.
 fn is_transient(e: &ffmpeg_next::Error) -> bool {
+  is_eagain(e) || matches!(e, ffmpeg_next::Error::Eof)
+}
+
+/// Just `EAGAIN` (separate from EOF — the FFmpeg send/receive state machine
+/// distinguishes "drain output and retry" from "stream over").
+fn is_eagain(e: &ffmpeg_next::Error) -> bool {
   matches!(e, ffmpeg_next::Error::Other { errno } if *errno == ffmpeg_next::error::EAGAIN)
-    || matches!(e, ffmpeg_next::Error::Eof)
+}
+
+/// Look up the decoder for `parameters` without going through the bindgen
+/// `AVCodecID` Rust enum. Reads the codec_id field as raw `u32` via
+/// `addr_of!` + `ptr::read` so a value not in our build's discriminant
+/// set never invokes UB.
+fn find_decoder(parameters: &codec::Parameters) -> Result<Codec> {
+  // SAFETY: parameters owns a valid AVCodecParameters; addr_of! projects
+  // to the codec_id field; the *const u32 cast is sound because AVCodecID
+  // is `#[repr(u32)]` (same size and alignment as u32). Reading as u32
+  // cannot be UB regardless of the value FFmpeg wrote.
+  let raw_id: u32 =
+    unsafe { ptr::read(ptr::addr_of!((*parameters.as_ptr()).codec_id) as *const u32) };
+
+  // Call C `avcodec_find_decoder` via our local `c_int`-typed shim — we
+  // never construct an `AVCodecID` enum from `raw_id`. The C function
+  // returns NULL for unknown ids, which we surface as `Error::NoCodec`.
+  // SAFETY: avcodec_find_decoder is a pure FFmpeg lookup; passing any
+  // c_int is sound (returns NULL for unknown).
+  let codec_ptr = unsafe { c_shims::avcodec_find_decoder(raw_id as libc::c_int) };
+  if codec_ptr.is_null() {
+    return Err(Error::NoCodec(raw_id));
+  }
+  // SAFETY: codec_ptr is a non-null *const AVCodec into FFmpeg's static
+  // codec table; it lives for the duration of the program.
+  Ok(unsafe { Codec::wrap(codec_ptr) })
+}
+
+/// Drain output frames from a candidate decoder during probe replay,
+/// transferring each one from the candidate's HW context to a fresh CPU
+/// frame and queueing it. Returns `Ok(())` once the candidate signals
+/// EAGAIN/EOF. The transfer happens while the candidate is still alive
+/// (its `AVHWFramesContext` is reachable); the resulting CPU frames remain
+/// valid after the candidate is committed because they hold their own
+/// buffer references with no dependency on the original device context.
+fn drain_into_pending(
+  decoder: &mut ffmpeg_next::decoder::Video,
+  hw_buf: &mut frame::Video,
+  pending: &mut VecDeque<frame::Video>,
+) -> std::result::Result<(), ffmpeg_next::Error> {
+  loop {
+    match decoder.receive_frame(hw_buf) {
+      Ok(()) => {
+        let mut cpu = frame::Video::empty();
+        // SAFETY: hw_buf is a freshly-decoded HW frame; av_hwframe_transfer_data
+        // allocates buffers on `cpu`. copy_props moves timing/side data over.
+        unsafe {
+          let r1 = av_hwframe_transfer_data(cpu.as_mut_ptr(), hw_buf.as_ptr(), 0);
+          if r1 < 0 {
+            return Err(ffmpeg_next::Error::from(r1));
+          }
+          let r2 = av_frame_copy_props(cpu.as_mut_ptr(), hw_buf.as_ptr());
+          if r2 < 0 {
+            return Err(ffmpeg_next::Error::from(r2));
+          }
+        }
+        pending.push_back(cpu);
+      }
+      Err(e) if is_transient(&e) => return Ok(()),
+      Err(e) => return Err(e),
+    }
+  }
 }
 
 #[allow(dead_code)]
@@ -500,7 +683,7 @@ mod tests {
 
   #[test]
   fn no_codec_for_unknown_id() {
-    let err = Error::NoCodec(codec::Id::None);
+    let err = Error::NoCodec(0);
     assert!(format!("{err}").contains("no decoder"));
   }
 
diff --git a/src/error.rs b/src/error.rs
index ef5373c..955d215 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -10,9 +10,12 @@ pub enum Error {
   #[error("ffmpeg error: {0}")]
   Ffmpeg(#[from] ffmpeg_next::Error),
 
-  /// `avcodec_find_decoder` returned null for the input codec id.
-  #[error("no decoder for codec id {0:?}")]
-  NoCodec(ffmpeg_next::codec::Id),
+  /// `avcodec_find_decoder` returned null for the input codec id. The id
+  /// is reported as the raw integer (`AVCodecID` discriminant) — we do not
+  /// construct the bindgen `AVCodecID` enum from a runtime value, since
+  /// values outside our build's discriminant set would invoke UB.
+  #[error("no decoder for codec id {0}")]
+  NoCodec(u32),
 
   /// The codec does not advertise a hardware configuration matching the
   /// requested backend (via `avcodec_get_hw_config`).
diff --git a/src/frame.rs b/src/frame.rs
index d6ceca6..15e903c 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -82,7 +82,9 @@ impl Frame {
   }
 
   /// Bytes per row for `plane`. Reads `AVFrame.linesize[plane]` directly.
-  /// Panics if `plane >= planes()`.
+  /// Panics if `plane >= planes()` or the linesize is non-positive (FFmpeg
+  /// allows negative linesize for vertically-flipped formats; this crate
+  /// does not surface those — call [`Self::data`] first to test safely).
   pub fn stride(&self, plane: usize) -> usize {
     let n = self.planes();
     assert!(
@@ -90,16 +92,29 @@ impl Frame {
       "stride: plane {plane} out of bounds (planes={n})"
     );
     // SAFETY: bounds-checked above; linesize is `[c_int; 8]`.
-    unsafe { (*self.inner.as_ptr()).linesize[plane] as usize }
+    let linesize: i32 = unsafe { (*self.inner.as_ptr()).linesize[plane] };
+    assert!(
+      linesize > 0,
+      "stride: non-positive linesize {linesize} for plane {plane} \
+       (negative linesize means vertically-flipped — not supported)"
+    );
+    linesize as usize
   }
 
   /// Pixel data for `plane`.
   ///
-  /// Returns `None` when the frame's pixel format is not one of the
-  /// hardware-output formats listed in [`crate::pix_fmt`] — we cannot
-  /// safely compute the plane size for an unknown layout. Returns `None`
-  /// for an out-of-bounds plane index, a null data pointer, or an empty
-  /// frame.
+  /// Returns `None` for any of the following — never panics:
+  /// - The frame's pixel format is not one of the hardware-output formats
+  ///   listed in [`crate::pix_fmt`] (we cannot safely compute the plane
+  ///   size for an unknown layout).
+  /// - The plane index is out of range.
+  /// - `AVFrame.linesize[plane]` is `<= 0` (negative linesize signals
+  ///   vertically-flipped FFmpeg layouts which we do not surface; zero is
+  ///   "no plane").
+  /// - `AVFrame.height` is `<= 0`.
+  /// - The computed slice length would overflow or exceed `isize::MAX`
+  ///   (a precondition of [`std::slice::from_raw_parts`]).
+  /// - The plane's data pointer is null.
   ///
   /// Currently supported (post-`av_hwframe_transfer_data`):
   /// - 4:2:0 semi-planar 8-bit: `NV12`, `NV21`
@@ -112,12 +127,26 @@ impl Frame {
     if plane >= self.planes() {
       return None;
     }
-    let stride = self.stride(plane);
-    let plane_height = plane_height_for(self.pix_fmt(), plane, self.height() as usize)?;
+
+    // SAFETY: bounds-checked plane index; `linesize` and `height` are
+    // primitive c_int reads that cannot themselves be UB.
+    let linesize: i32 = unsafe { (*self.inner.as_ptr()).linesize[plane] };
+    let height_int: i32 = unsafe { (*self.inner.as_ptr()).height };
+    if linesize <= 0 || height_int <= 0 {
+      return None;
+    }
+    let stride = linesize as usize;
+
+    let plane_height = plane_height_for(self.pix_fmt(), plane, height_int as usize)?;
     let len = stride.checked_mul(plane_height)?;
-    // SAFETY: bounds-checked plane index above. We trust FFmpeg to populate
-    // `data[plane]` validly when `linesize[plane]` is non-zero (which we
-    // verified via `planes()`); null-check guards against edge cases.
+    if len > isize::MAX as usize {
+      return None;
+    }
+
+    // SAFETY: linesize > 0 and height > 0 verified; len <= isize::MAX
+    // verified — both preconditions of `slice::from_raw_parts`. We trust
+    // FFmpeg to populate `data[plane]` validly when linesize[plane] is
+    // non-zero; the null check is a final defensive guard.
     unsafe {
       let ptr = (*self.inner.as_ptr()).data[plane];
       if ptr.is_null() {
@@ -193,6 +222,52 @@ mod tests {
     assert!(f.data(0).is_none());
   }
 
+  /// Synthesize a frame with a negative linesize (FFmpeg's vertical-flip
+  /// convention) and assert `data()` refuses to construct a slice. Without
+  /// the linesize > 0 check, the negative `i32 as usize` would produce a
+  /// huge positive length and `from_raw_parts` would be UB.
+  #[test]
+  fn data_returns_none_for_negative_linesize() {
+    let mut f = Frame::empty();
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).format = pix_fmt::NV12;
+      (*raw).width = 1920;
+      (*raw).height = 1080;
+      (*raw).linesize[0] = -1920; // vertically-flipped
+      (*raw).linesize[1] = -1920;
+      // data pointers stay null; `data()` would return None on the null
+      // check anyway, but should bail earlier on the linesize sign.
+    }
+    assert!(f.data(0).is_none());
+    assert!(f.data(1).is_none());
+  }
+
+  #[test]
+  fn data_returns_none_for_non_positive_height() {
+    let mut f = Frame::empty();
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).format = pix_fmt::NV12;
+      (*raw).width = 1920;
+      (*raw).height = 0;
+      (*raw).linesize[0] = 1920;
+      (*raw).linesize[1] = 1920;
+    }
+    assert!(f.data(0).is_none());
+  }
+
+  #[test]
+  #[should_panic(expected = "non-positive linesize")]
+  fn stride_panics_on_negative_linesize() {
+    let mut f = Frame::empty();
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).linesize[0] = -1920;
+    }
+    let _ = f.stride(0);
+  }
+
   #[test]
   fn frame_is_send() {
     fn check<T: Send>() {}

From b48e5329253cdf4f6ae6740c950dd4486e0e898a Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 18:37:37 +1200
Subject: [PATCH 07/27] update

---
 src/decoder.rs | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index fcaea44..0bd638e 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -313,9 +313,14 @@ impl VideoDecoder {
       let res = self.state.inner.receive_frame(&mut self.hw_frame);
       match res {
         Err(e) => {
-          if is_transient(&e) {
+          // EAGAIN is normal backpressure — pass through unconditionally.
+          if is_eagain(&e) {
             return Err(Error::Ffmpeg(e));
           }
+          // EOF (and every other non-transient error): if we are still
+          // probing, treat it as candidate failure — a backend that drains
+          // to EOF without ever producing a frame should not silently
+          // present as "stream over" to the caller. Advance and retry.
           if self.probe.is_some() && self.advance_probe()? {
             // Probe advance may have populated `pending_frames`; deliver
             // one of those before reading more from the new candidate.
@@ -324,6 +329,8 @@ impl VideoDecoder {
             }
             continue;
           }
+          // Probe collapsed or exhausted — surface the error (including EOF
+          // for a genuinely empty stream).
           return Err(Error::Ffmpeg(e));
         }
         Ok(()) => {
@@ -372,10 +379,20 @@ impl VideoDecoder {
     true
   }
 
-  /// Flush internal buffers (e.g. after a seek). Resets probe-time buffer if
-  /// active, since post-seek packets do not align with replayed history.
+  /// Flush internal buffers (e.g. after a seek).
+  ///
+  /// Discards every frame buffered by the decoder, every frame queued during
+  /// probe replay (`pending_frames`), and the residual `hw_frame` scratch
+  /// buffer. Probe-time replay state (buffered packets, EOF marker) is also
+  /// cleared since post-seek packets do not align with the previously
+  /// captured history. After a flush, the next `receive_frame` waits for new
+  /// post-seek input.
   pub fn flush(&mut self) {
     self.state.inner.flush();
+    // SAFETY: hw_frame is a valid AVFrame we own; av_frame_unref is a no-op
+    // for an already-empty frame.
+    unsafe { av_frame_unref(self.hw_frame.as_mut_ptr()) };
+    self.pending_frames.clear();
     if let Some(probe) = self.probe.as_mut() {
       probe.buffered_packets.clear();
       probe.eof_sent = false;
@@ -540,9 +557,28 @@ impl VideoDecoder {
     // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
     // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
     // use (we keep our own ref in `hw_device_ref` for cleanup).
+    // av_buffer_ref returns NULL on allocation failure; we must check it
+    // before assigning, otherwise the codec context would be opened with a
+    // HW-flagged setup but no actual device reference.
+    let device_ref_for_ctx = unsafe { av_buffer_ref(hw_device_ref) };
+    if device_ref_for_ctx.is_null() {
+      // SAFETY: rolling back what we just allocated above. hw_device_ref
+      // is non-null (we checked after av_hwdevice_ctx_create); callback_state
+      // was just freshly Box::into_raw'd.
+      unsafe {
+        let mut hw = hw_device_ref;
+        av_buffer_unref(&mut hw);
+        drop(Box::from_raw(callback_state));
+      }
+      return Err(Error::Ffmpeg(ffmpeg_next::Error::Other {
+        errno: libc::ENOMEM,
+      }));
+    }
+    // SAFETY: device_ref_for_ctx is a valid AVBufferRef* from av_buffer_ref;
+    // ctx is freshly built and owned by us.
     unsafe {
       let raw = ctx.as_mut_ptr();
-      (*raw).hw_device_ctx = av_buffer_ref(hw_device_ref);
+      (*raw).hw_device_ctx = device_ref_for_ctx;
       (*raw).opaque = callback_state.cast();
       (*raw).get_format = Some(get_hw_format);
     }

From 4dc0be9ed900f0d0ca2477ebdf052dddc5110a8a Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 18:52:46 +1200
Subject: [PATCH 08/27] update

---
 src/decoder.rs | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index 0bd638e..542b36f 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -79,6 +79,20 @@ struct DecoderState {
   callback_state: *mut CallbackState,
 }
 
+/// Maximum number of packets we are willing to buffer for probe replay
+/// before abandoning the fallback safety net. Set high enough to absorb
+/// long B-frame GOPs and codec setup latency, low enough to bound memory
+/// against malicious / pathological streams that never produce a first
+/// frame.
+const MAX_PROBE_PACKETS: usize = 256;
+
+/// Maximum total compressed-byte size of buffered probe packets. Each
+/// `Packet` clone holds a refcounted reference to the demuxer's bitstream
+/// data — even though the clone itself is shallow, the underlying buffers
+/// stay alive until we drop them. 64 MiB is generous for normal video and
+/// gives untrusted media a hard ceiling.
+const MAX_PROBE_PACKET_BYTES: usize = 64 * 1024 * 1024;
+
 /// State carried only during the probe window (before the first successful
 /// frame). Holds enough information to tear down the current decoder and
 /// retry with the next backend.
@@ -90,8 +104,12 @@ struct ProbeState {
   remaining_backends: Vec<Backend>,
   /// Packets sent so far, kept for replay through any candidate backend.
   /// Preserved across failed candidates — only cleared when the probe
-  /// collapses on a successful first frame.
+  /// collapses on a successful first frame, or when the probe is
+  /// abandoned due to the size caps.
   buffered_packets: Vec<Packet>,
+  /// Cumulative size (in compressed bytes) of `buffered_packets`. Tracked
+  /// incrementally so we don't have to re-sum on every send.
+  buffered_bytes: usize,
   /// Whether `send_eof` has been called; replayed alongside packets.
   eof_sent: bool,
 }
@@ -155,6 +173,7 @@ impl VideoDecoder {
             codec,
             remaining_backends: remaining,
             buffered_packets: Vec::new(),
+            buffered_bytes: 0,
             eof_sent: false,
           });
           return Ok(Self {
@@ -231,12 +250,37 @@ impl VideoDecoder {
   /// probe to the next candidate and retries the packet there. The caller
   /// observes only the eventual success or, if the probe is exhausted, the
   /// final error.
+  ///
+  /// If the probe window grows beyond [`MAX_PROBE_PACKETS`] or
+  /// [`MAX_PROBE_PACKET_BYTES`] without producing a first frame (a stream
+  /// the active backend is silently mishandling, or pathological input),
+  /// the probe is **abandoned**: replay history is dropped, queued frames
+  /// are cleared, and `self.probe = None`. The active backend continues
+  /// serving the caller without fallback. A `tracing::warn!` records this
+  /// so it is visible in production logs.
   pub fn send_packet(&mut self, packet: &Packet) -> Result<()> {
     loop {
       match self.state.inner.send_packet(packet) {
         Ok(()) => {
           if let Some(probe) = self.probe.as_mut() {
-            probe.buffered_packets.push(packet.clone());
+            let pkt_size = packet.size();
+            let new_count = probe.buffered_packets.len() + 1;
+            let new_bytes = probe.buffered_bytes.saturating_add(pkt_size);
+            if new_count > MAX_PROBE_PACKETS || new_bytes > MAX_PROBE_PACKET_BYTES {
+              tracing::warn!(
+                packets = new_count,
+                bytes = new_bytes,
+                max_packets = MAX_PROBE_PACKETS,
+                max_bytes = MAX_PROBE_PACKET_BYTES,
+                "hwdecode: probe window exceeded caps without first frame; \
+                 abandoning fallback safety net"
+              );
+              self.probe = None;
+              self.pending_frames.clear();
+            } else {
+              probe.buffered_packets.push(packet.clone());
+              probe.buffered_bytes = new_bytes;
+            }
           }
           return Ok(());
         }
@@ -395,6 +439,7 @@ impl VideoDecoder {
     self.pending_frames.clear();
     if let Some(probe) = self.probe.as_mut() {
       probe.buffered_packets.clear();
+      probe.buffered_bytes = 0;
       probe.eof_sent = false;
     }
   }

From 964e9e7c571269bce792094fa09bf566e1d4e32d Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 21:35:50 +1200
Subject: [PATCH 09/27] update

---
 README.md      |   4 +-
 docs/design.md |   4 +-
 src/decoder.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++++--
 src/frame.rs   |  10 ++++-
 4 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bcfb058..007eac3 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,9 @@ unconditionally.
 
 ## Build requirements
 
-- A system FFmpeg ≥ 4.x linkable via `pkg-config`. Verify with
+- A system FFmpeg ≥ **5.1** linkable via `pkg-config` (we reference
+  `AV_PIX_FMT_P212LE` / `AV_PIX_FMT_P412LE`, which were added in 5.1).
+  Tested against 8.1. Verify with
   `ffmpeg -hwaccels` that your build has the backends you expect compiled in
   (e.g. `videotoolbox` on macOS, `vaapi` / `cuda` on Linux,
   `d3d11va` / `cuda` on Windows).
diff --git a/docs/design.md b/docs/design.md
index 056bb4f..6acc8c5 100644
--- a/docs/design.md
+++ b/docs/design.md
@@ -130,7 +130,9 @@ No other modules. Keep the surface small.
 
 No platform-specific Cargo features. `cfg!(target_os = ...)` selects which `AVHWDeviceType` constants we even attempt — the FFI symbols are linked unconditionally via `ffmpeg-sys-next`.
 
-System FFmpeg ≥ 4.x. Verified against the user's macOS Homebrew build (FFmpeg 8.1, VideoToolbox enabled).
+System FFmpeg ≥ **5.1** (we reference `AV_PIX_FMT_P212LE` / `AV_PIX_FMT_P412LE`,
+added upstream in 5.1). Verified against the macOS Homebrew build (FFmpeg 8.1,
+VideoToolbox enabled).
 
 ## Testing
 
diff --git a/src/decoder.rs b/src/decoder.rs
index 542b36f..3a8067e 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -62,6 +62,10 @@ pub struct VideoDecoder {
   /// candidate state is committed. [`Self::receive_frame`] dequeues these
   /// FIFO before reading from `state.inner`.
   pending_frames: VecDeque<frame::Video>,
+  /// Per-decoder byte budget for [`Self::pending_frames`] during probe
+  /// replay. Defaults to [`DEFAULT_MAX_PROBE_PENDING_BYTES`]; override via
+  /// [`Self::with_max_probe_pending_bytes`].
+  max_probe_pending_bytes: usize,
 }
 
 /// Owned FFmpeg state for one open codec context. Has its own `Drop` so we
@@ -93,6 +97,30 @@ const MAX_PROBE_PACKETS: usize = 256;
 /// gives untrusted media a hard ceiling.
 const MAX_PROBE_PACKET_BYTES: usize = 64 * 1024 * 1024;
 
+/// Maximum number of CPU frames we are willing to queue from a candidate
+/// during probe replay. Each frame is a fully-allocated CPU buffer
+/// (~3 MiB for 1080p NV12, ~24 MiB for 4K P010, ~96 MiB for 8K P010), so
+/// an unbounded queue would OOM on a candidate with a shallow internal
+/// queue against a deep replay history. Drained candidate frames in
+/// excess of this cap (or the per-decoder byte budget, default
+/// [`DEFAULT_MAX_PROBE_PENDING_BYTES`], whichever hits first) are dropped
+/// with a `tracing::warn!`; we keep draining so `send_packet` can proceed.
+const MAX_PROBE_PENDING_FRAMES: usize = 16;
+
+/// Default byte budget for probe-replay drained frames. 256 MiB holds about
+/// ten 4K P010 frames (~24 MiB each; the 16-frame count cap alone would
+/// allow ~384 MiB), so from 4K P010 upwards the byte cap fires before the
+/// count cap (8K P010: ~96 MiB per frame → only ~2 frames fit).
+///
+/// Override per-decoder with [`VideoDecoder::with_max_probe_pending_bytes`]
+/// when targeting 8K+ workloads or memory-constrained environments.
+///
+/// TODO: when frames significantly exceed typical sizes, consider
+/// memmap-backed pending buffers (write transferred frames to a temp file
+/// or shared-memory segment) so the resident set stays bounded even when
+/// the byte cap is raised. Out of scope for v0.0.0.
+pub const DEFAULT_MAX_PROBE_PENDING_BYTES: usize = 256 * 1024 * 1024;
+
 /// State carried only during the probe window (before the first successful
 /// frame). Holds enough information to tear down the current decoder and
 /// retry with the next backend.
@@ -181,6 +209,7 @@ impl VideoDecoder {
             hw_frame: frame::Video::empty(),
             probe,
             pending_frames: VecDeque::new(),
+            max_probe_pending_bytes: DEFAULT_MAX_PROBE_PENDING_BYTES,
           });
         }
         Err(e) => {
@@ -207,9 +236,30 @@ impl VideoDecoder {
       hw_frame: frame::Video::empty(),
       probe: None,
       pending_frames: VecDeque::new(),
+      max_probe_pending_bytes: DEFAULT_MAX_PROBE_PENDING_BYTES,
     })
   }
 
+  /// Override the byte budget for probe-replay queued frames. Defaults to
+  /// [`DEFAULT_MAX_PROBE_PENDING_BYTES`]. Use a higher value when targeting
+  /// 8K+ workloads where 16 frames at full size could exceed the default;
+  /// use a lower value in memory-constrained services to bound peak
+  /// allocation more tightly.
+  ///
+  /// Setting after the first frame has been delivered is harmless but has
+  /// no observable effect — the probe has already collapsed and the cap
+  /// only applies during replay drain.
+  ///
+  /// Returns `self` for builder-style chaining:
+  /// ```ignore
+  /// let decoder = VideoDecoder::open(params)?
+  ///     .with_max_probe_pending_bytes(1024 * 1024 * 1024); // 1 GiB
+  /// ```
+  pub fn with_max_probe_pending_bytes(mut self, bytes: usize) -> Self {
+    self.max_probe_pending_bytes = bytes;
+    self
+  }
+
   /// The backend currently producing frames. While the probe is still in
   /// progress (no frame received yet) this returns the optimistically
   /// selected backend; after the first frame, it is the backend that
@@ -498,6 +548,8 @@ impl VideoDecoder {
       // commit they move to `self.pending_frames` and are delivered FIFO
       // by `receive_frame`, so the caller never loses initial frames.
       let mut local_pending: VecDeque<frame::Video> = VecDeque::new();
+      let mut local_pending_bytes: usize = 0;
+      let max_pending_bytes = self.max_probe_pending_bytes;
       let replay_result: std::result::Result<(), ffmpeg_next::Error> = {
         let probe = self.probe.as_ref().expect("probe state present");
         let mut hw_buf = frame::Video::empty();
@@ -510,9 +562,13 @@ impl VideoDecoder {
               Err(e) if is_eagain(&e) => {
                 // Drain candidate output (transferring + queueing each frame)
                 // and retry the same packet.
-                if let Err(de) =
-                  drain_into_pending(&mut candidate_state.inner, &mut hw_buf, &mut local_pending)
-                {
+                if let Err(de) = drain_into_pending(
+                  &mut candidate_state.inner,
+                  &mut hw_buf,
+                  &mut local_pending,
+                  &mut local_pending_bytes,
+                  max_pending_bytes,
+                ) {
                   r = Err(de);
                   break 'replay;
                 }
@@ -727,10 +783,33 @@ fn drain_into_pending(
   decoder: &mut ffmpeg_next::decoder::Video,
   hw_buf: &mut frame::Video,
   pending: &mut VecDeque<frame::Video>,
+  pending_bytes: &mut usize,
+  max_bytes: usize,
 ) -> std::result::Result<(), ffmpeg_next::Error> {
   loop {
     match decoder.receive_frame(hw_buf) {
       Ok(()) => {
+        // Either cap (count or bytes) closes the queue. We still drain so
+        // `send_packet` can resume on the next iteration; we just stop
+        // accumulating.
+        //
+        // TODO: at very large frame sizes (8K HDR P010, > ~96 MiB each)
+        // even a single retained frame is significant. Future direction:
+        // memmap-backed pending frames (write to a temp file or shared
+        // memory segment) so the resident set stays bounded even when the
+        // byte cap is raised. Out of scope for v0.0.0.
+        if pending.len() >= MAX_PROBE_PENDING_FRAMES || *pending_bytes >= max_bytes {
+          tracing::warn!(
+            frames = pending.len(),
+            bytes = *pending_bytes,
+            max_frames = MAX_PROBE_PENDING_FRAMES,
+            max_bytes = max_bytes,
+            "hwdecode: probe pending cap reached; discarding drained candidate frame"
+          );
+          // SAFETY: hw_buf is owned and valid; unref of an empty frame is a no-op.
+          unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+          continue;
+        }
         let mut cpu = frame::Video::empty();
         // SAFETY: hw_buf is a freshly-decoded HW frame; av_hwframe_transfer_data
         // allocates buffers on `cpu`. copy_props moves timing/side data over.
@@ -744,6 +823,7 @@ fn drain_into_pending(
             return Err(ffmpeg_next::Error::from(r2));
           }
         }
+        *pending_bytes = pending_bytes.saturating_add(cpu_frame_bytes(&cpu));
         pending.push_back(cpu);
       }
       Err(e) if is_transient(&e) => return Ok(()),
@@ -752,6 +832,32 @@ fn drain_into_pending(
   }
 }
 
+/// Approximate resident size of a CPU frame: sum of `linesize[plane] *
+/// plane_height` across populated planes. Returns 0 for unknown formats
+/// (we under-count rather than over-count, on the principle that under-
+/// counting only delays the cap firing, while over-counting could starve
+/// legitimate streams).
+fn cpu_frame_bytes(frame: &frame::Video) -> usize {
+  // SAFETY: AVFrame.height / format / linesize are c_int reads.
+  let (height, pix_fmt, linesizes) = unsafe {
+    let raw = frame.as_ptr();
+    ((*raw).height as usize, (*raw).format, (*raw).linesize)
+  };
+  let mut total: usize = 0;
+  for (plane, linesize) in linesizes.iter().enumerate() {
+    if *linesize <= 0 {
+      break;
+    }
+    let stride = *linesize as usize;
+    let Some(plane_h) = crate::frame::plane_height_for(pix_fmt, plane, height) else {
+      // Unknown format / unsupported plane index — bail out, accept under-count.
+      break;
+    };
+    total = total.saturating_add(stride.saturating_mul(plane_h));
+  }
+  total
+}
+
 #[allow(dead_code)]
 fn _assert_send() {
   fn check<T: Send>() {}
diff --git a/src/frame.rs b/src/frame.rs
index 15e903c..22f7783 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -170,7 +170,15 @@ impl Default for Frame {
 
 /// Number of rows in `plane` for a frame of `frame_height` and the given
 /// pixel format. `None` for formats not in the supported HW-output set.
-fn plane_height_for(pix_fmt_int: i32, plane: usize, frame_height: usize) -> Option<usize> {
+///
+/// Crate-internal so the decoder's probe-replay accountant can compute
+/// per-frame byte sizes without re-implementing the chroma-subsampling
+/// table.
+pub(crate) fn plane_height_for(
+  pix_fmt_int: i32,
+  plane: usize,
+  frame_height: usize,
+) -> Option<usize> {
   match pix_fmt_int {
     // 4:2:0 semi-planar — Y full height, chroma half height.
     pix_fmt::NV12

From d2d96a8b00d99aaa4c1f249477163eb47f5c814f Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 21:59:19 +1200
Subject: [PATCH 10/27] update

---
 benches/decode.rs  |  4 +--
 examples/decode.rs |  2 +-
 src/decoder.rs     | 76 ++++++++++++++++++++++++++++++++++++----------
 src/frame.rs       | 39 +++++++++++++++---------
 tests/decode.rs    |  2 +-
 tests/hw_smoke.rs  |  2 +-
 6 files changed, 90 insertions(+), 35 deletions(-)

diff --git a/benches/decode.rs b/benches/decode.rs
index 5f53a66..9e53f0a 100644
--- a/benches/decode.rs
+++ b/benches/decode.rs
@@ -32,7 +32,7 @@ fn decode_all_hw(path: &PathBuf) -> Result<usize, hwdecode::Error> {
   let stream_index = stream.index();
 
   let mut decoder = VideoDecoder::open(stream.parameters())?;
-  let mut frame = Frame::empty();
+  let mut frame = Frame::empty()?;
   let mut count = 0_usize;
 
   let mut drain = |decoder: &mut VideoDecoder, count: &mut usize| -> Result<(), hwdecode::Error> {
@@ -122,7 +122,7 @@ fn bench_decode(c: &mut Criterion) {
     let stream_index = stream.index();
     match VideoDecoder::open(stream.parameters()) {
       Ok(mut dec) => {
-        let mut frame = Frame::empty();
+        let mut frame = Frame::empty().expect("alloc probe frame");
         'probe: for (s, packet) in input.packets() {
           if s.index() != stream_index {
             continue;
diff --git a/examples/decode.rs b/examples/decode.rs
index a1439d7..1d14de1 100644
--- a/examples/decode.rs
+++ b/examples/decode.rs
@@ -44,7 +44,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     decoder.height(),
   );
 
-  let mut frame = Frame::empty();
+  let mut frame = Frame::empty()?;
   let mut count: u64 = 0;
 
   let drain = |decoder: &mut VideoDecoder, frame: &mut Frame, count: &mut u64| loop {
diff --git a/src/decoder.rs b/src/decoder.rs
index 3a8067e..a7739c2 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -206,7 +206,7 @@ impl VideoDecoder {
           });
           return Ok(Self {
             state,
-            hw_frame: frame::Video::empty(),
+            hw_frame: alloc_av_frame().map_err(Error::Ffmpeg)?,
             probe,
             pending_frames: VecDeque::new(),
             max_probe_pending_bytes: DEFAULT_MAX_PROBE_PENDING_BYTES,
@@ -233,7 +233,7 @@ impl VideoDecoder {
     let state = Self::build_state(parameters, codec, backend)?;
     Ok(Self {
       state,
-      hw_frame: frame::Video::empty(),
+      hw_frame: alloc_av_frame().map_err(Error::Ffmpeg)?,
       probe: None,
       pending_frames: VecDeque::new(),
       max_probe_pending_bytes: DEFAULT_MAX_PROBE_PENDING_BYTES,
@@ -552,7 +552,10 @@ impl VideoDecoder {
       let max_pending_bytes = self.max_probe_pending_bytes;
       let replay_result: std::result::Result<(), ffmpeg_next::Error> = {
         let probe = self.probe.as_ref().expect("probe state present");
-        let mut hw_buf = frame::Video::empty();
+        let mut hw_buf = match alloc_av_frame() {
+          Ok(f) => f,
+          Err(e) => return Err(Error::Ffmpeg(e)),
+        };
         let mut r: std::result::Result<(), ffmpeg_next::Error> = Ok(());
 
         'replay: for pkt in &probe.buffered_packets {
@@ -740,6 +743,21 @@ fn is_transient(e: &ffmpeg_next::Error) -> bool {
   is_eagain(e) || matches!(e, ffmpeg_next::Error::Eof)
 }
 
+/// Allocate a fresh `frame::Video`, checking that `av_frame_alloc` did not
+/// return NULL. ffmpeg-next's `frame::Video::empty()` does not surface that
+/// failure and the resulting null pointer would be UB on the next field
+/// access; this wrapper catches it and surfaces it as `ENOMEM`.
+fn alloc_av_frame() -> std::result::Result<frame::Video, ffmpeg_next::Error> {
+  let inner = frame::Video::empty();
+  // SAFETY: as_ptr() just exposes the inner pointer for inspection.
+  if unsafe { inner.as_ptr() }.is_null() {
+    return Err(ffmpeg_next::Error::Other {
+      errno: libc::ENOMEM,
+    });
+  }
+  Ok(inner)
+}
+
 /// Just `EAGAIN` (separate from EOF — the FFmpeg send/receive state machine
 /// distinguishes "drain output and retry" from "stream over").
 fn is_eagain(e: &ffmpeg_next::Error) -> bool {
@@ -810,7 +828,7 @@ fn drain_into_pending(
           unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
           continue;
         }
-        let mut cpu = frame::Video::empty();
+        let mut cpu = alloc_av_frame()?;
         // SAFETY: hw_buf is a freshly-decoded HW frame; av_hwframe_transfer_data
         // allocates buffers on `cpu`. copy_props moves timing/side data over.
         unsafe {
@@ -823,8 +841,26 @@ fn drain_into_pending(
             return Err(ffmpeg_next::Error::from(r2));
           }
         }
-        *pending_bytes = pending_bytes.saturating_add(cpu_frame_bytes(&cpu));
-        pending.push_back(cpu);
+        // Conservative byte-cap accounting: if we can't size this frame
+        // (unknown CPU pix_fmt — should not happen with strict get_format,
+        // but a misbehaving codec could surface one), discard rather than
+        // queue an unaccounted-for allocation. Never push something whose
+        // size we can't deduct from the budget.
+        match cpu_frame_bytes(&cpu) {
+          Some(bytes) => {
+            *pending_bytes = pending_bytes.saturating_add(bytes);
+            pending.push_back(cpu);
+          }
+          None => {
+            // SAFETY: AVFrame.format is c_int, safe to read.
+            let pix_fmt: i32 = unsafe { (*cpu.as_ptr()).format };
+            tracing::warn!(
+              pix_fmt,
+              "hwdecode: cannot size unknown CPU pix_fmt during replay; discarding drained frame"
+            );
+            // cpu drops here, freeing its buffers via Frame::drop.
+          }
+        }
       }
       Err(e) if is_transient(&e) => return Ok(()),
       Err(e) => return Err(e),
@@ -833,29 +869,37 @@ fn drain_into_pending(
 }
 
 /// Approximate resident size of a CPU frame: sum of `linesize[plane] *
-/// plane_height` across populated planes. Returns 0 for unknown formats
-/// (we under-count rather than over-count, on the principle that under-
-/// counting only delays the cap firing, while over-counting could starve
-/// legitimate streams).
-fn cpu_frame_bytes(frame: &frame::Video) -> usize {
+/// plane_height` across populated planes.
+///
+/// Returns `None` for pixel formats not in our chroma-subsampling table,
+/// so the caller can refuse to queue an allocation it can't account for.
+/// Returning 0 for unknown formats would silently bypass the byte cap and
+/// let an unbounded number of large frames into `pending_frames`.
+fn cpu_frame_bytes(frame: &frame::Video) -> Option<usize> {
   // SAFETY: AVFrame.height / format / linesize are c_int reads.
   let (height, pix_fmt, linesizes) = unsafe {
     let raw = frame.as_ptr();
     ((*raw).height as usize, (*raw).format, (*raw).linesize)
   };
   let mut total: usize = 0;
+  let mut any_plane = false;
   for (plane, linesize) in linesizes.iter().enumerate() {
     if *linesize <= 0 {
       break;
     }
+    any_plane = true;
     let stride = *linesize as usize;
-    let Some(plane_h) = crate::frame::plane_height_for(pix_fmt, plane, height) else {
-      // Unknown format / unsupported plane index — bail out, accept under-count.
-      break;
-    };
+    // If we can't size *any* populated plane, the format is outside our
+    // table — refuse to size the frame at all (conservative; discarding
+    // is safer than under-counting against the byte cap).
+    let plane_h = crate::frame::plane_height_for(pix_fmt, plane, height)?;
     total = total.saturating_add(stride.saturating_mul(plane_h));
   }
-  total
+  if !any_plane {
+    // Genuinely empty frame (no populated planes) — nothing to account for.
+    return Some(0);
+  }
+  Some(total)
 }
 
 #[allow(dead_code)]
diff --git a/src/frame.rs b/src/frame.rs
index 22f7783..65ec63d 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -20,7 +20,10 @@ use std::slice;
 
 use ffmpeg_next::frame;
 
-use crate::pix_fmt;
+use crate::{
+  error::{Error, Result},
+  pix_fmt,
+};
 
 /// CPU-side decoded video frame produced by [`crate::VideoDecoder`].
 pub struct Frame {
@@ -30,10 +33,20 @@ pub struct Frame {
 impl Frame {
   /// Construct an empty frame, suitable as the destination passed to
   /// [`crate::VideoDecoder::receive_frame`].
-  pub fn empty() -> Self {
-    Self {
-      inner: frame::Video::empty(),
+  ///
+  /// Returns `Err(Error::Ffmpeg(Other { errno: ENOMEM }))` when the
+  /// underlying `av_frame_alloc()` returns NULL — `ffmpeg_next` does not
+  /// surface that failure, so we check it here rather than letting a null
+  /// pointer flow into the safe accessors and become UB on first read.
+  pub fn empty() -> Result<Self> {
+    // SAFETY: as_ptr() is safe; we just inspect the value (potentially null).
+    let inner = frame::Video::empty();
+    if unsafe { inner.as_ptr() }.is_null() {
+      return Err(Error::Ffmpeg(ffmpeg_next::Error::Other {
+        errno: libc::ENOMEM,
+      }));
     }
+    Ok(Self { inner })
   }
 
   /// Width in pixels.
@@ -162,11 +175,9 @@ impl Frame {
   }
 }
 
-impl Default for Frame {
-  fn default() -> Self {
-    Self::empty()
-  }
-}
+// `Default` intentionally omitted: constructing a frame can fail (OOM
+// in `av_frame_alloc`), and a panicking `default()` would defeat the
+// safety stance of [`Frame::empty`]. Use `Frame::empty()?` directly.
 
 /// Number of rows in `plane` for a frame of `frame_height` and the given
 /// pixel format. `None` for formats not in the supported HW-output set.
@@ -213,7 +224,7 @@ mod tests {
 
   #[test]
   fn empty_frame_has_zero_dimensions_and_no_pts() {
-    let f = Frame::empty();
+    let f = Frame::empty().expect("alloc");
     assert_eq!(f.width(), 0);
     assert_eq!(f.height(), 0);
     assert_eq!(f.pts(), None);
@@ -225,7 +236,7 @@ mod tests {
 
   #[test]
   fn data_returns_none_for_unknown_format() {
-    let f = Frame::empty();
+    let f = Frame::empty().expect("alloc");
     // pix_fmt is NONE (-1), not in the supported set.
     assert!(f.data(0).is_none());
   }
@@ -236,7 +247,7 @@ mod tests {
   /// huge positive length and `from_raw_parts` would be UB.
   #[test]
   fn data_returns_none_for_negative_linesize() {
-    let mut f = Frame::empty();
+    let mut f = Frame::empty().expect("alloc");
     unsafe {
       let raw = f.inner.as_mut_ptr();
       (*raw).format = pix_fmt::NV12;
@@ -253,7 +264,7 @@ mod tests {
 
   #[test]
   fn data_returns_none_for_non_positive_height() {
-    let mut f = Frame::empty();
+    let mut f = Frame::empty().expect("alloc");
     unsafe {
       let raw = f.inner.as_mut_ptr();
       (*raw).format = pix_fmt::NV12;
@@ -268,7 +279,7 @@ mod tests {
   #[test]
   #[should_panic(expected = "non-positive linesize")]
   fn stride_panics_on_negative_linesize() {
-    let mut f = Frame::empty();
+    let mut f = Frame::empty().expect("alloc");
     unsafe {
       let raw = f.inner.as_mut_ptr();
       (*raw).linesize[0] = -1920;
diff --git a/tests/decode.rs b/tests/decode.rs
index 10a8bcb..2431ff1 100644
--- a/tests/decode.rs
+++ b/tests/decode.rs
@@ -44,7 +44,7 @@ fn auto_open_decodes_at_least_one_frame() {
   assert_eq!(decoder.width(), expected_w);
   assert_eq!(decoder.height(), expected_h);
 
-  let mut frame = Frame::empty();
+  let mut frame = Frame::empty().expect("alloc frame");
   let mut count = 0_usize;
   let target = 30_usize;
 
diff --git a/tests/hw_smoke.rs b/tests/hw_smoke.rs
index 6e11765..372c8f7 100644
--- a/tests/hw_smoke.rs
+++ b/tests/hw_smoke.rs
@@ -32,7 +32,7 @@ fn auto_probe_picks_hardware_backend() {
   // backend that actually produced it. Checking `decoder.backend()` before
   // any frame has been received would observe the optimistic pre-probe
   // value and could false-pass when a HW backend silently degrades.
-  let mut frame = Frame::empty();
+  let mut frame = Frame::empty().expect("alloc frame");
   let mut got_frame = false;
   for (s, packet) in input.packets() {
     if s.index() != stream_index {

From e1899e63e1aea2aabb3da6cca5382d40ddf9bb8f Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:17:29 +1200
Subject: [PATCH 11/27] Document hardware-only scope; fail probe replay when pending caps are hit

---
 Cargo.toml     |  2 +-
 README.md      |  8 +++++--
 docs/design.md | 36 ++++++++++++++++++----------
 src/decoder.rs | 64 +++++++++++++++++++++++++++++++++-----------------
 src/lib.rs     | 21 +++++++++--------
 5 files changed, 84 insertions(+), 47 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 8e4ea79..7691656 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ name = "hwdecode"
 version = "0.0.0"
 edition = "2021"
 rust-version = "1.95"
-description = "Cross-platform hardware-accelerated video decoder built on top of ffmpeg-next, with auto-probe and software fallback."
+description = "Cross-platform hardware-only video decoder built on top of ffmpeg-next, with auto-probe across HW backends. Callers handle software fallback."
 repository = "https://github.com/findit-ai/hwdecode"
 homepage = "https://github.com/findit-ai/hwdecode"
 documentation = "https://docs.rs/hwdecode"
diff --git a/README.md b/README.md
index 007eac3..c4e9108 100644
--- a/README.md
+++ b/README.md
@@ -50,13 +50,17 @@ while decoder.receive_frame(&mut frame).is_ok() {
 }
 ```
 
-To force a specific backend (no probe, no fallback):
+To force a specific hardware backend (no probe, no fallback):
 
 ```rust
 use hwdecode::{Backend, VideoDecoder};
-let decoder = VideoDecoder::open_with(parameters, Backend::Software)?;
+let decoder = VideoDecoder::open_with(parameters, Backend::VideoToolbox)?;
 ```
 
+`hwdecode` is hardware-only: there is no `Backend::Software`. If `open`
+returns `Error::AllBackendsFailed`, fall back to a software decoder
+yourself (typically `ffmpeg::decoder::Video`).
+
 ## Running tests and benches
 
 The integration test and benchmark expect a real video file. Set
diff --git a/docs/design.md b/docs/design.md
index 6acc8c5..2c54aee 100644
--- a/docs/design.md
+++ b/docs/design.md
@@ -1,12 +1,20 @@
 # hwdecode — design
 
-Cross-platform hardware-accelerated video decoder built on top of `ffmpeg-next` 8.1.
+Cross-platform **hardware-only** video decoder built on top of `ffmpeg-next` 8.1.
+
+> **Status note.** This document was the original spec from the brainstorm
+> phase and parts have evolved since: the crate is hardware-only (no
+> `Backend::Software`), `Frame` is its own safe wrapper, and several pixel-
+> format / safety details were tightened during review. For the canonical
+> behavior, read `src/lib.rs` and `README.md`. Sections below have been
+> trimmed where they conflicted; the spec is otherwise preserved as
+> historical context.
 
 ## Goals
 
 - Drop-in replacement for `ffmpeg::decoder::Video` at the call site (`send_packet` / `receive_frame` / `send_eof` / `flush`).
-- Auto-probe the platform's hardware backends and silently fall back to software if none open. Caller never has to think about hwaccel availability.
-- Hand back native-format CPU frames (NV12/P010 from the HW path, codec-native from the SW path). Pixel-format conversion is the caller's responsibility (e.g. via `colconv`).
+- Auto-probe the platform's hardware backends. **No software fallback inside this crate** — callers handle that themselves (e.g. via `ffmpeg::decoder::Video`) when `open` returns `Error::AllBackendsFailed`.
+- Hand back native-format CPU frames (NV12/P010 from the HW path post-transfer). Pixel-format conversion is the caller's responsibility (e.g. via `colconv`).
 - Cross-platform: macOS / iOS / iPadOS / tvOS, Linux (Intel/AMD/NVIDIA), Windows (any GPU + CUDA on NVIDIA).
 
 ## Non-goals
@@ -22,8 +30,10 @@ Cross-platform hardware-accelerated video decoder built on top of `ffmpeg-next`
 pub struct VideoDecoder { /* private */ }
 
 impl VideoDecoder {
-    /// Auto-probe HW backends in platform order; fall back to software.
-    /// On success, `backend()` reports the one that won.
+    /// Auto-probe HW backends in platform order. Returns
+    /// `Error::AllBackendsFailed` if no backend can decode this stream;
+    /// caller falls back to software decoder of choice. On success,
+    /// `backend()` reports the one that won.
     pub fn open(parameters: ffmpeg::codec::Parameters) -> Result<Self, Error>;
 
     /// Force a specific backend. No probe, no fallback.
@@ -39,11 +49,11 @@ impl VideoDecoder {
     pub fn send_packet(&mut self, packet: &ffmpeg::Packet) -> Result<(), Error>;
     pub fn send_eof(&mut self) -> Result<(), Error>;
 
-    /// Receive a CPU-side frame. For HW backends, internally calls
+    /// Receive a CPU-side frame. Internally calls
     /// `av_hwframe_transfer_data` and copies PTS/timing onto the result;
-    /// output format is NV12 (8-bit) or P010 (10-bit). For SW, the frame
-    /// is in the codec's native format.
-    pub fn receive_frame(&mut self, frame: &mut ffmpeg::frame::Video) -> Result<(), Error>;
+    /// output format is NV12 (8-bit) or P010 (10-bit) per the HW backend's
+    /// `AVHWFramesContext::sw_format`.
+    pub fn receive_frame(&mut self, frame: &mut Frame) -> Result<(), Error>;
 
     pub fn flush(&mut self);
 }
@@ -89,7 +99,7 @@ Always device 0 / system default (`av_hwdevice_ctx_create(.., NULL, ..)`). No en
 
 ### `get_format` callback
 
-A static `extern "C"` callback. The decoder context's `opaque` field points to a small heap-allocated `CallbackState { wanted: AVPixelFormat }`. The callback walks the offered `pix_fmts` list, returns `wanted` if present, else `AV_PIX_FMT_NONE` (which forces FFmpeg to retry with software). This is the standard pattern from `doc/examples/hw_decode.c`.
+A static `extern "C"` callback. The decoder context's `opaque` field points to a small heap-allocated `CallbackState`. The callback walks the offered `pix_fmts` list as raw `i32` (avoiding bindgen-enum UB on header skew), returns `wanted` if present, else `AV_PIX_FMT_NONE` (which causes the decoder to fail; the caller-side probe loop then tears down and tries the next hardware backend).
 
 ### Frame transfer
 
@@ -138,7 +148,7 @@ VideoToolbox enabled).
 
 1. **Unit tests** (`src/backend.rs`, `src/error.rs`) — pure-Rust: probe-order construction per platform, `Backend` ↔ `AVHWDeviceType` mapping, error formatting.
 2. **Integration** (`tests/decode.rs`) — opens a sample H.264 file via `ffmpeg::format::input`, decodes 30 frames through `VideoDecoder::open` (auto-probe), asserts frame count and dimensions. Sample path comes from env var `HWDECODE_SAMPLE_VIDEO`; test is skipped with a clear `eprintln!` if unset.
-3. **HW smoke** (`tests/hw_smoke.rs`, `#[ignore]`) — same decode, but additionally asserts `decoder.backend() != Backend::Software`. CI runs this on platform-matched runners.
+3. **HW smoke** (`tests/hw_smoke.rs`, `#[ignore]`) — same decode, asserts `decoder.backend()` returns one of the hardware variants (the enum no longer has a Software variant; this is a sanity check against accidental no-op selection). CI runs this on platform-matched runners.
 
 Sample-file env var keeps the repo binary-free. Documented in `README.md`.
 
@@ -146,8 +156,8 @@ Sample-file env var keeps the repo binary-free. Documented in `README.md`.
 
 `benches/decode.rs` (criterion) — two functions:
 
-- `bench_software_decode` — `VideoDecoder::open_with(.., Backend::Software)`, decode all frames of the sample, measure wall-clock per frame.
-- `bench_hardware_decode` — `VideoDecoder::open(..)` (auto-probe). Skipped (`return`) if `decoder.backend() == Backend::Software` (no HW available).
+- `bench_software_decode` — drives `ffmpeg::decoder::Video` directly (this crate has no software backend), decodes all frames, measures wall-clock per frame.
+- `bench_hardware_decode` — `VideoDecoder::open(..)` (auto-probe). Skipped if `open` returns `AllBackendsFailed` (no HW backend available on this host).
 
 Both use the same `HWDECODE_SAMPLE_VIDEO` file. Bench prints which backend the HW run actually used, so results are interpretable across machines.
 
diff --git a/src/decoder.rs b/src/decoder.rs
index a7739c2..cbc995b 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -29,7 +29,13 @@ use crate::{
   frame::Frame,
 };
 
-/// Hardware-accelerated video decoder with software fallback.
+/// Hardware-accelerated video decoder.
+///
+/// Hardware-only — there is no software fallback inside this crate. If
+/// every hardware backend in the platform's probe order fails to open,
+/// `open` returns [`Error::AllBackendsFailed`] and the caller is
+/// responsible for falling back to a software decoder of their choice
+/// (e.g. `ffmpeg::decoder::Video`).
 ///
 /// Mirrors `ffmpeg::decoder::Video`'s `send_packet`/`receive_frame` interface.
 /// Decoded frames are returned through [`crate::Frame`], a CPU-side wrapper
@@ -183,9 +189,9 @@ impl VideoDecoder {
   /// consumed, so a misbehaving middle backend cannot strand the caller.
   ///
   /// [`Self::backend`] reflects whichever backend ultimately produced the
-  /// first frame. Software is the last entry in every probe order, so
-  /// `open` cannot return without a working decoder for any codec libavcodec
-  /// supports.
+  /// first frame. If no hardware backend in the platform's probe order can
+  /// decode this stream, `open` returns [`Error::AllBackendsFailed`];
+  /// callers handle software fallback themselves.
   pub fn open(parameters: codec::Parameters) -> Result<Self> {
     let codec = find_decoder(&parameters)?;
     let order = backend::probe_order();
@@ -223,11 +229,11 @@ impl VideoDecoder {
 
   /// Open the decoder with a specific backend. No probe, no fallback.
   ///
-  /// If `backend` is a hardware backend that the codec can't actually use
-  /// for this stream, the failure surfaces from
-  /// [`Self::receive_frame`] (the strict `get_format` callback returns
+  /// If `backend` cannot actually decode this stream, the failure surfaces
+  /// from [`Self::receive_frame`] (the strict `get_format` callback returns
   /// `AV_PIX_FMT_NONE`, the decoder errors out). The caller is responsible
-  /// for retrying with `Backend::Software` or another backend if desired.
+  /// for retrying with another hardware backend or falling back to a
+  /// software decoder of their choice (e.g. `ffmpeg::decoder::Video`).
   pub fn open_with(parameters: codec::Parameters, backend: Backend) -> Result<Self> {
     let codec = find_decoder(&parameters)?;
     let state = Self::build_state(parameters, codec, backend)?;
@@ -807,9 +813,11 @@ fn drain_into_pending(
   loop {
     match decoder.receive_frame(hw_buf) {
       Ok(()) => {
-        // Either cap (count or bytes) closes the queue. We still drain so
-        // `send_packet` can resume on the next iteration; we just stop
-        // accumulating.
+        // Pre-transfer cap check: if we are already at or over either cap,
+        // the candidate is producing more than we can hold. Treat as an
+        // explicit candidate failure so `advance_probe` can try the next
+        // backend instead of committing a stream with silently-dropped
+        // frames in the middle.
         //
         // TODO: at very large frame sizes (8K HDR P010, > ~96 MiB each)
         // even a single retained frame is significant. Future direction:
@@ -822,11 +830,13 @@ fn drain_into_pending(
             bytes = *pending_bytes,
             max_frames = MAX_PROBE_PENDING_FRAMES,
             max_bytes = max_bytes,
-            "hwdecode: probe pending cap reached; discarding drained candidate frame"
+            "hwdecode: probe pending cap reached; failing candidate replay"
           );
           // SAFETY: hw_buf is owned and valid; unref of an empty frame is a no-op.
           unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
-          continue;
+          return Err(ffmpeg_next::Error::Other {
+            errno: libc::ENOMEM,
+          });
         }
         let mut cpu = alloc_av_frame()?;
         // SAFETY: hw_buf is a freshly-decoded HW frame; av_hwframe_transfer_data
@@ -841,14 +851,26 @@ fn drain_into_pending(
             return Err(ffmpeg_next::Error::from(r2));
           }
         }
-        // Conservative byte-cap accounting: if we can't size this frame
-        // (unknown CPU pix_fmt — should not happen with strict get_format,
-        // but a misbehaving codec could surface one), discard rather than
-        // queue an unaccounted-for allocation. Never push something whose
-        // size we can't deduct from the budget.
+        // Post-transfer accounting: size the frame and confirm we can fit
+        // it without exceeding the byte budget. If sizing fails (unknown
+        // pix_fmt) we still queue the frame — the count cap (16) bounds
+        // memory — but log that byte accounting under-counts.
         match cpu_frame_bytes(&cpu) {
           Some(bytes) => {
-            *pending_bytes = pending_bytes.saturating_add(bytes);
+            let new_total = pending_bytes.saturating_add(bytes);
+            if new_total > max_bytes {
+              tracing::warn!(
+                pending_bytes = *pending_bytes,
+                frame_bytes = bytes,
+                max_bytes,
+                "hwdecode: queueing this frame would exceed byte cap; failing candidate replay"
+              );
+              // cpu drops here.
+              return Err(ffmpeg_next::Error::Other {
+                errno: libc::ENOMEM,
+              });
+            }
+            *pending_bytes = new_total;
             pending.push_back(cpu);
           }
           None => {
@@ -856,9 +878,9 @@ fn drain_into_pending(
             let pix_fmt: i32 = unsafe { (*cpu.as_ptr()).format };
             tracing::warn!(
               pix_fmt,
-              "hwdecode: cannot size unknown CPU pix_fmt during replay; discarding drained frame"
+              "hwdecode: unknown CPU pix_fmt during replay; queueing without byte accounting (count cap still applies)"
             );
-            // cpu drops here, freeing its buffers via Frame::drop.
+            pending.push_back(cpu);
           }
         }
       }
diff --git a/src/lib.rs b/src/lib.rs
index e6c12ce..b487132 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,16 +1,17 @@
-//! Cross-platform hardware-accelerated video decoder built on top of `ffmpeg-next`.
+//! Cross-platform **hardware** video decoder built on top of `ffmpeg-next`.
 //!
 //! [`VideoDecoder`] mirrors the surface of `ffmpeg::decoder::Video`
-//! (`send_packet`/`receive_frame`/`send_eof`/`flush`) and silently picks the best
-//! hardware backend for the host platform, falling back to software if none open.
+//! (`send_packet`/`receive_frame`/`send_eof`/`flush`) and auto-probes the
+//! host's hardware backends (VideoToolbox / VAAPI / NVDEC / D3D11VA).
+//! There is **no software fallback inside this crate** — if no hardware
+//! backend can decode the stream, [`VideoDecoder::open`] returns
+//! [`Error::AllBackendsFailed`] and the caller picks how to fall back
+//! (e.g. by opening an `ffmpeg::decoder::Video` directly).
 //!
-//! Output frames returned by [`VideoDecoder::receive_frame`] are CPU-side. For
-//! hardware backends they are downloaded with `av_hwframe_transfer_data` (NV12
-//! for 8-bit input, P010 for 10-bit). For software backends the frame is in the
-//! codec's native format.
-//!
-//! Pixel-format conversion is intentionally out of scope; downstream code is
-//! expected to handle that (e.g. via `colconv`).
+//! Output frames returned by [`VideoDecoder::receive_frame`] are CPU-side
+//! and downloaded via `av_hwframe_transfer_data` (NV12 for 8-bit input,
+//! P010 for 10-bit). Pixel-format conversion is intentionally out of
+//! scope; downstream code handles that (e.g. via `colconv`).
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![cfg_attr(docsrs, allow(unused_attributes))]
 #![deny(missing_docs)]

From d351f1fb1a5539e54bc307067b01dfa9a36db960 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:31:58 +1200
Subject: [PATCH 12/27] Deep-copy probe parameters; fail candidate on unsizable CPU frames

---
 src/decoder.rs | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index cbc995b..06b9504 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -202,8 +202,15 @@ impl VideoDecoder {
         Ok(state) => {
           tracing::info!(?backend, "hwdecode: opened video decoder (probing)");
           let remaining = order[(i + 1)..].to_vec();
+          // Deep-copy the caller's `parameters` before storing in ProbeState.
+          // `codec::Parameters` from `stream.parameters()` carries an Rc
+          // owner pointing at the demuxer; moving that Rc to a worker
+          // thread (when VideoDecoder is sent) would race with the demuxer's
+          // Rc on the original thread. `Parameters::clone()` does
+          // `avcodec_parameters_copy` and returns a fully owned Parameters
+          // with `owner: None`, severing the link.
           let probe = (!remaining.is_empty()).then(|| ProbeState {
-            parameters,
+            parameters: parameters.clone(),
             codec,
             remaining_backends: remaining,
             buffered_packets: Vec::new(),
@@ -852,9 +859,9 @@ fn drain_into_pending(
           }
         }
         // Post-transfer accounting: size the frame and confirm we can fit
-        // it without exceeding the byte budget. If sizing fails (unknown
-        // pix_fmt) we still queue the frame — the count cap (16) bounds
-        // memory — but log that byte accounting under-counts.
+        // it without exceeding the byte budget. Both cap-hit and inability
+        // to size the frame are treated as candidate failures, so the byte
+        // budget is *strict* — we never queue a frame we can't account for.
         match cpu_frame_bytes(&cpu) {
           Some(bytes) => {
             let new_total = pending_bytes.saturating_add(bytes);
@@ -874,13 +881,21 @@ fn drain_into_pending(
             pending.push_back(cpu);
           }
           None => {
+            // Unknown pix_fmt — we cannot bound this frame's contribution
+            // against the byte cap, so up to MAX_PROBE_PENDING_FRAMES of
+            // them could exhaust memory. Fail the candidate so probing
+            // tries the next backend rather than queueing untracked
+            // allocations.
             // SAFETY: AVFrame.format is c_int, safe to read.
             let pix_fmt: i32 = unsafe { (*cpu.as_ptr()).format };
             tracing::warn!(
               pix_fmt,
-              "hwdecode: unknown CPU pix_fmt during replay; queueing without byte accounting (count cap still applies)"
+              "hwdecode: cannot size unknown CPU pix_fmt during replay; failing candidate"
             );
-            pending.push_back(cpu);
+            // cpu drops here.
+            return Err(ffmpeg_next::Error::Other {
+              errno: libc::ENOMEM,
+            });
           }
         }
       }

From 3b888d81b77027bb2650e23234b15f32d55124cf Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:52:12 +1200
Subject: [PATCH 13/27] Check codec context and packet clone allocation; clear pending frames on probe advance

---
 src/decoder.rs | 95 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 90 insertions(+), 5 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index 06b9504..cb325fd 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -1,10 +1,16 @@
 use std::{collections::VecDeque, mem::ManuallyDrop, ptr};
 
 use ffmpeg_next::{
-  codec::{self, Context},
+  codec::{
+    self,
+    packet::{Mut as PacketMut, Ref as PacketRef},
+    Context,
+  },
   ffi::{
     av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
-    av_hwdevice_ctx_create, av_hwframe_transfer_data, AVBufferRef, AVCodec,
+    av_hwdevice_ctx_create, av_hwframe_transfer_data, av_packet_make_writable, av_packet_ref,
+    avcodec_alloc_context3, avcodec_free_context, avcodec_parameters_to_context, AVBufferRef,
+    AVCodec,
   },
   frame, Codec, Packet, Rational,
 };
@@ -341,8 +347,23 @@ impl VideoDecoder {
               self.probe = None;
               self.pending_frames.clear();
             } else {
-              probe.buffered_packets.push(packet.clone());
-              probe.buffered_bytes = new_bytes;
+              // Use the checked clone — ffmpeg-next's `Packet::clone`
+              // discards av_packet_ref's return value and would silently
+              // store an empty packet on ENOMEM, corrupting future replay.
+              match try_clone_packet(packet) {
+                Ok(cloned) => {
+                  probe.buffered_packets.push(cloned);
+                  probe.buffered_bytes = new_bytes;
+                }
+                Err(e) => {
+                  tracing::warn!(
+                    error = %e,
+                    "hwdecode: packet clone failed for probe history; abandoning fallback safety net"
+                  );
+                  self.probe = None;
+                  self.pending_frames.clear();
+                }
+              }
             }
           }
           return Ok(());
@@ -518,6 +539,14 @@ impl VideoDecoder {
   ///   on the very first inspection (e.g. a malformed `Parameters`); the
   ///   per-candidate failures during the loop are absorbed and logged.
   fn advance_probe(&mut self) -> Result<bool> {
+    // Drop frames previously queued from the backend we're now abandoning.
+    // They came from a candidate that just failed for cause and cannot be
+    // trusted alongside frames we may queue from the next candidate. (If
+    // this method is called repeatedly via chained probe advances, this
+    // also keeps `pending_frames` from accumulating frames from multiple
+    // rejected backends.)
+    self.pending_frames.clear();
+
     loop {
       // Snapshot inputs without mutating probe state.
       let (next_backend, parameters, codec) = match self.probe.as_ref() {
@@ -642,7 +671,10 @@ impl VideoDecoder {
     codec: Codec,
     backend: Backend,
   ) -> Result<DecoderState> {
-    let mut ctx = Context::from_parameters(parameters)?;
+    // Use our checked allocator instead of Context::from_parameters, which
+    // does not null-check avcodec_alloc_context3 and would feed a null
+    // AVCodecContext into FFmpeg under OOM.
+    let mut ctx = build_codec_context(&parameters)?;
     let av_type = backend.av_hwdevice_type();
 
     // Verify the codec advertises this hwaccel. We do *not* read the
@@ -771,6 +803,59 @@ fn alloc_av_frame() -> std::result::Result<frame::Video, ffmpeg_next::Error> {
   Ok(inner)
 }
 
+/// Build a fresh `Context` from `parameters`, checking the underlying
+/// `avcodec_alloc_context3` for NULL before passing it to
+/// `avcodec_parameters_to_context`. ffmpeg-next's `Context::from_parameters`
+/// skips that check and would feed a null pointer into FFmpeg under OOM —
+/// undefined behavior. This helper surfaces the failure as `ENOMEM` and
+/// frees the context if `parameters_to_context` itself errors.
+fn build_codec_context(parameters: &codec::Parameters) -> Result<Context> {
+  // SAFETY: avcodec_alloc_context3(NULL) returns a fresh AVCodecContext
+  // or NULL on allocation failure.
+  let ctx_ptr = unsafe { avcodec_alloc_context3(ptr::null()) };
+  if ctx_ptr.is_null() {
+    return Err(Error::Ffmpeg(ffmpeg_next::Error::Other {
+      errno: libc::ENOMEM,
+    }));
+  }
+  // SAFETY: ctx_ptr is non-null and freshly allocated; parameters.as_ptr()
+  // returns a valid AVCodecParameters pointer; the function copies bytes
+  // out of parameters into the context.
+  let ret = unsafe { avcodec_parameters_to_context(ctx_ptr, parameters.as_ptr()) };
+  if ret < 0 {
+    // SAFETY: ctx_ptr was allocated by us and never handed to anyone else.
+    let mut p = ctx_ptr;
+    unsafe { avcodec_free_context(&mut p) };
+    return Err(Error::Ffmpeg(ffmpeg_next::Error::from(ret)));
+  }
+  // SAFETY: ctx_ptr is valid; passing `owner: None` means our wrapper owns
+  // the allocation and `Context::drop` will run `avcodec_free_context`.
+  Ok(unsafe { Context::wrap(ctx_ptr, None) })
+}
+
+/// Checked counterpart to `Packet::clone()`. ffmpeg-next's `clone_from`
+/// calls `av_packet_ref` and ignores the int return value; on `ENOMEM`
+/// the destination is left empty while the caller assumes the clone
+/// succeeded — corrupting any later replay history. This helper surfaces
+/// the AVERROR.
+fn try_clone_packet(src: &Packet) -> std::result::Result<Packet, ffmpeg_next::Error> {
+  let mut dst = Packet::empty();
+  // SAFETY: dst is a freshly zero-initialized Packet (av_init_packet inside
+  // Packet::empty); av_packet_ref initializes its data fields from src's
+  // refcounted buffer or returns AVERROR(ENOMEM) on failure.
+  let ret = unsafe { av_packet_ref(dst.as_mut_ptr(), src.as_ptr()) };
+  if ret < 0 {
+    return Err(ffmpeg_next::Error::from(ret));
+  }
+  // av_packet_make_writable allocates a writable copy if the buffer is
+  // shared. Can also fail with ENOMEM.
+  let ret = unsafe { av_packet_make_writable(dst.as_mut_ptr()) };
+  if ret < 0 {
+    return Err(ffmpeg_next::Error::from(ret));
+  }
+  Ok(dst)
+}
+
 /// Just `EAGAIN` (separate from EOF — the FFmpeg send/receive state machine
 /// distinguishes "drain output and retry" from "stream over").
 fn is_eagain(e: &ffmpeg_next::Error) -> bool {

From a95968d1bf780b2dd3d130565deded5370cb94da Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 23:29:39 +1200
Subject: [PATCH 14/27] decoder: use a checked codec::Parameters clone

---
 src/decoder.rs | 120 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 101 insertions(+), 19 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index cb325fd..aee8366 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -9,7 +9,8 @@ use ffmpeg_next::{
   ffi::{
     av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
     av_hwdevice_ctx_create, av_hwframe_transfer_data, av_packet_make_writable, av_packet_ref,
-    avcodec_alloc_context3, avcodec_free_context, avcodec_parameters_to_context, AVBufferRef,
+    avcodec_alloc_context3, avcodec_free_context, avcodec_parameters_alloc,
+    avcodec_parameters_copy, avcodec_parameters_free, avcodec_parameters_to_context, AVBufferRef,
     AVCodec,
   },
   frame, Codec, Packet, Rational,
@@ -204,7 +205,19 @@ impl VideoDecoder {
 
     let mut attempts: Vec<(Backend, Box<Error>)> = Vec::new();
     for (i, &backend) in order.iter().enumerate() {
-      match Self::build_state(parameters.clone(), codec, backend) {
+      // Use the checked clone — ffmpeg-next's `Parameters::clone` does
+      // `avcodec_parameters_alloc` without a null check and ignores the
+      // return of `avcodec_parameters_copy`. Under OOM that path silently
+      // produces a Parameters with a null inner pointer.
+      let cloned_for_build = match try_clone_parameters(&parameters) {
+        Ok(p) => p,
+        Err(e) => {
+          tracing::warn!(?backend, error = %e, "hwdecode: parameters clone failed");
+          attempts.push((backend, Box::new(Error::Ffmpeg(e))));
+          continue;
+        }
+      };
+      match Self::build_state(cloned_for_build, codec, backend) {
         Ok(state) => {
           tracing::info!(?backend, "hwdecode: opened video decoder (probing)");
           let remaining = order[(i + 1)..].to_vec();
@@ -212,17 +225,33 @@ impl VideoDecoder {
           // `codec::Parameters` from `stream.parameters()` carries an Rc
           // owner pointing at the demuxer; moving that Rc to a worker
           // thread (when VideoDecoder is sent) would race with the demuxer's
-          // Rc on the original thread. `Parameters::clone()` does
-          // `avcodec_parameters_copy` and returns a fully owned Parameters
-          // with `owner: None`, severing the link.
-          let probe = (!remaining.is_empty()).then(|| ProbeState {
-            parameters: parameters.clone(),
-            codec,
-            remaining_backends: remaining,
-            buffered_packets: Vec::new(),
-            buffered_bytes: 0,
-            eof_sent: false,
-          });
+          // Rc on the original thread. The checked clone copies the bytes
+          // into a fresh allocation with `owner: None`, severing the link.
+          //
+          // If the clone fails (ENOMEM), we keep the active `state` but
+          // skip probe setup — caller loses cross-backend fallback safety
+          // net but still gets a working decoder.
+          let probe = if remaining.is_empty() {
+            None
+          } else {
+            match try_clone_parameters(&parameters) {
+              Ok(probe_params) => Some(ProbeState {
+                parameters: probe_params,
+                codec,
+                remaining_backends: remaining,
+                buffered_packets: Vec::new(),
+                buffered_bytes: 0,
+                eof_sent: false,
+              }),
+              Err(e) => {
+                tracing::warn!(
+                  error = %e,
+                  "hwdecode: parameters clone failed for probe state; proceeding without fallback"
+                );
+                None
+              }
+            }
+          };
           return Ok(Self {
             state,
             hw_frame: alloc_av_frame().map_err(Error::Ffmpeg)?,
@@ -548,13 +577,28 @@ impl VideoDecoder {
     self.pending_frames.clear();
 
     loop {
-      // Snapshot inputs without mutating probe state.
+      // Snapshot inputs without mutating probe state. Use the checked
+      // clone helper rather than `Parameters::clone` (which masks ENOMEM).
       let (next_backend, parameters, codec) = match self.probe.as_ref() {
-        Some(probe) if !probe.remaining_backends.is_empty() => (
-          probe.remaining_backends[0],
-          probe.parameters.clone(),
-          probe.codec,
-        ),
+        Some(probe) if !probe.remaining_backends.is_empty() => {
+          let parameters = match try_clone_parameters(&probe.parameters) {
+            Ok(p) => p,
+            Err(e) => {
+              tracing::warn!(
+                error = %e,
+                "hwdecode: parameters clone failed during probe advance; popping backend and trying next"
+              );
+              self
+                .probe
+                .as_mut()
+                .expect("probe state present")
+                .remaining_backends
+                .remove(0);
+              continue;
+            }
+          };
+          (probe.remaining_backends[0], parameters, probe.codec)
+        }
         _ => return Ok(false),
       };
 
@@ -833,6 +877,44 @@ fn build_codec_context(parameters: &codec::Parameters) -> Result<Context> {
   Ok(unsafe { Context::wrap(ctx_ptr, None) })
 }
 
+/// Checked deep-clone of `codec::Parameters`. ffmpeg-next's
+/// `Parameters::clone` allocates via `avcodec_parameters_alloc` without
+/// checking for NULL and runs `avcodec_parameters_copy` without checking
+/// the return code. On `ENOMEM` the result is a `Parameters` with a null
+/// inner pointer, which becomes UB when later passed to FFmpeg.
+///
+/// This helper performs both calls explicitly, frees a partial allocation
+/// on failure, and surfaces the AVERROR. The returned `Parameters` has
+/// `owner: None`, severing any Rc link to the caller's demuxer (the
+/// reason we deep-clone in the first place — see Send safety in
+/// `VideoDecoder::open`).
+fn try_clone_parameters(
+  src: &codec::Parameters,
+) -> std::result::Result<codec::Parameters, ffmpeg_next::Error> {
+  // SAFETY: avcodec_parameters_alloc returns a fresh AVCodecParameters
+  // pointer or NULL on allocation failure.
+  let dst_ptr = unsafe { avcodec_parameters_alloc() };
+  if dst_ptr.is_null() {
+    return Err(ffmpeg_next::Error::Other {
+      errno: libc::ENOMEM,
+    });
+  }
+  // SAFETY: dst_ptr is non-null and freshly allocated; src.as_ptr() is
+  // a valid AVCodecParameters pointer; the function copies bytes from
+  // src into dst.
+  let ret = unsafe { avcodec_parameters_copy(dst_ptr, src.as_ptr()) };
+  if ret < 0 {
+    // SAFETY: dst_ptr was allocated by us and never handed out.
+    let mut p = dst_ptr;
+    unsafe { avcodec_parameters_free(&mut p) };
+    return Err(ffmpeg_next::Error::from(ret));
+  }
+  // SAFETY: dst_ptr is a valid AVCodecParameters; passing `owner: None`
+  // means our wrapper owns the allocation and `Parameters::drop` will
+  // call `avcodec_parameters_free`.
+  Ok(unsafe { codec::Parameters::wrap(dst_ptr, None) })
+}
+
 /// Checked counterpart to `Packet::clone()`. ffmpeg-next's `clone_from`
 /// calls `av_packet_ref` and ignores the int return value; on `ENOMEM`
 /// the destination is left empty while the caller assumes the clone

From ab25046e7e9bf01fabf399e94bb647101131bb1c Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 23:48:49 +1200
Subject: [PATCH 15/27] decoder: reject null codec::Parameters at entry points

---
 src/decoder.rs | 65 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 4 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index aee8366..c9e99eb 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -832,6 +832,23 @@ fn is_transient(e: &ffmpeg_next::Error) -> bool {
   is_eagain(e) || matches!(e, ffmpeg_next::Error::Eof)
 }
 
+/// Reject a `codec::Parameters` whose inner `*mut AVCodecParameters` is
+/// null. This guards the public trust boundary: ffmpeg-next can produce
+/// such a `Parameters` under OOM (`Parameters::new()` does not check
+/// `avcodec_parameters_alloc`), and a safe caller can legally hand one
+/// in. Without this check, the very next `(*p.as_ptr()).field` read
+/// would be a null deref.
+fn ensure_parameters_non_null(parameters: &codec::Parameters) -> Result<()> {
+  // SAFETY: as_ptr() returns the inner *const AVCodecParameters; we just
+  // inspect the pointer value (no deref).
+  if unsafe { parameters.as_ptr() }.is_null() {
+    return Err(Error::Ffmpeg(ffmpeg_next::Error::Other {
+      errno: libc::ENOMEM,
+    }));
+  }
+  Ok(())
+}
+
 /// Allocate a fresh `frame::Video`, checking that `av_frame_alloc` did not
 /// return NULL. ffmpeg-next's `frame::Video::empty()` does not surface that
 /// failure and the resulting null pointer would be UB on the next field
@@ -854,6 +871,7 @@ fn alloc_av_frame() -> std::result::Result<frame::Video, ffmpeg_next::Error> {
 /// undefined behavior. This helper surfaces the failure as `ENOMEM` and
 /// frees the context if `parameters_to_context` itself errors.
 fn build_codec_context(parameters: &codec::Parameters) -> Result<Context> {
+  ensure_parameters_non_null(parameters)?;
   // SAFETY: avcodec_alloc_context3(NULL) returns a fresh AVCodecContext
   // or NULL on allocation failure.
   let ctx_ptr = unsafe { avcodec_alloc_context3(ptr::null()) };
@@ -891,6 +909,13 @@ fn build_codec_context(parameters: &codec::Parameters) -> Result<Context> {
 fn try_clone_parameters(
   src: &codec::Parameters,
 ) -> std::result::Result<codec::Parameters, ffmpeg_next::Error> {
+  // Reject a null inner pointer at the boundary; a deref inside
+  // avcodec_parameters_copy below would otherwise be UB.
+  if unsafe { src.as_ptr() }.is_null() {
+    return Err(ffmpeg_next::Error::Other {
+      errno: libc::ENOMEM,
+    });
+  }
   // SAFETY: avcodec_parameters_alloc returns a fresh AVCodecParameters
   // pointer or NULL on allocation failure.
   let dst_ptr = unsafe { avcodec_parameters_alloc() };
@@ -949,10 +974,11 @@ fn is_eagain(e: &ffmpeg_next::Error) -> bool {
 /// `addr_of!` + `ptr::read` so a value not in our build's discriminant
 /// set never invokes UB.
 fn find_decoder(parameters: &codec::Parameters) -> Result<Codec> {
-  // SAFETY: parameters owns a valid AVCodecParameters; addr_of! projects
-  // to the codec_id field; the *const u32 cast is sound because AVCodecID
-  // is `#[repr(u32)]` (same size and alignment as u32). Reading as u32
-  // cannot be UB regardless of the value FFmpeg wrote.
+  ensure_parameters_non_null(parameters)?;
+  // SAFETY: parameters' inner pointer is non-null (checked above);
+  // addr_of! projects to the codec_id field; the *const u32 cast is sound
+  // because AVCodecID is `#[repr(u32)]` (same size and alignment as u32).
+  // Reading as u32 cannot be UB regardless of the value FFmpeg wrote.
   let raw_id: u32 =
     unsafe { ptr::read(ptr::addr_of!((*parameters.as_ptr()).codec_id) as *const u32) };
 
@@ -1137,4 +1163,35 @@ mod tests {
     let other = ffmpeg_next::Error::InvalidData;
     assert!(!is_transient(&other));
   }
+
+  /// Regression: a `codec::Parameters` with a null inner pointer must be
+  /// rejected at the entrypoint, not deref'd. ffmpeg-next's
+  /// `Parameters::new()` does not check `avcodec_parameters_alloc()`, so a
+  /// safe caller can hand us such a value under OOM.
+  #[test]
+  fn open_rejects_null_parameters() {
+    // SAFETY: Parameters::wrap accepts any pointer; we explicitly construct
+    // one with null inner. avcodec_parameters_free is null-safe on Drop.
+    let null_params = unsafe { codec::Parameters::wrap(std::ptr::null_mut(), None) };
+    match VideoDecoder::open(null_params) {
+      Ok(_) => panic!("open should fail on null parameters"),
+      Err(Error::Ffmpeg(ffmpeg_next::Error::Other { errno })) => {
+        assert_eq!(errno, libc::ENOMEM, "expected ENOMEM, got {errno}");
+      }
+      Err(other) => panic!("expected Ffmpeg(Other {{ ENOMEM }}), got {other:?}"),
+    }
+  }
+
+  #[test]
+  fn open_with_rejects_null_parameters() {
+    // SAFETY: see open_rejects_null_parameters.
+    let null_params = unsafe { codec::Parameters::wrap(std::ptr::null_mut(), None) };
+    match VideoDecoder::open_with(null_params, Backend::VideoToolbox) {
+      Ok(_) => panic!("open_with should fail on null parameters"),
+      Err(Error::Ffmpeg(ffmpeg_next::Error::Other { errno })) => {
+        assert_eq!(errno, libc::ENOMEM, "expected ENOMEM, got {errno}");
+      }
+      Err(other) => panic!("expected Ffmpeg(Other {{ ENOMEM }}), got {other:?}"),
+    }
+  }
 }

From 36a4729606e825af94973f1229d83d2969f56401 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 00:26:00 +1200
Subject: [PATCH 16/27] decoder: retry send_eof on EAGAIN; check codec_type as int

---
 src/decoder.rs | 57 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 5 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index c9e99eb..165f6d8 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -11,7 +11,7 @@ use ffmpeg_next::{
     av_hwdevice_ctx_create, av_hwframe_transfer_data, av_packet_make_writable, av_packet_ref,
     avcodec_alloc_context3, avcodec_free_context, avcodec_parameters_alloc,
     avcodec_parameters_copy, avcodec_parameters_free, avcodec_parameters_to_context, AVBufferRef,
-    AVCodec,
+    AVCodec, AVMediaType,
   },
   frame, Codec, Packet, Rational,
 };
@@ -670,8 +670,30 @@ impl VideoDecoder {
           }
         }
         if r.is_ok() && probe.eof_sent {
-          if let Err(e) = candidate_state.inner.send_eof() {
-            r = Err(e);
+          // `avcodec_send_packet(NULL)` (which `send_eof` becomes) can
+          // return EAGAIN with the same drain-output-first semantics as
+          // a regular send_packet. Loop drain+retry instead of failing
+          // the candidate on backpressure.
+          loop {
+            match candidate_state.inner.send_eof() {
+              Ok(()) => break,
+              Err(e) if is_eagain(&e) => {
+                if let Err(de) = drain_into_pending(
+                  &mut candidate_state.inner,
+                  &mut hw_buf,
+                  &mut local_pending,
+                  &mut local_pending_bytes,
+                  max_pending_bytes,
+                ) {
+                  r = Err(de);
+                  break;
+                }
+              }
+              Err(e) => {
+                r = Err(e);
+                break;
+              }
+            }
           }
         }
         r
@@ -778,8 +800,15 @@ impl VideoDecoder {
 
     // Open the decoder. On any failure, release the resources we just
     // allocated so we don't leak.
-    let opened = match ctx.decoder().open_as(codec).and_then(|o| o.video()) {
-      Ok(d) => d,
+    //
+    // We deliberately bypass `Opened::video()` because it calls
+    // `Context::medium()`, which reads `AVCodecContext.codec_type` as the
+    // bindgen `AVMediaType` enum — the same UB hazard we've been
+    // systematically removing. Instead: validate `codec_type` as a raw
+    // `c_int` ourselves, then construct the `decoder::Video` wrapper
+    // directly via its public tuple field.
+    let opened = match ctx.decoder().open_as(codec) {
+      Ok(o) => o,
       Err(e) => {
         // SAFETY: we either allocated these in this function above or
         // they are null; av_buffer_unref / Box::from_raw handle null
@@ -797,6 +826,24 @@ impl VideoDecoder {
       }
     };
 
+    // Validate codec_type as a raw integer — never construct AVMediaType
+    // from an unvalidated runtime value.
+    // SAFETY: codec_type is bound as AVMediaType (`#[repr(i32)]`), same
+    // size and alignment as i32; reading the bytes as i32 cannot be UB.
+    let codec_type_int: i32 =
+      unsafe { ptr::read(ptr::addr_of!((*opened.as_ptr()).codec_type) as *const i32) };
+    let video_type_int: i32 = AVMediaType::AVMEDIA_TYPE_VIDEO as i32;
+    if codec_type_int != video_type_int {
+      // Not a video codec context — surface the same error
+      // `Opened::video()` would have, without going through enum
+      // construction. Cleanup runs via `opened`'s Drop.
+      return Err(Error::Ffmpeg(ffmpeg_next::Error::InvalidData));
+    }
+    // SAFETY of construction: `decoder::Video` is `pub struct Video(pub Opened)`.
+    // We construct via the public field; this is the same wrapping
+    // `Opened::video()` does on success, just without the enum read.
+    let opened = ffmpeg_next::decoder::Video(opened);
+
     Ok(DecoderState {
       inner: ManuallyDrop::new(opened),
       backend,

From e27758846eca038a0be9c3c265193ac8d9856ef3 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 01:31:36 +1200
Subject: [PATCH 17/27] bump to 0.1.0; document HW-only API; shallow packet clone

---
 Cargo.toml        |  2 +-
 README.md         | 51 ++++++++++++++++++++++-----------
 docs/design.md    | 72 +++++++++--------------------------------------
 src/decoder.rs    | 28 +++++++++---------
 tests/hw_smoke.rs | 25 ++++++++++++----
 5 files changed, 82 insertions(+), 96 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 7691656..9a3b19a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "hwdecode"
-version = "0.0.0"
+version = "0.1.0"
 edition = "2021"
 rust-version = "1.95"
 description = "Cross-platform hardware-only video decoder built on top of ffmpeg-next, with auto-probe across HW backends. Callers handle software fallback."
diff --git a/README.md b/README.md
index c4e9108..05e7c56 100644
--- a/README.md
+++ b/README.md
@@ -4,27 +4,32 @@ Cross-platform hardware-accelerated video decoder for Rust, built on top of
 [`ffmpeg-next`](https://crates.io/crates/ffmpeg-next).
 
 `VideoDecoder` mirrors the `send_packet` / `receive_frame` interface of
-`ffmpeg::decoder::Video` and silently picks the best hardware backend for the
-host platform, falling back to software if none open. Output frames are
-CPU-side — for HW backends they are downloaded with `av_hwframe_transfer_data`
-(NV12 for 8-bit, P010 for 10-bit). Pixel-format conversion is intentionally
-out of scope.
+`ffmpeg::decoder::Video` and auto-probes the host's hardware backends.
+This crate is **hardware-only** — there is no software fallback inside it.
+If no hardware backend can decode the stream, `VideoDecoder::open` returns
+`Error::AllBackendsFailed` and the caller decides how to fall back (typically
+by opening an `ffmpeg::decoder::Video` directly). Output frames are CPU-side,
+downloaded with `av_hwframe_transfer_data` (NV12 for 8-bit, P010 for 10-bit).
+Pixel-format conversion is intentionally out of scope.
 
 ## Backends
 
-| Target              | Probe order                       |
+| Target              | Probe order (HW only)             |
 | ------------------- | --------------------------------- |
-| macOS / iOS / tvOS  | VideoToolbox → Software           |
-| Linux               | VAAPI → CUDA → Software           |
-| Windows             | D3D11VA → CUDA → Software         |
-| other               | Software                          |
+| macOS / iOS / tvOS  | VideoToolbox                      |
+| Linux               | VAAPI → CUDA                      |
+| Windows             | D3D11VA → CUDA                    |
+| other               | (none)                            |
+
+If `open` returns `Error::AllBackendsFailed`, software fallback is the
+caller's responsibility (this crate intentionally does not include one).
 
 ## Usage
 
-```rust
+```rust,no_run
 use ffmpeg_next as ffmpeg;
-use ffmpeg::{format, frame, media};
-use hwdecode::VideoDecoder;
+use ffmpeg::{format, media};
+use hwdecode::{Frame, VideoDecoder};
 
 ffmpeg::init()?;
 
@@ -32,15 +37,29 @@ let mut input = format::input(path)?;
 let stream = input.streams().best(media::Type::Video).unwrap();
 let stream_index = stream.index();
 
-let mut decoder = VideoDecoder::open(stream.parameters())?;
+// HW-only open. On AllBackendsFailed, fall back to software yourself.
+let mut decoder = match VideoDecoder::open(stream.parameters()) {
+    Ok(d) => d,
+    Err(hwdecode::Error::AllBackendsFailed { .. }) => {
+        // Caller-side software fallback.
+        let _sw = ffmpeg::codec::Context::from_parameters(stream.parameters())?
+            .decoder()
+            .video()?;
+        // ... drive _sw with send_packet / receive_frame yourself ...
+        return Ok(());
+    }
+    Err(e) => return Err(e.into()),
+};
 println!("backend = {:?}", decoder.backend());
 
-let mut frame = frame::Video::empty();
+let mut frame = Frame::empty()?;
 for (s, packet) in input.packets() {
     if s.index() != stream_index { continue; }
     decoder.send_packet(&packet)?;
     while decoder.receive_frame(&mut frame).is_ok() {
-        // frame.format() is NV12 / P010 (HW path) or codec-native (SW path)
+        // frame.pix_fmt() is the integer constant — match against
+        // hwdecode::pix_fmt::{NV12, P010LE, ...} and dispatch to your
+        // pixel-format pipeline (e.g. `colconv`).
         // ... do something with frame ...
     }
 }
diff --git a/docs/design.md b/docs/design.md
index 2c54aee..521dd49 100644
--- a/docs/design.md
+++ b/docs/design.md
@@ -26,72 +26,26 @@ Cross-platform **hardware-only** video decoder built on top of `ffmpeg-next` 8.1
 
 ## Public API
 
-```rust
-pub struct VideoDecoder { /* private */ }
-
-impl VideoDecoder {
-    /// Auto-probe HW backends in platform order. Returns
-    /// `Error::AllBackendsFailed` if no backend can decode this stream;
-    /// caller falls back to software decoder of choice. On success,
-    /// `backend()` reports the one that won.
-    pub fn open(parameters: ffmpeg::codec::Parameters) -> Result<Self, Error>;
-
-    /// Force a specific backend. No probe, no fallback.
-    pub fn open_with(parameters: ffmpeg::codec::Parameters, backend: Backend) -> Result<Self, Error>;
-
-    pub fn backend(&self) -> Backend;
-    pub fn width(&self) -> u32;
-    pub fn height(&self) -> u32;
-    pub fn format(&self) -> ffmpeg::format::Pixel;
-    pub fn time_base(&self) -> ffmpeg::Rational;
-    pub fn frame_rate(&self) -> ffmpeg::Rational;
-
-    pub fn send_packet(&mut self, packet: &ffmpeg::Packet) -> Result<(), Error>;
-    pub fn send_eof(&mut self) -> Result<(), Error>;
-
-    /// Receive a CPU-side frame. Internally calls
-    /// `av_hwframe_transfer_data` and copies PTS/timing onto the result;
-    /// output format is NV12 (8-bit) or P010 (10-bit) per the HW backend's
-    /// `AVHWFramesContext::sw_format`.
-    pub fn receive_frame(&mut self, frame: &mut Frame) -> Result<(), Error>;
-
-    pub fn flush(&mut self);
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub enum Backend {
-    Software,
-    VideoToolbox, // macOS, iOS, iPadOS, tvOS
-    Vaapi,        // Linux (Intel/AMD)
-    Cuda,         // Linux/Windows (NVIDIA)
-    D3d11va,      // Windows
-}
-
-#[derive(Debug, thiserror::Error)]
-pub enum Error {
-    #[error("ffmpeg error: {0}")]
-    Ffmpeg(#[from] ffmpeg::Error),
-    #[error("no decoder for codec id {0:?}")]
-    NoCodec(ffmpeg::codec::Id),
-    #[error("hardware device init failed for {backend:?}: {source}")]
-    HwDeviceInitFailed { backend: Backend, source: ffmpeg::Error },
-    #[error("all backends failed; attempts: {attempts:?}")]
-    AllBackendsFailed { attempts: Vec<(Backend, ffmpeg::Error)> },
-}
-```
+> The original spec listed an inline API surface here. It diverged from the
+> shipping crate (`Backend::Software` was removed; `format() -> Pixel` was
+> removed in favor of `Frame::pix_fmt() -> i32`; the `Frame` wrapper
+> replaced `frame::Video`; `Error` gained / dropped variants). Rather than
+> keep stale signatures here, the canonical reference is `src/lib.rs` and
+> the public docs on each item. See the README for a runnable usage
+> example.
 
 ## Behavior
 
 ### Probe order
 
-| Target              | Order tried                                  |
+| Target              | Order tried (HW only)                        |
 | ------------------- | -------------------------------------------- |
-| macOS, iOS, tvOS    | `[VideoToolbox, Software]`                   |
-| Linux               | `[Vaapi, Cuda, Software]`                    |
-| Windows             | `[D3d11va, Cuda, Software]`                  |
-| Other               | `[Software]`                                 |
+| macOS, iOS, tvOS    | `[VideoToolbox]`                             |
+| Linux               | `[Vaapi, Cuda]`                              |
+| Windows             | `[D3d11va, Cuda]`                            |
+| Other               | `[]` → `Error::AllBackendsFailed`            |
 
-A HW backend is a candidate only if **(a)** its `AVHWDeviceType` device can be created via `av_hwdevice_ctx_create`, and **(b)** the codec advertises support via `avcodec_get_hw_config` matching that device type. The first candidate that fully opens wins. Each failure logs `tracing::warn!` with the backend and the underlying error and the loop tries the next.
+A HW backend is a candidate only if **(a)** its `AVHWDeviceType` device can be created via `av_hwdevice_ctx_create`, and **(b)** the codec advertises support via `avcodec_get_hw_config` matching that device type. The first candidate that fully opens wins. Each failure logs `tracing::warn!` with the backend and the underlying error and the loop tries the next. If every backend fails (or the platform has none), `open` returns `Error::AllBackendsFailed`; software fallback is the caller's responsibility.
 
 ### Device selection
 
diff --git a/src/decoder.rs b/src/decoder.rs
index 165f6d8..e146a18 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -8,10 +8,9 @@ use ffmpeg_next::{
   },
   ffi::{
     av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
-    av_hwdevice_ctx_create, av_hwframe_transfer_data, av_packet_make_writable, av_packet_ref,
-    avcodec_alloc_context3, avcodec_free_context, avcodec_parameters_alloc,
-    avcodec_parameters_copy, avcodec_parameters_free, avcodec_parameters_to_context, AVBufferRef,
-    AVCodec, AVMediaType,
+    av_hwdevice_ctx_create, av_hwframe_transfer_data, av_packet_ref, avcodec_alloc_context3,
+    avcodec_free_context, avcodec_parameters_alloc, avcodec_parameters_copy,
+    avcodec_parameters_free, avcodec_parameters_to_context, AVBufferRef, AVCodec, AVMediaType,
   },
   frame, Codec, Packet, Rational,
 };
@@ -114,10 +113,12 @@ const MAX_PROBE_PACKET_BYTES: usize = 64 * 1024 * 1024;
 /// during probe replay. Each frame is a fully-allocated CPU buffer
 /// (~3 MiB for 1080p NV12, ~24 MiB for 4K P010, ~96 MiB for 8K P010), so
 /// an unbounded queue would OOM on a candidate with a shallow internal
-/// queue against a deep replay history. Drained candidate frames in
-/// excess of this cap (or [`DEFAULT_MAX_PROBE_PENDING_BYTES`], whichever
-/// hits first) are discarded with a `tracing::warn!`; we still drain so
-/// `send_packet` can keep feeding the candidate.
+/// queue against a deep replay history. This cap, together with
+/// [`DEFAULT_MAX_PROBE_PENDING_BYTES`], is enforced as a hard limit during
+/// replay: once either limit is reached, probe buffering fails for the
+/// candidate (returns `ENOMEM` from `drain_into_pending`) instead of
+/// queueing additional drained frames. The probe loop then advances to
+/// the next backend or returns `Error::AllBackendsFailed` if exhausted.
 const MAX_PROBE_PENDING_FRAMES: usize = 16;
 
 /// Default byte budget for probe-replay drained frames. 256 MiB is enough
@@ -991,7 +992,10 @@ fn try_clone_parameters(
 /// calls `av_packet_ref` and ignores the int return value; on `ENOMEM`
 /// the destination is left empty while the caller assumes the clone
 /// succeeded — corrupting any later replay history. This helper surfaces
-/// the AVERROR.
+/// the AVERROR. The result is a refcounted shallow clone — the payload
+/// buffer is shared with `src` rather than deep-copied; the probe replay
+/// only sends packets through `avcodec_send_packet`, which does not
+/// require a writable buffer.
 fn try_clone_packet(src: &Packet) -> std::result::Result<Packet, ffmpeg_next::Error> {
   let mut dst = Packet::empty();
   // SAFETY: dst is a freshly zero-initialized Packet (av_init_packet inside
@@ -1001,12 +1005,6 @@ fn try_clone_packet(src: &Packet) -> std::result::Result<Packet, ffmpeg_next::Er
   if ret < 0 {
     return Err(ffmpeg_next::Error::from(ret));
   }
-  // av_packet_make_writable allocates a writable copy if the buffer is
-  // shared. Can also fail with ENOMEM.
-  let ret = unsafe { av_packet_make_writable(dst.as_mut_ptr()) };
-  if ret < 0 {
-    return Err(ffmpeg_next::Error::from(ret));
-  }
   Ok(dst)
 }
 
diff --git a/tests/hw_smoke.rs b/tests/hw_smoke.rs
index 372c8f7..50d67ca 100644
--- a/tests/hw_smoke.rs
+++ b/tests/hw_smoke.rs
@@ -1,5 +1,7 @@
-//! `#[ignore]`-gated smoke test that asserts the auto-probed backend is
-//! actually a hardware backend (not Software). Run with:
+//! `#[ignore]`-gated smoke test that exercises end-to-end hardware decode
+//! against a real video file: opens the auto-probed decoder, drives it
+//! until the first frame is delivered, and asserts the active backend is
+//! one of the supported HW variants. Run with:
 //!
 //! ```sh
 //! HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test --test hw_smoke -- --ignored
@@ -7,7 +9,7 @@
 
 use ffmpeg::{format, media};
 use ffmpeg_next as ffmpeg;
-use hwdecode::{Frame, VideoDecoder};
+use hwdecode::{Backend, Frame, VideoDecoder};
 
 const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
 
@@ -60,6 +62,19 @@ fn auto_probe_picks_hardware_backend() {
     }
   }
   assert!(got_frame, "no frames decoded");
-  // hwdecode is hardware-only — `backend()` after a successful first frame
-  // is by construction one of the HW variants. Logged above for visibility.
+  // After the probe collapses, `backend()` reports the backend that
+  // actually produced the first frame. Make the doc-comment claim
+  // explicit: it must be one of the HW variants. Today the enum is
+  // exhaustively HW-only, so `matches!` here is tautological — but it
+  // documents intent and would catch a future regression that
+  // reintroduces a non-HW variant or leaves the active state
+  // mis-classified.
+  let backend = decoder.backend();
+  assert!(
+    matches!(
+      backend,
+      Backend::VideoToolbox | Backend::Vaapi | Backend::Cuda | Backend::D3d11va
+    ),
+    "expected HW backend, got {backend:?}"
+  );
 }

From 759f31d19ab0d0e8207c01e328746778657eb53d Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 10:58:42 +1200
Subject: [PATCH 18/27] update

---
 README.md      |  16 ++-
 src/decoder.rs | 177 +++++++++++++++++-------
 src/frame.rs   | 362 +++++++++++++++++++++++++++++++++++++++++--------
 src/lib.rs     |  10 +-
 4 files changed, 456 insertions(+), 109 deletions(-)

diff --git a/README.md b/README.md
index 05e7c56..3da5fba 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,17 @@ Cross-platform hardware-accelerated video decoder for Rust, built on top of
 `VideoDecoder` mirrors the `send_packet` / `receive_frame` interface of
 `ffmpeg::decoder::Video` and auto-probes the host's hardware backends.
 This crate is **hardware-only** — there is no software fallback inside it.
-If no hardware backend can decode the stream, `VideoDecoder::open` returns
-`Error::AllBackendsFailed` and the caller decides how to fall back (typically
-by opening an `ffmpeg::decoder::Video` directly). Output frames are CPU-side,
-downloaded with `av_hwframe_transfer_data` (NV12 for 8-bit, P010 for 10-bit).
-Pixel-format conversion is intentionally out of scope.
+If no hardware backend can decode the stream, `Error::AllBackendsFailed`
+surfaces from `VideoDecoder::open` (when no backend opens) or from
+`receive_frame` / `send_packet` / `send_eof` (when the initially-opened
+backend fails at decode time and every remaining backend in the probe order
+also fails — the only way it surfaces on single-backend platforms like macOS).
+The caller decides how to fall back (typically by opening an
+`ffmpeg::decoder::Video` directly). Output frames are CPU-side, downloaded
+with `av_hwframe_transfer_data` (NV12 for 8-bit, P010 for 10-bit).
+Pixel-format conversion is intentionally out of scope; safe per-row access is via
+`Frame::row` / `Frame::rows` (clipped to visible byte width — never includes
+FFmpeg's per-row alignment padding).
 
 ## Backends
 
diff --git a/src/decoder.rs b/src/decoder.rs
index e146a18..2fe4f08 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -142,7 +142,9 @@ struct ProbeState {
   parameters: codec::Parameters,
   codec: Codec,
   /// Backends still to try, in order. Empty means "no more options after
-  /// the active one fails".
+  /// the active one fails" — `advance_probe` then surfaces
+  /// [`Error::AllBackendsFailed`] so the contract is the same on
+  /// single-backend platforms (e.g. macOS) as on multi-backend ones.
   remaining_backends: Vec<Backend>,
   /// Packets sent so far, kept for replay through any candidate backend.
   /// Preserved across failed candidates — only cleared when the probe
@@ -154,6 +156,12 @@ struct ProbeState {
   buffered_bytes: usize,
   /// Whether `send_eof` has been called; replayed alongside packets.
   eof_sent: bool,
+  /// Per-backend errors captured since the probe window opened. Pushed
+  /// whenever a backend's failure triggers `advance_probe` (the active
+  /// backend that just failed) or a candidate's build / replay rejects
+  /// it. Drained into [`Error::AllBackendsFailed`] when the probe
+  /// exhausts every option.
+  attempts: Vec<(Backend, Box<Error>)>,
 }
 
 // SAFETY: All raw pointers are exclusively owned by `DecoderState` and never
@@ -197,9 +205,21 @@ impl VideoDecoder {
   /// consumed, so a misbehaving middle backend cannot strand the caller.
   ///
   /// [`Self::backend`] reflects whichever backend ultimately produced the
-  /// first frame. If no hardware backend in the platform's probe order can
-  /// decode this stream, `open` returns [`Error::AllBackendsFailed`];
-  /// callers handle software fallback themselves.
+  /// first frame.
+  ///
+  /// [`Error::AllBackendsFailed`] surfaces in two places, with the same
+  /// meaning ("no hardware backend can decode this stream — fall back to
+  /// software yourself"):
+  /// - From `open` itself, when no backend even opens.
+  /// - From [`Self::send_packet`] / [`Self::send_eof`] /
+  ///   [`Self::receive_frame`], when the initially-opened backend fails
+  ///   at decode time and every remaining backend in the probe order
+  ///   either also fails or doesn't exist. On single-backend platforms
+  ///   (e.g. macOS, where the order is `[VideoToolbox]`), this is the
+  ///   only place a HW-only failure surfaces.
+  ///
+  /// In both cases, `attempts` carries the per-backend error log so the
+  /// caller can decide how to proceed with software fallback.
   pub fn open(parameters: codec::Parameters) -> Result<Self> {
     let codec = find_decoder(&parameters)?;
     let order = backend::probe_order();
@@ -229,28 +249,33 @@ impl VideoDecoder {
           // Rc on the original thread. The checked clone copies the bytes
           // into a fresh allocation with `owner: None`, severing the link.
           //
+          // We always create ProbeState — even when `remaining` is empty
+          // (single-backend platforms like macOS) — so that a first-frame
+          // failure on the only backend surfaces as
+          // `Error::AllBackendsFailed` from `receive_frame` /
+          // `send_packet` rather than as a raw FFmpeg error. That keeps
+          // the API contract the same regardless of how many HW backends
+          // the platform exposes.
+          //
           // If the clone fails (ENOMEM), we keep the active `state` but
-          // skip probe setup — caller loses cross-backend fallback safety
-          // net but still gets a working decoder.
-          let probe = if remaining.is_empty() {
-            None
-          } else {
-            match try_clone_parameters(&parameters) {
-              Ok(probe_params) => Some(ProbeState {
-                parameters: probe_params,
-                codec,
-                remaining_backends: remaining,
-                buffered_packets: Vec::new(),
-                buffered_bytes: 0,
-                eof_sent: false,
-              }),
-              Err(e) => {
-                tracing::warn!(
-                  error = %e,
-                  "hwdecode: parameters clone failed for probe state; proceeding without fallback"
-                );
-                None
-              }
+          // skip probe setup — caller loses the transactional probe /
+          // fallback safety net but still gets a working decoder.
+          let probe = match try_clone_parameters(&parameters) {
+            Ok(probe_params) => Some(ProbeState {
+              parameters: probe_params,
+              codec,
+              remaining_backends: remaining,
+              buffered_packets: Vec::new(),
+              buffered_bytes: 0,
+              eof_sent: false,
+              attempts: Vec::new(),
+            }),
+            Err(e) => {
+              tracing::warn!(
+                error = %e,
+                "hwdecode: parameters clone failed for probe state; proceeding without fallback"
+              );
+              None
             }
           };
           return Ok(Self {
@@ -403,7 +428,10 @@ impl VideoDecoder {
           return Err(Error::Ffmpeg(e));
         }
         Err(e) => {
-          if self.probe.is_some() && self.advance_probe()? {
+          if self.probe.is_some() {
+            // advance_probe consumes the error into `attempts` and either
+            // installs a candidate (Ok) or surfaces AllBackendsFailed (Err).
+            self.advance_probe(Error::Ffmpeg(e))?;
             continue;
           }
           return Err(Error::Ffmpeg(e));
@@ -428,7 +456,8 @@ impl VideoDecoder {
         }
         Err(e) if is_transient(&e) => return Err(Error::Ffmpeg(e)),
         Err(e) => {
-          if self.probe.is_some() && self.advance_probe()? {
+          if self.probe.is_some() {
+            self.advance_probe(Error::Ffmpeg(e))?;
             continue;
           }
           return Err(Error::Ffmpeg(e));
@@ -453,8 +482,11 @@ impl VideoDecoder {
   /// method, so the caller never loses initial frames after a fallback.
   ///
   /// This crate is hardware-only: there is no software fallback inside the
-  /// decoder. If every backend is exhausted, the failure surfaces as the
-  /// last decoder error. Callers handle software fallback themselves.
+  /// decoder. When every backend in the probe order has been exhausted —
+  /// including the case of a single-backend platform whose only backend
+  /// failed — this returns [`Error::AllBackendsFailed`] with the
+  /// per-backend attempt log so the caller can branch into a software
+  /// decoder of their choice.
   ///
   /// Returns the same transient signals as `ffmpeg::decoder::Video`:
   /// `Error::Ffmpeg(Other { errno: EAGAIN })` when no frame is ready and
@@ -478,8 +510,11 @@ impl VideoDecoder {
           // EOF (and every other non-transient error): if we are still
           // probing, treat it as candidate failure — a backend that drains
           // to EOF without ever producing a frame should not silently
-          // present as "stream over" to the caller. Advance and retry.
-          if self.probe.is_some() && self.advance_probe()? {
+          // present as "stream over" to the caller. Advance and retry; if
+          // every backend has been exhausted, advance_probe surfaces
+          // AllBackendsFailed and `?` propagates it.
+          if self.probe.is_some() {
+            self.advance_probe(Error::Ffmpeg(e))?;
             // Probe advance may have populated `pending_frames`; deliver
             // one of those before reading more from the new candidate.
             if self.try_pop_pending(frame) {
@@ -487,7 +522,7 @@ impl VideoDecoder {
             }
             continue;
           }
-          // Probe collapsed or exhausted — surface the error (including EOF
+          // Probe collapsed already — surface the error (including EOF
           // for a genuinely empty stream).
           return Err(Error::Ffmpeg(e));
         }
@@ -504,7 +539,8 @@ impl VideoDecoder {
               return Ok(());
             }
             Err(e) => {
-              if self.probe.is_some() && self.advance_probe()? {
+              if self.probe.is_some() {
+                self.advance_probe(Error::Ffmpeg(e))?;
                 unsafe { av_frame_unref(frame.as_inner_mut().as_mut_ptr()) };
                 if self.try_pop_pending(frame) {
                   return Ok(());
@@ -561,14 +597,39 @@ impl VideoDecoder {
   /// Try the next backend in `remaining_backends`. Transactional: a
   /// candidate must successfully build and accept the replayed history
   /// before any probe state is consumed. Backends that fail to build or
-  /// reject the replay are skipped (with `tracing::warn!`) and the loop
-  /// continues to the next one. Returns:
-  /// - `Ok(true)` when a candidate is installed and replay completed.
-  /// - `Ok(false)` when the probe is exhausted (no more backends to try).
-  /// - `Err(_)` only for genuinely fatal conditions surfaced by `build_state`
-  ///   on the very first inspection (e.g. a malformed `Parameters`); the
-  ///   per-candidate failures during the loop are absorbed and logged.
-  fn advance_probe(&mut self) -> Result<bool> {
+  /// reject the replay are recorded into `probe.attempts` and the loop
+  /// continues to the next one.
+  ///
+  /// `last_error` is the error that triggered this advance — i.e. the
+  /// failure of the currently active backend on `send_packet` /
+  /// `send_eof` / `receive_frame`. It is recorded against the active
+  /// backend before any candidate is tried so that a final
+  /// `AllBackendsFailed` carries the full attempt log including the
+  /// initially-opened backend's runtime failure.
+  ///
+  /// Returns:
+  /// - `Ok(())` when a candidate is installed and replay completed —
+  ///   caller should retry the operation.
+  /// - `Err(Error::AllBackendsFailed { attempts })` when every remaining
+  ///   backend has been exhausted (including the just-failed active one).
+  ///   This is what the documented `open` contract promises, surfaced at
+  ///   runtime so the caller can branch into a software fallback. On a
+  ///   single-backend platform (e.g. macOS), this fires after the only
+  ///   backend's first-frame failure; on multi-backend platforms it
+  ///   fires after the last candidate's failure.
+  /// - `Err(_)` for other fatal conditions surfaced by probe machinery
+  ///   itself (e.g. `alloc_av_frame` ENOMEM during replay drain).
+  fn advance_probe(&mut self, last_error: Error) -> Result<()> {
+    // Record the failure that triggered this advance against the active
+    // backend. If the probe was somehow already gone (shouldn't happen —
+    // call sites guard with `self.probe.is_some()`), just propagate the
+    // error so behaviour matches the pre-fix code path.
+    let active_backend = self.state.backend;
+    match self.probe.as_mut() {
+      Some(probe) => probe.attempts.push((active_backend, Box::new(last_error))),
+      None => return Err(last_error),
+    }
+
     // Drop frames previously queued from the backend we're now abandoning.
     // They came from a candidate that just failed for cause and cannot be
     // trusted alongside frames we may queue from the next candidate. (If
@@ -589,25 +650,37 @@ impl VideoDecoder {
                 error = %e,
                 "hwdecode: parameters clone failed during probe advance; popping backend and trying next"
               );
-              self
+              let popped = self
                 .probe
                 .as_mut()
                 .expect("probe state present")
                 .remaining_backends
                 .remove(0);
+              self
+                .probe
+                .as_mut()
+                .expect("probe state present")
+                .attempts
+                .push((popped, Box::new(Error::Ffmpeg(e))));
               continue;
             }
           };
           (probe.remaining_backends[0], parameters, probe.codec)
         }
-        _ => return Ok(false),
+        // No more candidates — surface the accumulated attempt log as
+        // AllBackendsFailed so single- and multi-backend platforms have
+        // the same contract for "every HW backend failed."
+        _ => {
+          let attempts = self.probe.take().map(|p| p.attempts).unwrap_or_default();
+          return Err(Error::AllBackendsFailed { attempts });
+        }
       };
 
       let prev_backend = self.state.backend;
       tracing::warn!(from = ?prev_backend, to = ?next_backend, "hwdecode: advancing probe");
 
-      // Build candidate. On failure, pop and continue without touching the
-      // packet buffer.
+      // Build candidate. On failure, record into attempts and continue
+      // without touching the packet buffer.
       let mut candidate_state = match Self::build_state(parameters, codec, next_backend) {
         Ok(s) => s,
         Err(e) => {
@@ -618,6 +691,12 @@ impl VideoDecoder {
             .expect("probe state present")
             .remaining_backends
             .remove(0);
+          self
+            .probe
+            .as_mut()
+            .expect("probe state present")
+            .attempts
+            .push((next_backend, Box::new(e)));
           continue;
         }
       };
@@ -713,6 +792,12 @@ impl VideoDecoder {
           .expect("probe state present")
           .remaining_backends
           .remove(0);
+        self
+          .probe
+          .as_mut()
+          .expect("probe state present")
+          .attempts
+          .push((next_backend, Box::new(Error::Ffmpeg(e))));
         continue;
       }
 
@@ -727,7 +812,7 @@ impl VideoDecoder {
         .expect("probe state present")
         .remaining_backends
         .remove(0);
-      return Ok(true);
+      return Ok(());
     }
   }
 
diff --git a/src/frame.rs b/src/frame.rs
index 65ec63d..3f48075 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -8,11 +8,24 @@
 //! when the value isn't in the build's bindgen-generated discriminant set
 //! (the exact failure mode this crate is designed to survive).
 //!
-//! Plane lengths for [`Frame::data`] are computed from a hardcoded chroma-
-//! subsampling table keyed on the safe `pix_fmt()` integer, covering only
-//! the formats `hwdecode` produces (the NV* and P0xx/P2xx/P4xx families
-//! after `av_hwframe_transfer_data`). For any other format, [`Frame::data`]
-//! returns `None` rather than guessing at a slice length.
+//! Per-row sizes for [`Frame::row`] / [`Frame::rows`] are computed from
+//! hardcoded chroma-subsampling and bit-depth tables keyed on the safe
+//! `pix_fmt()` integer, covering only the formats `hwdecode` produces (the
+//! NV* and P0xx/P2xx/P4xx families after `av_hwframe_transfer_data`). For
+//! any other format, the row accessors return `None` rather than guessing
+//! at a slice length.
+//!
+//! Why per-row, not whole-plane: FFmpeg allocates each row at
+//! `linesize[plane]` ([`Frame::stride`]) bytes for SIMD alignment, but
+//! hardware transfer paths only initialize the first
+//! [`Frame::row_bytes`]`(plane)` bytes of every row. Exposing a stride-inclusive
+//! `&[u8]` over an entire plane would let safe code observe those
+//! uninitialized padding bytes, violating the `slice::from_raw_parts` contract.
+//! Per-row slices are tightly clipped to the visible byte width so the
+//! safe API never hands out an uninitialized byte. Callers that need a
+//! single base pointer (e.g. SIMD pixel converters keyed off stride) can
+//! reach for [`Frame::as_ptr`] and consume `stride * plane_h` bytes
+//! themselves under their own `unsafe` contract.
 //!
 //! Compare formats against integer constants in [`crate::pix_fmt`].
 
@@ -114,59 +127,138 @@ impl Frame {
     linesize as usize
   }
 
-  /// Pixel data for `plane`.
+  /// Visible byte width of `plane` — the number of initialized bytes at
+  /// the start of every row in that plane.
+  ///
+  /// Distinct from [`Self::stride`], which returns the FFmpeg `linesize`.
+  /// `linesize` is `>= row_bytes` and may include trailing alignment
+  /// padding bytes that FFmpeg's hardware transfer paths do not
+  /// initialize. `row_bytes` is what `slice::from_raw_parts` can safely
+  /// see.
+  ///
+  /// Returns `None` when the format is not in the supported HW-output set
+  /// (see [`crate::pix_fmt`]) or the plane is out of range.
+  pub fn row_bytes(&self, plane: usize) -> Option<usize> {
+    if plane >= self.planes() {
+      return None;
+    }
+    plane_row_bytes_for(self.pix_fmt(), plane, self.width() as usize)
+  }
+
+  /// Pixel data for one row of `plane`, tightly clipped to the visible
+  /// byte width ([`Self::row_bytes`]).
+  ///
+  /// Excludes the trailing alignment padding that [`Self::stride`]
+  /// includes — those bytes are not guaranteed to be initialized by
+  /// FFmpeg's hardware transfer paths and must not be exposed through a
+  /// safe `&[u8]`.
   ///
   /// Returns `None` for any of the following — never panics:
-  /// - The frame's pixel format is not one of the hardware-output formats
-  ///   listed in [`crate::pix_fmt`] (we cannot safely compute the plane
-  ///   size for an unknown layout).
+  /// - The frame's pixel format is not one of the supported
+  ///   hardware-output formats listed in [`crate::pix_fmt`].
   /// - The plane index is out of range.
-  /// - `AVFrame.linesize[plane]` is `<= 0` (negative linesize signals
-  ///   vertically-flipped FFmpeg layouts which we do not surface; zero is
-  ///   "no plane").
-  /// - `AVFrame.height` is `<= 0`.
-  /// - The computed slice length would overflow or exceed `isize::MAX`
-  ///   (a precondition of [`std::slice::from_raw_parts`]).
+  /// - `y` is past the plane's row count.
+  /// - `AVFrame.linesize[plane]` is `<= 0` or `AVFrame.height` is `<= 0`.
   /// - The plane's data pointer is null.
+  /// - The plane size overflows or exceeds `isize::MAX`, or `row_bytes` exceeds the stride.
+  pub fn row(&self, plane: usize, y: usize) -> Option<&[u8]> {
+    let info = self.plane_info(plane)?;
+    if y >= info.plane_h {
+      return None;
+    }
+    // y < plane_h and plane_h * stride ≤ isize::MAX (verified in plane_info),
+    // so y * stride is bounded by (plane_h - 1) * stride ≤ isize::MAX.
+    let offset = y * info.stride;
+    // SAFETY:
+    // - `info.plane_ptr` is non-null (verified in plane_info).
+    // - `offset + row_bytes ≤ plane_h * stride`, which is the size of the
+    //   FFmpeg allocation for this plane.
+    // - Bytes 0..row_bytes of every row are written by FFmpeg's HW
+    //   transfer; the slice is fully initialized.
+    // - `row_bytes ≤ stride ≤ isize::MAX` per plane_info.
+    unsafe {
+      let row_ptr = info.plane_ptr.add(offset);
+      Some(slice::from_raw_parts(row_ptr, info.row_bytes))
+    }
+  }
+
+  /// Iterator over every row of `plane`. Each yielded slice has length
+  /// [`Self::row_bytes`]`(plane)` — never includes the trailing alignment
+  /// padding that lives within [`Self::stride`].
+  ///
+  /// Returns `None` under the same conditions as [`Self::row`].
+  pub fn rows(&self, plane: usize) -> Option<impl Iterator<Item = &[u8]> + '_> {
+    let info = self.plane_info(plane)?;
+    Some((0..info.plane_h).map(move |y| {
+      // Same bounds argument as `row()`.
+      let offset = y * info.stride;
+      // SAFETY: see `row()` — the same invariants hold here, and the
+      // iterator's lifetime is tied to `&self` so the pointer remains
+      // valid for every yielded slice.
+      unsafe { slice::from_raw_parts(info.plane_ptr.add(offset), info.row_bytes) }
+    }))
+  }
+
+  /// Raw base pointer to `plane`'s allocation, or `None` if the plane is
+  /// out of range or its data pointer is null.
+  ///
+  /// The returned pointer is valid for `stride(plane) * plane_height`
+  /// bytes, **but only the first [`Self::row_bytes`]`(plane)` bytes of
+  /// each row are guaranteed to be initialized.** The trailing per-row
+  /// alignment padding is uninitialized; callers performing wide SIMD
+  /// loads that read past `row_bytes` must mask the result and never
+  /// surface those bytes through a safe `&[u8]`.
   ///
-  /// Currently supported (post-`av_hwframe_transfer_data`):
-  /// - 4:2:0 semi-planar 8-bit: `NV12`, `NV21`
-  /// - 4:2:2 semi-planar 8-bit: `NV16`
-  /// - 4:4:4 semi-planar 8-bit: `NV24`
-  /// - 4:2:0 semi-planar 10/12/16-bit: `P010LE`/`P010BE`/`P012LE`/`P016LE`
-  /// - 4:2:2 semi-planar 10/12/16-bit: `P210LE`/`P212LE`/`P216LE`
-  /// - 4:4:4 semi-planar 10/12/16-bit: `P410LE`/`P412LE`/`P416LE`
-  pub fn data(&self, plane: usize) -> Option<&[u8]> {
+  /// This accessor exists for downstream pixel-format converters
+  /// (`colconv`) that work in `(ptr, stride, width, height)` quadruples;
+  /// safe code should prefer [`Self::row`] / [`Self::rows`].
+  pub fn as_ptr(&self, plane: usize) -> Option<*const u8> {
     if plane >= self.planes() {
       return None;
     }
+    // SAFETY: plane index bounds-checked; AVFrame.data is `[*mut u8; 8]`.
+    let p = unsafe { (*self.inner.as_ptr()).data[plane] };
+    if p.is_null() {
+      None
+    } else {
+      Some(p)
+    }
+  }
 
-    // SAFETY: bounds-checked plane index; `linesize` and `height` are
-    // primitive c_int reads that cannot themselves be UB.
-    let linesize: i32 = unsafe { (*self.inner.as_ptr()).linesize[plane] };
-    let height_int: i32 = unsafe { (*self.inner.as_ptr()).height };
-    if linesize <= 0 || height_int <= 0 {
+  /// Read every per-plane field needed by the row accessors with the
+  /// safety preconditions enforced once.
+  fn plane_info(&self, plane: usize) -> Option<PlaneInfo> {
+    if plane >= self.planes() {
       return None;
     }
-    let stride = linesize as usize;
-
-    let plane_height = plane_height_for(self.pix_fmt(), plane, height_int as usize)?;
-    let len = stride.checked_mul(plane_height)?;
-    if len > isize::MAX as usize {
+    // SAFETY: bounds-checked plane index; linesize/height/data are raw
+    // c_int / pointer reads that cannot themselves be UB.
+    let (stride_int, height_int, plane_ptr) = unsafe {
+      let raw = self.inner.as_ptr();
+      ((*raw).linesize[plane], (*raw).height, (*raw).data[plane])
+    };
+    if stride_int <= 0 || height_int <= 0 || plane_ptr.is_null() {
       return None;
     }
-
-    // SAFETY: linesize > 0 and height > 0 verified; len <= isize::MAX
-    // verified — both preconditions of `slice::from_raw_parts`. We trust
-    // FFmpeg to populate `data[plane]` validly when linesize[plane] is
-    // non-zero; the null check is a final defensive guard.
-    unsafe {
-      let ptr = (*self.inner.as_ptr()).data[plane];
-      if ptr.is_null() {
-        return None;
-      }
-      Some(slice::from_raw_parts(ptr, len))
+    let stride = stride_int as usize;
+    let plane_h = plane_height_for(self.pix_fmt(), plane, height_int as usize)?;
+    let row_bytes = plane_row_bytes_for(self.pix_fmt(), plane, self.width() as usize)?;
+    if row_bytes > stride {
+      return None;
     }
+    // Bound the entire plane allocation to isize::MAX so any byte offset
+    // computed as `y * stride` (y < plane_h) stays representable, satisfying
+    // the safety contract of `pointer::add` and `slice::from_raw_parts`.
+    let plane_size = stride.checked_mul(plane_h)?;
+    if plane_size > isize::MAX as usize {
+      return None;
+    }
+    Some(PlaneInfo {
+      plane_ptr,
+      stride,
+      plane_h,
+      row_bytes,
+    })
   }
 
   /// Crate-internal: hand the wrapped frame to FFmpeg / our decoder code.
@@ -175,10 +267,68 @@ impl Frame {
   }
 }
 
+#[derive(Clone, Copy)]
+struct PlaneInfo {
+  plane_ptr: *const u8,
+  stride: usize,
+  plane_h: usize,
+  row_bytes: usize,
+}
+
 // `Default` intentionally omitted: constructing a frame can fail (OOM
 // in `av_frame_alloc`), and a panicking `default()` would defeat the
 // safety stance of [`Frame::empty`]. Use `Frame::empty()?` directly.
 
+/// Visible byte width of `plane`'s rows for a frame of `frame_width` and
+/// the given pixel format. `None` for formats not in the supported
+/// HW-output set.
+///
+/// Distinct from `linesize` (FFmpeg's per-row stride, which may include
+/// alignment padding). HW transfer paths only initialize bytes
+/// `0..plane_row_bytes_for(...)` of each row; everything from there to
+/// `stride` is uninitialized padding and must not be exposed via
+/// `slice::from_raw_parts`.
+fn plane_row_bytes_for(pix_fmt_int: i32, plane: usize, frame_width: usize) -> Option<usize> {
+  match pix_fmt_int {
+    // 8-bit semi-planar: Y at full width (1 byte/sample), UV interleaved
+    // at horizontally-subsampled chroma (4:2:0 / 4:2:2) with 2 bytes per
+    // chroma pair → both planes have row width == frame_width.
+    pix_fmt::NV12 | pix_fmt::NV21 | pix_fmt::NV16 => match plane {
+      0 | 1 => Some(frame_width),
+      _ => None,
+    },
+    // 8-bit 4:4:4 semi-planar: chroma at full horizontal resolution,
+    // 2 bytes per pixel (1 byte U + 1 byte V).
+    pix_fmt::NV24 => match plane {
+      0 => Some(frame_width),
+      1 => Some(frame_width.checked_mul(2)?),
+      _ => None,
+    },
+    // 10/12/16-bit semi-planar 4:2:0 / 4:2:2: Y is 2 bytes/sample
+    // (high-bit-depth packed in 16-bit). UV interleaved at horizontally-
+    // subsampled chroma with 4 bytes per chroma pair (2 bytes U + 2 bytes
+    // V) → both planes have row width == 2 * frame_width.
+    pix_fmt::P010LE
+    | pix_fmt::P010BE
+    | pix_fmt::P012LE
+    | pix_fmt::P016LE
+    | pix_fmt::P210LE
+    | pix_fmt::P212LE
+    | pix_fmt::P216LE => match plane {
+      0 | 1 => Some(frame_width.checked_mul(2)?),
+      _ => None,
+    },
+    // 10/12/16-bit 4:4:4 semi-planar: Y is 2 bytes/sample; UV at full
+    // horizontal resolution with 4 bytes per pixel (2 bytes U + 2 bytes V).
+    pix_fmt::P410LE | pix_fmt::P412LE | pix_fmt::P416LE => match plane {
+      0 => Some(frame_width.checked_mul(2)?),
+      1 => Some(frame_width.checked_mul(4)?),
+      _ => None,
+    },
+    _ => None,
+  }
+}
+
 /// Number of rows in `plane` for a frame of `frame_height` and the given
 /// pixel format. `None` for formats not in the supported HW-output set.
 ///
@@ -235,18 +385,20 @@ mod tests {
   }
 
   #[test]
-  fn data_returns_none_for_unknown_format() {
+  fn row_returns_none_for_unknown_format() {
     let f = Frame::empty().expect("alloc");
     // pix_fmt is NONE (-1), not in the supported set.
-    assert!(f.data(0).is_none());
+    assert!(f.row(0, 0).is_none());
+    assert!(f.rows(0).is_none());
+    assert!(f.row_bytes(0).is_none());
   }
 
   /// Synthesize a frame with a negative linesize (FFmpeg's vertical-flip
-  /// convention) and assert `data()` refuses to construct a slice. Without
-  /// the linesize > 0 check, the negative `i32 as usize` would produce a
-  /// huge positive length and `from_raw_parts` would be UB.
+  /// convention) and assert the row accessors refuse to construct a slice.
+  /// Without the linesize > 0 check, the negative `i32 as usize` would
+  /// produce a huge positive length and `from_raw_parts` would be UB.
   #[test]
-  fn data_returns_none_for_negative_linesize() {
+  fn row_returns_none_for_negative_linesize() {
     let mut f = Frame::empty().expect("alloc");
     unsafe {
       let raw = f.inner.as_mut_ptr();
@@ -255,15 +407,16 @@ mod tests {
       (*raw).height = 1080;
       (*raw).linesize[0] = -1920; // vertically-flipped
       (*raw).linesize[1] = -1920;
-      // data pointers stay null; `data()` would return None on the null
-      // check anyway, but should bail earlier on the linesize sign.
+      // data pointers stay null; the accessors would also reject on null,
+      // but should bail earlier on the linesize sign.
     }
-    assert!(f.data(0).is_none());
-    assert!(f.data(1).is_none());
+    assert!(f.row(0, 0).is_none());
+    assert!(f.row(1, 0).is_none());
+    assert!(f.rows(0).is_none());
   }
 
   #[test]
-  fn data_returns_none_for_non_positive_height() {
+  fn row_returns_none_for_non_positive_height() {
     let mut f = Frame::empty().expect("alloc");
     unsafe {
       let raw = f.inner.as_mut_ptr();
@@ -273,7 +426,81 @@ mod tests {
       (*raw).linesize[0] = 1920;
       (*raw).linesize[1] = 1920;
     }
-    assert!(f.data(0).is_none());
+    assert!(f.row(0, 0).is_none());
+  }
+
+  /// Synthesize a frame backed by a manually-allocated buffer with stride
+  /// strictly larger than visible row bytes (the exact case where
+  /// FFmpeg's HW transfer leaves trailing padding uninitialized) and
+  /// confirm the safe row accessor returns slices clipped to the visible
+  /// width.
+  #[test]
+  fn row_clips_to_visible_width_not_stride() {
+    use std::alloc::{alloc, dealloc, Layout};
+    let width = 64usize;
+    let height = 4usize;
+    // Stride > width: 16 bytes of padding per row in the Y plane.
+    let stride = 80usize;
+    let plane_size = stride * height;
+    // Allocate ourselves so we can fully control initialization. Fill
+    // bytes 0..width with 0xAA per row (the "valid pixel" range) and
+    // bytes width..stride with 0xFF (the simulated alignment padding —
+    // FFmpeg would leave these uninitialized; we set them to a sentinel
+    // that the test can detect if the safe slice ever exposes them).
+    let layout = Layout::from_size_align(plane_size, 32).unwrap();
+    let buf = unsafe { alloc(layout) };
+    assert!(!buf.is_null());
+    for y in 0..height {
+      let row = unsafe { buf.add(y * stride) };
+      for x in 0..width {
+        unsafe { *row.add(x) = 0xAA };
+      }
+      for x in width..stride {
+        unsafe { *row.add(x) = 0xFF };
+      }
+    }
+
+    let mut f = Frame::empty().expect("alloc");
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).format = pix_fmt::NV12;
+      (*raw).width = width as i32;
+      (*raw).height = height as i32;
+      (*raw).linesize[0] = stride as i32;
+      // linesize[1] = 0 keeps planes() at 1 so the test stays focused on
+      // plane 0 without owning a second allocation.
+      (*raw).data[0] = buf;
+    }
+
+    assert_eq!(f.row_bytes(0), Some(width));
+    assert_eq!(f.stride(0), stride);
+    let row0 = f.row(0, 0).expect("row 0");
+    assert_eq!(
+      row0.len(),
+      width,
+      "safe row must be clipped to visible width"
+    );
+    assert!(
+      row0.iter().all(|&b| b == 0xAA),
+      "row must not include padding sentinel 0xFF"
+    );
+
+    let collected: Vec<&[u8]> = f.rows(0).expect("rows iterator").collect();
+    assert_eq!(collected.len(), height);
+    for r in &collected {
+      assert_eq!(r.len(), width);
+      assert!(r.iter().all(|&b| b == 0xAA));
+    }
+
+    // Out-of-range row index returns None instead of panicking.
+    assert!(f.row(0, height).is_none());
+
+    // Detach the buffer before drop so AVFrame's own free path doesn't
+    // touch our manual allocation.
+    unsafe {
+      (*f.inner.as_mut_ptr()).data[0] = std::ptr::null_mut();
+      dealloc(buf, layout);
+    }
   }
 
   #[test]
@@ -306,4 +533,27 @@ mod tests {
     assert_eq!(plane_height_for(pix_fmt::NONE, 0, 1080), None);
     assert_eq!(plane_height_for(pix_fmt::NV12, 2, 1080), None);
   }
+
+  #[test]
+  fn plane_row_bytes_table_covers_supported_formats() {
+    // 8-bit 4:2:0 / 4:2:2 — both planes at width.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 0, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 1, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV21, 1, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV16, 1, 1920), Some(1920));
+    // 8-bit 4:4:4 — chroma plane is 2 * width.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV24, 0, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV24, 1, 1920), Some(3840));
+    // 10/12/16-bit 4:2:0 / 4:2:2 — both planes at 2 * width.
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 0, 1920), Some(3840));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 1, 1920), Some(3840));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P210LE, 1, 1920), Some(3840));
+    // 10/12/16-bit 4:4:4 — Y is 2 * width, chroma is 4 * width.
+    assert_eq!(plane_row_bytes_for(pix_fmt::P410LE, 0, 1920), Some(3840));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P410LE, 1, 1920), Some(7680));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P416LE, 1, 1920), Some(7680));
+    // Unsupported / out-of-range.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NONE, 0, 1920), None);
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 2, 1920), None);
+  }
 }
diff --git a/src/lib.rs b/src/lib.rs
index b487132..3654016 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,8 +4,14 @@
 //! (`send_packet`/`receive_frame`/`send_eof`/`flush`) and auto-probes the
 //! host's hardware backends (VideoToolbox / VAAPI / NVDEC / D3D11VA).
 //! There is **no software fallback inside this crate** — if no hardware
-//! backend can decode the stream, [`VideoDecoder::open`] returns
-//! [`Error::AllBackendsFailed`] and the caller picks how to fall back
+//! backend can decode the stream, [`Error::AllBackendsFailed`] surfaces
+//! either from [`VideoDecoder::open`] (when no backend even opens) or
+//! from [`VideoDecoder::receive_frame`] / [`VideoDecoder::send_packet`] /
+//! [`VideoDecoder::send_eof`] (when the initially-opened backend or any
+//! later candidate fails at decode time and the probe order is
+//! exhausted). On single-backend platforms (e.g. macOS, where the order
+//! is `[VideoToolbox]`), only the runtime path can return it. The
+//! caller picks how to fall back to a software decoder of their choice
 //! (e.g. by opening an `ffmpeg::decoder::Video` directly).
 //!
 //! Output frames returned by [`VideoDecoder::receive_frame`] are CPU-side

From b61c76a3badbf1a8458a0d392fdcb32f3ad3bf38 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 11:25:54 +1200
Subject: [PATCH 19/27] update

---
 src/decoder.rs | 84 +++++++++++++++++++++++++++++++++++++++++++++++++-
 src/frame.rs   | 55 ++++++++++++++++++++++++++++-----
 2 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index 2fe4f08..87fe847 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -387,7 +387,12 @@ impl VideoDecoder {
       match self.state.inner.send_packet(packet) {
         Ok(()) => {
           if let Some(probe) = self.probe.as_mut() {
-            let pkt_size = packet.size();
+            // `try_clone_packet` calls `av_packet_ref`, which deep-copies
+            // side data via `av_packet_copy_props`. The probe budget must
+            // include side-data bytes or a malicious stream can keep
+            // `packet.size()` tiny while attaching megabytes of side data
+            // per packet and inflate retention beyond the advertised cap.
+            let pkt_size = packet.size().saturating_add(packet_side_data_bytes(packet));
             let new_count = probe.buffered_packets.len() + 1;
             let new_bytes = probe.buffered_bytes.saturating_add(pkt_size);
             if new_count > MAX_PROBE_PACKETS || new_bytes > MAX_PROBE_PACKET_BYTES {
@@ -1093,6 +1098,38 @@ fn try_clone_packet(src: &Packet) -> std::result::Result<Packet, ffmpeg_next::Er
   Ok(dst)
 }
 
+/// Sum of `AVPacket.side_data[i].size` across every entry. `av_packet_ref`
+/// performs a deep copy of side data via `av_packet_copy_props`, so each
+/// probe-buffered clone retains its own copy of every side-data byte.
+/// The probe budget must include this so a stream that keeps payload
+/// small while attaching arbitrarily large side data per packet cannot
+/// blow past `MAX_PROBE_PACKET_BYTES`.
+///
+/// Reads only the `size` field of each `AVPacketSideData` entry — never
+/// touches the bindgen `AVPacketSideDataType` enum, so no UB even if a
+/// future FFmpeg adds a side-data type discriminant our build doesn't
+/// know.
+fn packet_side_data_bytes(packet: &Packet) -> usize {
+  // SAFETY: AVPacket.side_data is `*mut AVPacketSideData` and
+  // side_data_elems is `c_int`; both are raw struct fields safe to read.
+  // Field projection (`.size`) does not reconstruct the enum-typed `type_`
+  // field, so the bindgen-enum UB hazard does not apply here.
+  unsafe {
+    let raw = packet.as_ptr();
+    let nel = (*raw).side_data_elems;
+    let arr = (*raw).side_data;
+    if arr.is_null() || nel <= 0 {
+      return 0;
+    }
+    let mut total: usize = 0;
+    for i in 0..(nel as usize) {
+      let entry = arr.add(i);
+      total = total.saturating_add((*entry).size);
+    }
+    total
+  }
+}
+
 /// Just `EAGAIN` (separate from EOF — the FFmpeg send/receive state machine
 /// distinguishes "drain output and retry" from "stream over").
 fn is_eagain(e: &ffmpeg_next::Error) -> bool {
@@ -1324,4 +1361,49 @@ mod tests {
       Err(other) => panic!("expected Ffmpeg(Other {{ ENOMEM }}), got {other:?}"),
     }
   }
+
+  /// `try_clone_packet` calls `av_packet_ref`, which deep-copies side
+  /// data via `av_packet_copy_props`. The probe budget therefore has to
+  /// include side-data bytes — otherwise a stream with a 16-byte payload
+  /// and a 1 MiB side-data attachment would only consume 16 bytes of the
+  /// 64 MiB budget per packet, and 256 buffered clones would retain
+  /// ~256 MiB of side data while logs claim a few KiB.
+  #[test]
+  fn packet_side_data_counts_against_probe_budget() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    const PAYLOAD_SIZE: usize = 16;
+    const SIDE_DATA_SIZE: usize = 1024 * 1024; // 1 MiB
+
+    let mut packet = Packet::new(PAYLOAD_SIZE);
+    // SAFETY: packet is a freshly allocated AVPacket; av_packet_new_side_data
+    // attaches a fresh `SIDE_DATA_SIZE`-byte buffer of the requested type
+    // to it and returns a writable pointer (or NULL on OOM).
+    let p = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA,
+        SIDE_DATA_SIZE,
+      )
+    };
+    assert!(!p.is_null(), "av_packet_new_side_data returned NULL");
+
+    assert_eq!(packet.size(), PAYLOAD_SIZE);
+    let side = packet_side_data_bytes(&packet);
+    assert!(
+      side >= SIDE_DATA_SIZE,
+      "side-data accounting must include the attached buffer; got {side}"
+    );
+    let total = packet.size().saturating_add(side);
+    assert!(
+      total >= PAYLOAD_SIZE + SIDE_DATA_SIZE,
+      "probe budget must charge payload + side data; got {total}"
+    );
+  }
+
+  #[test]
+  fn packet_side_data_is_zero_when_no_side_data() {
+    let packet = Packet::new(64);
+    assert_eq!(packet_side_data_bytes(&packet), 0);
+  }
 }
diff --git a/src/frame.rs b/src/frame.rs
index 3f48075..9b651af 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -290,15 +290,19 @@ struct PlaneInfo {
 /// `slice::from_raw_parts`.
 fn plane_row_bytes_for(pix_fmt_int: i32, plane: usize, frame_width: usize) -> Option<usize> {
   match pix_fmt_int {
-    // 8-bit semi-planar: Y at full width (1 byte/sample), UV interleaved
-    // at horizontally-subsampled chroma (4:2:0 / 4:2:2) with 2 bytes per
-    // chroma pair → both planes have row width == frame_width.
+    // 8-bit semi-planar 4:2:0 / 4:2:2: Y at full width (1 byte/sample);
+    // UV interleaved at horizontally-subsampled chroma with `ceil(W/2)`
+    // U+V pairs at 2 bytes per pair. For even W the chroma row equals
+    // `W` bytes (the simple case); for odd W it must round *up* to the
+    // next even byte count (`W + 1`) so the trailing chroma sample is
+    // not silently dropped on width = 2k+1 frames.
     pix_fmt::NV12 | pix_fmt::NV21 | pix_fmt::NV16 => match plane {
-      0 | 1 => Some(frame_width),
+      0 => Some(frame_width),
+      1 => Some(frame_width.div_ceil(2).checked_mul(2)?),
       _ => None,
     },
     // 8-bit 4:4:4 semi-planar: chroma at full horizontal resolution,
-    // 2 bytes per pixel (1 byte U + 1 byte V).
+    // 2 bytes per pixel (1 byte U + 1 byte V) — no rounding required.
     pix_fmt::NV24 => match plane {
       0 => Some(frame_width),
       1 => Some(frame_width.checked_mul(2)?),
@@ -306,8 +310,9 @@ fn plane_row_bytes_for(pix_fmt_int: i32, plane: usize, frame_width: usize) -> Op
     },
     // 10/12/16-bit semi-planar 4:2:0 / 4:2:2: Y is 2 bytes/sample
     // (high-bit-depth packed in 16-bit). UV interleaved at horizontally-
-    // subsampled chroma with 4 bytes per chroma pair (2 bytes U + 2 bytes
-    // V) → both planes have row width == 2 * frame_width.
+    // subsampled chroma with `ceil(W/2)` U+V pairs at 4 bytes per pair
+    // (2 bytes U + 2 bytes V). Same odd-width rounding as the 8-bit
+    // chroma path, scaled by 2 bytes per sample.
     pix_fmt::P010LE
     | pix_fmt::P010BE
     | pix_fmt::P012LE
@@ -315,7 +320,8 @@ fn plane_row_bytes_for(pix_fmt_int: i32, plane: usize, frame_width: usize) -> Op
     | pix_fmt::P210LE
     | pix_fmt::P212LE
     | pix_fmt::P216LE => match plane {
-      0 | 1 => Some(frame_width.checked_mul(2)?),
+      0 => Some(frame_width.checked_mul(2)?),
+      1 => Some(frame_width.div_ceil(2).checked_mul(4)?),
       _ => None,
     },
     // 10/12/16-bit 4:4:4 semi-planar: Y is 2 bytes/sample; UV at full
@@ -534,6 +540,39 @@ mod tests {
     assert_eq!(plane_height_for(pix_fmt::NV12, 2, 1080), None);
   }
 
+  /// 4:2:0 / 4:2:2 chroma planes carry `ceil(W/2)` U+V pairs per row.
+  /// For odd `W`, dropping the round-up silently truncates the last chroma
+  /// sample — and the safe row slice would expose a buffer one byte (8-bit)
+  /// or two bytes (high-bit-depth) shorter than the data FFmpeg actually
+  /// wrote. Y planes and 4:4:4 chroma planes are unaffected because their
+  /// row width in bytes is just `W` or a fixed multiple of `W`.
+  #[test]
+  fn plane_row_bytes_rounds_up_chroma_for_odd_widths() {
+    // 8-bit subsampled chroma — odd W gains one byte (completing the
+    // final U+V pair).
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 1, 1921), Some(1922));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV21, 1, 1921), Some(1922));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV16, 1, 1921), Some(1922));
+    // High-bit-depth subsampled chroma — odd W gains two bytes.
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010BE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P012LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P016LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P210LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P212LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P216LE, 1, 1921), Some(3844));
+    // Y planes always at full width regardless of subsampling.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 0, 1921), Some(1921));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 0, 1921), Some(3842));
+    // 4:4:4 chroma is at full horizontal resolution — no rounding.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV24, 1, 1921), Some(3842));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P410LE, 1, 1921), Some(7684));
+    // Even widths must still match the original (pre-fix) values so the
+    // change is purely additive on the dominant code path.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 1, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 1, 1920), Some(3840));
+  }
+
   #[test]
   fn plane_row_bytes_table_covers_supported_formats() {
     // 8-bit 4:2:0 / 4:2:2 — both planes at width.

From bedf83627acb7874cfe33b1e32cb225b0ab51b91 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 11:51:14 +1200
Subject: [PATCH 20/27] update

---
 src/decoder.rs | 84 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 5 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index 87fe847..b99e3f7 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -1268,10 +1268,17 @@ fn drain_into_pending(
 /// Approximate resident size of a CPU frame: sum of `linesize[plane] *
 /// plane_height` across populated planes.
 ///
-/// Returns `None` for pixel formats not in our chroma-subsampling table,
-/// so the caller can refuse to queue an allocation it can't account for.
-/// Returning 0 for unknown formats would silently bypass the byte cap and
-/// let an unbounded number of large frames into `pending_frames`.
+/// Returns `None` for pixel formats not in our chroma-subsampling table or
+/// for frames whose `linesize` is negative — both signal an allocation we
+/// cannot account for, so the caller refuses to queue them. Returning 0
+/// in either case would silently bypass the byte cap and let an unbounded
+/// number of large frames into `pending_frames`.
+///
+/// Distinguishes `linesize == 0` (FFmpeg's sentinel for "no more populated
+/// planes" — terminates the scan) from `linesize < 0` (FFmpeg's vertically-
+/// flipped layout — `Frame::row` rejects those as unusable, so queueing one
+/// during probe replay would only delay the failure to the consumer side
+/// while wasting `|linesize| * plane_h` bytes of unaccounted memory).
 fn cpu_frame_bytes(frame: &frame::Video) -> Option<usize> {
   // SAFETY: AVFrame.height / format / linesize are c_int reads.
   let (height, pix_fmt, linesizes) = unsafe {
@@ -1281,9 +1288,17 @@ fn cpu_frame_bytes(frame: &frame::Video) -> Option<usize> {
   let mut total: usize = 0;
   let mut any_plane = false;
   for (plane, linesize) in linesizes.iter().enumerate() {
-    if *linesize <= 0 {
+    if *linesize == 0 {
+      // End of populated planes — FFmpeg zeroes the trailing entries.
       break;
     }
+    if *linesize < 0 {
+      // Vertically-flipped layout — refuse to size so `drain_into_pending`
+      // fails the candidate. The same pre-fix code path silently returned
+      // `Some(0)` for a frame whose first plane was negative, allowing up
+      // to MAX_PROBE_PENDING_FRAMES frames of unaccounted memory.
+      return None;
+    }
     any_plane = true;
     let stride = *linesize as usize;
     // If we can't size *any* populated plane, the format is outside our
@@ -1406,4 +1421,63 @@ mod tests {
     let packet = Packet::new(64);
     assert_eq!(packet_side_data_bytes(&packet), 0);
   }
+
+  /// `cpu_frame_bytes` must refuse to size a frame whose first plane has
+  /// a negative `linesize`. Pre-fix, the loop break treated negative the
+  /// same as zero (FFmpeg's "no more populated planes" sentinel), so a
+  /// vertically-flipped frame returned `Some(0)` and `drain_into_pending`
+  /// would queue it as a 0-byte allocation — letting up to
+  /// `MAX_PROBE_PENDING_FRAMES` such frames bypass the configured byte
+  /// budget entirely.
+  #[test]
+  fn cpu_frame_bytes_rejects_negative_first_plane_linesize() {
+    let mut f = frame::Video::empty();
+    // SAFETY: f is freshly allocated; we set `format` to NV12 and the
+    // first plane's linesize negative (FFmpeg's vertical-flip convention).
+    // No backing data buffer is allocated — cpu_frame_bytes must reject
+    // before any pointer dereference.
+    unsafe {
+      let raw = f.as_mut_ptr();
+      (*raw).format = crate::pix_fmt::NV12;
+      (*raw).width = 1920;
+      (*raw).height = 1080;
+      (*raw).linesize[0] = -1920;
+      (*raw).linesize[1] = -1920;
+    }
+    assert!(
+      cpu_frame_bytes(&f).is_none(),
+      "negative linesize must be unsizeable, not Some(0)"
+    );
+  }
+
+  /// Sanity-check the positive path: a synthesized NV12 frame with valid
+  /// linesizes must report the sum across populated planes (Y at full
+  /// height plus UV at half height).
+  #[test]
+  fn cpu_frame_bytes_sums_populated_planes() {
+    let mut f = frame::Video::empty();
+    let stride = 1920usize;
+    let height = 1080usize;
+    // SAFETY: same scheme as above; we only mutate primitive struct fields.
+    unsafe {
+      let raw = f.as_mut_ptr();
+      (*raw).format = crate::pix_fmt::NV12;
+      (*raw).width = 1920;
+      (*raw).height = height as i32;
+      (*raw).linesize[0] = stride as i32;
+      (*raw).linesize[1] = stride as i32;
+    }
+    let expected = stride * height + stride * (height / 2);
+    assert_eq!(cpu_frame_bytes(&f), Some(expected));
+  }
+
+  /// A frame whose plane-0 linesize is zero has no populated planes and
+  /// must return `Some(0)`, not `None`. This distinguishes FFmpeg's
+  /// end-of-planes sentinel from the vertically-flipped layout.
+  #[test]
+  fn cpu_frame_bytes_zero_first_plane_returns_zero() {
+    let f = frame::Video::empty();
+    // Default-allocated empty AVFrame already has all linesizes zero.
+    assert_eq!(cpu_frame_bytes(&f), Some(0));
+  }
 }

From 5447670bf89d3b693fdcb856a3376bda12cbabe5 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 12:37:20 +1200
Subject: [PATCH 21/27] update

---
 src/decoder.rs | 97 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 2 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index b99e3f7..1783924 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -404,8 +404,14 @@ impl VideoDecoder {
                 "hwdecode: probe window exceeded caps without first frame; \
                  abandoning fallback safety net"
               );
+              // Abandon the *future* probe-buffering only. `pending_frames`
+              // belong to the currently active backend (possibly the
+              // candidate `advance_probe` committed earlier in this same
+              // `send_packet` call) and are valid output the caller will
+              // dequeue via `receive_frame`. Clearing them here would
+              // silently drop initial frames at exactly the cap-overflow /
+              // OOM-stress paths.
               self.probe = None;
-              self.pending_frames.clear();
             } else {
               // Use the checked clone — ffmpeg-next's `Packet::clone`
               // discards av_packet_ref's return value and would silently
@@ -420,8 +426,10 @@ impl VideoDecoder {
                     error = %e,
                     "hwdecode: packet clone failed for probe history; abandoning fallback safety net"
                   );
+                  // Same reasoning as the cap-overflow branch above:
+                  // `pending_frames` are owned by the active backend, not
+                  // the probe buffer, so they survive abandonment.
                   self.probe = None;
-                  self.pending_frames.clear();
                 }
               }
             }
@@ -1480,4 +1488,89 @@ mod tests {
     // Default-allocated empty AVFrame already has all linesizes zero.
     assert_eq!(cpu_frame_bytes(&f), Some(0));
   }
+
+  /// Probe-abandon paths in `send_packet` (cap exceeded, packet clone
+  /// failed) must not drop frames already queued in `pending_frames`.
+  /// Those frames belong to the currently active backend — possibly a
+  /// candidate that `advance_probe` just committed earlier in the same
+  /// `send_packet` call — and are valid output the caller will dequeue
+  /// via `receive_frame`.
+  ///
+  /// Pre-fix, both abandon branches called `pending_frames.clear()`
+  /// alongside `self.probe = None;`, silently dropping initial frames at
+  /// exactly the cap-overflow / OOM-stress paths.
+  ///
+  /// Live HW required: a real `VideoDecoder` is the only way to construct
+  /// a valid `DecoderState` (its `Drop` invokes FFmpeg cleanup), and
+  /// `send_packet` must reach the Ok branch on a real decoder for the
+  /// cap check to fire.
+  #[test]
+  #[ignore = "requires HWDECODE_SAMPLE_VIDEO and a working hardware backend"]
+  fn cap_overflow_preserves_pending_frames_from_active_backend() {
+    use ffmpeg_next::{format, media};
+
+    let path = std::env::var_os("HWDECODE_SAMPLE_VIDEO")
+      .expect("HWDECODE_SAMPLE_VIDEO must be set for this test");
+
+    ffmpeg_next::init().expect("ffmpeg init");
+    let mut input = format::input(&path).expect("open input");
+    let stream_index = input
+      .streams()
+      .best(media::Type::Video)
+      .expect("video stream")
+      .index();
+    let stream_params = input
+      .streams()
+      .best(media::Type::Video)
+      .expect("video stream")
+      .parameters();
+
+    let mut decoder = VideoDecoder::open(stream_params).expect("open decoder");
+    assert!(
+      decoder.probe.is_some(),
+      "probe must be active immediately after open"
+    );
+
+    // Inject sentinel frames as if `advance_probe` had drained them from
+    // a freshly-committed candidate during this same send_packet call.
+    decoder.pending_frames.push_back(frame::Video::empty());
+    decoder.pending_frames.push_back(frame::Video::empty());
+    let pending_before = decoder.pending_frames.len();
+
+    // Fast-forward the probe state to the byte cap so the next successful
+    // send_packet trips the cap-overflow branch.
+    decoder
+      .probe
+      .as_mut()
+      .expect("probe present")
+      .buffered_bytes = MAX_PROBE_PACKET_BYTES;
+
+    // Find the first video packet and feed it. We don't care whether the
+    // underlying decoder actually accepts it cleanly; we only need to
+    // exercise the Ok branch's cap-overflow accounting at least once.
+    let mut hit_ok = false;
+    for (s, packet) in input.packets() {
+      if s.index() != stream_index {
+        continue;
+      }
+      if decoder.send_packet(&packet).is_ok() {
+        hit_ok = true;
+        break;
+      }
+    }
+    assert!(
+      hit_ok,
+      "expected at least one send_packet to succeed and trigger the cap-overflow branch"
+    );
+
+    assert!(
+      decoder.probe.is_none(),
+      "probe must be abandoned after cap overflow"
+    );
+    assert_eq!(
+      decoder.pending_frames.len(),
+      pending_before,
+      "pending_frames belong to the active backend; abandon must not drop them"
+    );
+  }
 }

From 88a84d49aa1b7631c8162a507c82eceeede182ed Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 12:59:22 +1200
Subject: [PATCH 22/27] update

---
 src/frame.rs | 68 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 18 deletions(-)

diff --git a/src/frame.rs b/src/frame.rs
index 9b651af..4642184 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -199,30 +199,42 @@ impl Frame {
     }))
   }
 
-  /// Raw base pointer to `plane`'s allocation, or `None` if the plane is
-  /// out of range or its data pointer is null.
+  /// Raw base pointer to `plane`'s allocation, or `None` if the plane
+  /// fails the same layout validation [`Self::row`] applies.
   ///
-  /// The returned pointer is valid for `stride(plane) * plane_height`
-  /// bytes, **but only the first [`Self::row_bytes`]`(plane)` bytes of
-  /// each row are guaranteed to be initialized.** The trailing per-row
-  /// alignment padding is uninitialized; callers performing wide SIMD
-  /// loads that read past `row_bytes` must mask the result and never
-  /// surface those bytes through a safe `&[u8]`.
+  /// Returns `None` whenever any of the following is true:
+  /// - The plane index is out of range (`plane >= planes()`).
+  /// - The frame's pixel format is not in the supported HW-output set.
+  /// - `linesize[plane] <= 0`. **In particular, FFmpeg permits negative
+  ///   linesizes for vertically-flipped frames with `data[n]` pointing
+  ///   at the *end* of the image. Returning that pointer with the
+  ///   advertised "valid for `stride * plane_h` bytes forward" contract
+  ///   would let a downstream converter walk past the buffer.** This
+  ///   accessor refuses the layout instead of handing back a pointer the
+  ///   caller cannot safely interpret as forward-addressable.
+  /// - `height <= 0`, the data pointer is null, `row_bytes > stride`, or
+  ///   the total plane size would exceed `isize::MAX`.
+  ///
+  /// On `Some(ptr)` the pointer is valid for
+  /// `stride(plane) * plane_height` *forward-addressable* bytes, and
+  /// only the first [`Self::row_bytes`]`(plane)` bytes of each row are
+  /// guaranteed to be initialized. The trailing per-row alignment padding
+  /// is uninitialized; callers performing wide SIMD loads that read past
+  /// `row_bytes` must mask the result and never surface those bytes
+  /// through a safe `&[u8]`.
   ///
   /// This accessor exists for downstream pixel-format converters
   /// (`colconv`) that work in `(ptr, stride, width, height)` quadruples;
   /// safe code should prefer [`Self::row`] / [`Self::rows`].
   pub fn as_ptr(&self, plane: usize) -> Option<*const u8> {
-    if plane >= self.planes() {
-      return None;
-    }
-    // SAFETY: plane index bounds-checked; AVFrame.data is `[*mut u8; 8]`.
-    let p = unsafe { (*self.inner.as_ptr()).data[plane] };
-    if p.is_null() {
-      None
-    } else {
-      Some(p)
-    }
+    // Share the full plane-layout validation so the unsafe escape hatch
+    // never exposes a layout that `row()` / `rows()` reject. Returning a
+    // pointer for a negative-stride frame (FFmpeg's vertical-flip
+    // convention, where `data[n]` points at the *end* of the image)
+    // would invite forward-walking out-of-bounds reads from a caller
+    // that trusts the documented "valid for stride × plane_h bytes"
+    // contract.
+    self.plane_info(plane).map(|info| info.plane_ptr)
   }
 
   /// Read every per-plane field needed by the row accessors with the
@@ -403,6 +415,11 @@ mod tests {
   /// convention) and assert the row accessors refuse to construct a slice.
   /// Without the linesize > 0 check, the negative `i32 as usize` would
   /// produce a huge positive length and `from_raw_parts` would be UB.
+  ///
+  /// `as_ptr` shares the same validation — handing back the data pointer
+  /// for a negative-stride frame would let a downstream converter
+  /// following the "valid for stride × plane_h bytes forward" contract
+  /// walk past the buffer.
   #[test]
   fn row_returns_none_for_negative_linesize() {
     let mut f = Frame::empty().expect("alloc");
@@ -419,6 +436,12 @@ mod tests {
     assert!(f.row(0, 0).is_none());
     assert!(f.row(1, 0).is_none());
     assert!(f.rows(0).is_none());
+    assert!(
+      f.as_ptr(0).is_none(),
+      "as_ptr must share row()/rows() validation — a negative-stride \
+       frame must not leak a forward-readable plane pointer"
+    );
+    assert!(f.as_ptr(1).is_none());
   }
 
   #[test]
@@ -498,6 +521,15 @@ mod tests {
       assert!(r.iter().all(|&b| b == 0xAA));
     }
 
+    // `as_ptr` accepts the valid layout and returns the same base pointer
+    // FFmpeg wrote into `data[0]`, so SIMD callers can reach the plane
+    // through the documented unsafe contract.
+    assert_eq!(
+      f.as_ptr(0),
+      Some(buf as *const u8),
+      "as_ptr must surface the plane base for a valid forward-stride frame"
+    );
+
     // Out-of-range row index returns None instead of panicking.
     assert!(f.row(0, height).is_none());
 

From 3e10b96091961005b808a37c9ad97682742b5920 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 15:21:16 +1200
Subject: [PATCH 23/27] update

---
 src/decoder.rs | 186 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 154 insertions(+), 32 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index 1783924..c609e89 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -1215,45 +1215,32 @@ fn drain_into_pending(
         }
         let mut cpu = alloc_av_frame()?;
         // SAFETY: hw_buf is a freshly-decoded HW frame; av_hwframe_transfer_data
-        // allocates buffers on `cpu`. copy_props moves timing/side data over.
+        // allocates buffers on `cpu`. We deliberately defer
+        // `av_frame_copy_props` until *after* the cap check below — that
+        // call deep-copies every AVFrameSideData entry, which a malicious
+        // stream can size in megabytes; allocating then discarding the
+        // copies on cap rejection is wasted work and a real allocator
+        // pressure source.
         unsafe {
           let r1 = av_hwframe_transfer_data(cpu.as_mut_ptr(), hw_buf.as_ptr(), 0);
           if r1 < 0 {
             return Err(ffmpeg_next::Error::from(r1));
           }
-          let r2 = av_frame_copy_props(cpu.as_mut_ptr(), hw_buf.as_ptr());
-          if r2 < 0 {
-            return Err(ffmpeg_next::Error::from(r2));
-          }
         }
-        // Post-transfer accounting: size the frame and confirm we can fit
-        // it without exceeding the byte budget. Both cap-hit and inability
-        // to size the frame are treated as candidate failures, so the byte
-        // budget is *strict* — we never queue a frame we can't account for.
-        match cpu_frame_bytes(&cpu) {
-          Some(bytes) => {
-            let new_total = pending_bytes.saturating_add(bytes);
-            if new_total > max_bytes {
-              tracing::warn!(
-                pending_bytes = *pending_bytes,
-                frame_bytes = bytes,
-                max_bytes,
-                "hwdecode: queueing this frame would exceed byte cap; failing candidate replay"
-              );
-              // cpu drops here.
-              return Err(ffmpeg_next::Error::Other {
-                errno: libc::ENOMEM,
-              });
-            }
-            *pending_bytes = new_total;
-            pending.push_back(cpu);
-          }
+        // Pre-copy_props accounting: size the frame's pixel storage and
+        // its (yet-to-be-copied) side data. Both cap-hit and inability to
+        // size the pixel layout are treated as candidate failures, so the
+        // byte budget is *strict* — we never queue a frame we can't fully
+        // account for, and we never pay the side-data deep copy on a
+        // frame we'd immediately drop.
+        let pixel_bytes = match cpu_frame_bytes(&cpu) {
+          Some(b) => b,
           None => {
-            // Unknown pix_fmt — we cannot bound this frame's contribution
-            // against the byte cap, so up to MAX_PROBE_PENDING_FRAMES of
-            // them could exhaust memory. Fail the candidate so probing
-            // tries the next backend rather than queueing untracked
-            // allocations.
+            // Unknown pix_fmt or vertically-flipped layout — we cannot
+            // bound this frame's contribution against the byte cap, so up
+            // to MAX_PROBE_PENDING_FRAMES of them could exhaust memory.
+            // Fail the candidate so probing tries the next backend
+            // rather than queueing untracked allocations.
             // SAFETY: AVFrame.format is c_int, safe to read.
             let pix_fmt: i32 = unsafe { (*cpu.as_ptr()).format };
             tracing::warn!(
@@ -1265,7 +1252,34 @@ fn drain_into_pending(
               errno: libc::ENOMEM,
             });
           }
+        };
+        let side_bytes = frame_side_data_bytes(hw_buf);
+        let frame_bytes = pixel_bytes.saturating_add(side_bytes);
+        let new_total = pending_bytes.saturating_add(frame_bytes);
+        if new_total > max_bytes {
+          tracing::warn!(
+            pending_bytes = *pending_bytes,
+            pixel_bytes,
+            side_bytes,
+            max_bytes,
+            "hwdecode: queueing this frame (pixels + side data) would exceed byte cap; \
+             failing candidate replay"
+          );
+          // cpu drops here without paying av_frame_copy_props.
+          return Err(ffmpeg_next::Error::Other {
+            errno: libc::ENOMEM,
+          });
         }
+        // Cap check passed — now safe to pay the side-data deep copy.
+        // SAFETY: cpu and hw_buf are both valid AVFrames we own.
+        unsafe {
+          let r2 = av_frame_copy_props(cpu.as_mut_ptr(), hw_buf.as_ptr());
+          if r2 < 0 {
+            return Err(ffmpeg_next::Error::from(r2));
+          }
+        }
+        *pending_bytes = new_total;
+        pending.push_back(cpu);
       }
       Err(e) if is_transient(&e) => return Ok(()),
       Err(e) => return Err(e),
@@ -1273,6 +1287,42 @@ fn drain_into_pending(
   }
 }
 
+/// Sum of `AVFrameSideData[i].size` across every entry attached to
+/// `frame`. `av_frame_copy_props` performs a deep copy of every side
+/// data buffer (allocates a fresh `AVBufferRef` per entry), so a
+/// candidate decoder that produces large per-frame metadata (HDR
+/// mastering display info, A53 closed captions, ICC profiles, dynamic
+/// HDR, motion vectors, …) would otherwise bypass the
+/// `max_probe_pending_bytes` cap because [`cpu_frame_bytes`] only
+/// accounts for pixel-plane storage.
+///
+/// Reads only the `size` field of each `AVFrameSideData` — never
+/// constructs the bindgen `AVFrameSideDataType` enum, so unknown side-
+/// data types from a future FFmpeg do not invoke UB.
+fn frame_side_data_bytes(frame: &frame::Video) -> usize {
+  // SAFETY: AVFrame.side_data is `*mut *mut AVFrameSideData` and
+  // nb_side_data is `c_int`; both are raw struct fields safe to read.
+  // Field projection through the indirected pointer touches only the
+  // primitive `usize` `.size` field (never `type_`).
+  unsafe {
+    let raw = frame.as_ptr();
+    let nb = (*raw).nb_side_data;
+    let arr = (*raw).side_data;
+    if arr.is_null() || nb <= 0 {
+      return 0;
+    }
+    let mut total: usize = 0;
+    for i in 0..(nb as usize) {
+      let entry = *arr.add(i);
+      if entry.is_null() {
+        continue;
+      }
+      total = total.saturating_add((*entry).size);
+    }
+    total
+  }
+}
+
 /// Approximate resident size of a CPU frame: sum of `linesize[plane] *
 /// plane_height` across populated planes.
 ///
@@ -1489,6 +1539,78 @@ mod tests {
     assert_eq!(cpu_frame_bytes(&f), Some(0));
   }
 
+  /// `av_frame_copy_props` deep-copies every AVFrameSideData attached
+  /// to the source frame. `frame_side_data_bytes` must surface that
+  /// retention so `drain_into_pending` can charge it against
+  /// `max_probe_pending_bytes` — otherwise a stream with megabytes of
+  /// per-frame metadata can queue up to `MAX_PROBE_PENDING_FRAMES`
+  /// frames and overshoot the configured cap by orders of magnitude.
+  #[test]
+  fn frame_side_data_bytes_counts_attached_buffers() {
+    use ffmpeg_next::ffi::{av_frame_new_side_data, AVFrameSideDataType};
+
+    const SIDE_DATA_SIZE: usize = 1024 * 1024; // 1 MiB
+
+    let mut f = frame::Video::empty();
+    // SAFETY: f is freshly allocated; av_frame_new_side_data attaches a
+    // fresh `SIDE_DATA_SIZE`-byte buffer of the requested type and returns
+    // a pointer to the entry (or NULL on OOM).
+    let p = unsafe {
+      av_frame_new_side_data(
+        f.as_mut_ptr(),
+        AVFrameSideDataType::AV_FRAME_DATA_SEI_UNREGISTERED,
+        SIDE_DATA_SIZE,
+      )
+    };
+    assert!(!p.is_null(), "av_frame_new_side_data returned NULL");
+
+    let bytes = frame_side_data_bytes(&f);
+    assert!(
+      bytes >= SIDE_DATA_SIZE,
+      "side-data accounting must include the attached buffer; got {bytes}"
+    );
+  }
+
+  #[test]
+  fn frame_side_data_bytes_is_zero_for_bare_frame() {
+    let f = frame::Video::empty();
+    assert_eq!(frame_side_data_bytes(&f), 0);
+  }
+
+  /// Multiple side-data entries must be summed, not just the first.
+  #[test]
+  fn frame_side_data_bytes_sums_all_entries() {
+    use ffmpeg_next::ffi::{av_frame_new_side_data, AVFrameSideDataType};
+
+    const ENTRY_A: usize = 256 * 1024; // 256 KiB
+    const ENTRY_B: usize = 512 * 1024; // 512 KiB
+
+    let mut f = frame::Video::empty();
+    // Two distinct types so neither call replaces the other.
+    let p1 = unsafe {
+      av_frame_new_side_data(
+        f.as_mut_ptr(),
+        AVFrameSideDataType::AV_FRAME_DATA_SEI_UNREGISTERED,
+        ENTRY_A,
+      )
+    };
+    let p2 = unsafe {
+      av_frame_new_side_data(
+        f.as_mut_ptr(),
+        AVFrameSideDataType::AV_FRAME_DATA_A53_CC,
+        ENTRY_B,
+      )
+    };
+    assert!(!p1.is_null() && !p2.is_null());
+
+    let bytes = frame_side_data_bytes(&f);
+    assert!(
+      bytes >= ENTRY_A + ENTRY_B,
+      "must sum across all side-data entries; got {bytes}, expected at least {}",
+      ENTRY_A + ENTRY_B
+    );
+  }
+
   /// Probe-abandon paths in `send_packet` (cap exceeded, packet clone
   /// failed) must not drop frames already queued in `pending_frames`.
   /// Those frames belong to the currently active backend — possibly a

From cb8e9e63f2d949fbe9025dabe87c4e8a28d2168f Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 16:27:38 +1200
Subject: [PATCH 24/27] update

---
 src/decoder.rs | 177 +++++++++++++++++++++++++++++++++++++++----------
 src/ffi.rs     |  72 +++++++++++++++++---
 2 files changed, 203 insertions(+), 46 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index c609e89..bac87b2 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -842,14 +842,18 @@ impl VideoDecoder {
     let mut ctx = build_codec_context(&parameters)?;
     let av_type = backend.av_hwdevice_type();
 
-    // Verify the codec advertises this hwaccel. We do *not* read the
-    // codec's advertised pix_fmt — we use the hardcoded constant from
-    // `Backend::hw_pixel_format` so no FFmpeg-supplied enum value is ever
-    // interpreted as `AVPixelFormat`.
-    if !codec_supports_hwaccel(unsafe { codec.as_ptr() }, av_type) {
+    // Verify the codec advertises this hwaccel **with the exact HW pix_fmt
+    // we're about to wire up in `get_format`**. FFmpeg's HW config table
+    // is keyed per (device_type, pix_fmt); a codec can advertise the same
+    // device with several HW pix_fmts, so matching only on device_type
+    // would let probing succeed for a backend whose pix_fmt the codec
+    // never offers — the failure would then surface deep inside the
+    // probe/decode loop. Matching the exact pix_fmt keeps the strict
+    // `get_format` honest and gives `open_with` a clean rejection.
+    let hw_pix_fmt = backend.hw_pixel_format();
+    if !codec_supports_hwaccel(unsafe { codec.as_ptr() }, av_type, hw_pix_fmt as i32) {
       return Err(Error::BackendUnsupportedByCodec(backend));
     }
-    let hw_pix_fmt = backend.hw_pixel_format();
 
     // Create the device context.
     let mut hw_device_ref: *mut AVBufferRef = ptr::null_mut();
@@ -868,6 +872,19 @@ impl VideoDecoder {
       wanted: hw_pix_fmt,
       wanted_int: hw_pix_fmt as i32,
     }));
+    // RAII guard: from now until the end-of-function `into_owned()`, every
+    // early return — `av_buffer_ref` failure, `open_as` failure, codec_type
+    // mismatch, or any future error path added between here and the
+    // `DecoderState` construction — frees `hw_device_ref` and
+    // `callback_state` via the guard's Drop. Without it, each error site
+    // had to remember to clean up these two FFI-owned resources by hand;
+    // the codec_type-mismatch branch was missed and silently leaked one
+    // device ref + one heap allocation per bad input.
+    let guard = PartialBuildState {
+      hw_device_ref,
+      callback_state,
+    };
+
     // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
     // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
     // use (we keep our own ref in `hw_device_ref` for cleanup).
@@ -876,20 +893,18 @@ impl VideoDecoder {
     // HW-flagged setup but no actual device reference.
     let device_ref_for_ctx = unsafe { av_buffer_ref(hw_device_ref) };
     if device_ref_for_ctx.is_null() {
-      // SAFETY: rolling back what we just allocated above. hw_device_ref
-      // is non-null (we checked after av_hwdevice_ctx_create); callback_state
-      // was just freshly Box::into_raw'd.
-      unsafe {
-        let mut hw = hw_device_ref;
-        av_buffer_unref(&mut hw);
-        drop(Box::from_raw(callback_state));
-      }
+      // guard's Drop frees hw_device_ref (the first ref) and callback_state.
       return Err(Error::Ffmpeg(ffmpeg_next::Error::Other {
         errno: libc::ENOMEM,
       }));
     }
     // SAFETY: device_ref_for_ctx is a valid AVBufferRef* from av_buffer_ref;
-    // ctx is freshly built and owned by us.
+    // ctx is freshly built and owned by us. After this point ctx aliases
+    // `callback_state` via `opaque` (FFmpeg never frees opaque, so
+    // `callback_state` ownership stays with us / the guard) and aliases
+    // `device_ref_for_ctx` (the second ref) via `hw_device_ctx` (FFmpeg
+    // unrefs that on codec context drop, independent of the guard's first
+    // ref).
     unsafe {
       let raw = ctx.as_mut_ptr();
       (*raw).hw_device_ctx = device_ref_for_ctx;
@@ -897,8 +912,9 @@ impl VideoDecoder {
       (*raw).get_format = Some(get_hw_format);
     }
 
-    // Open the decoder. On any failure, release the resources we just
-    // allocated so we don't leak.
+    // Open the decoder. On failure `ctx`/`opened` Drop releases the codec
+    // context (and via that the second device ref); the guard releases the
+    // first device ref and the callback state.
     //
     // We deliberately bypass `Opened::video()` because it calls
     // `Context::medium()`, which reads `AVCodecContext.codec_type` as the
@@ -906,24 +922,7 @@ impl VideoDecoder {
     // systematically removing. Instead: validate `codec_type` as a raw
     // `c_int` ourselves, then construct the `decoder::Video` wrapper
     // directly via its public tuple field.
-    let opened = match ctx.decoder().open_as(codec) {
-      Ok(o) => o,
-      Err(e) => {
-        // SAFETY: we either allocated these in this function above or
-        // they are null; av_buffer_unref / Box::from_raw handle null
-        // explicitly (we check first).
-        unsafe {
-          let mut hw = hw_device_ref;
-          if !hw.is_null() {
-            av_buffer_unref(&mut hw);
-          }
-          if !callback_state.is_null() {
-            drop(Box::from_raw(callback_state));
-          }
-        }
-        return Err(Error::Ffmpeg(e));
-      }
-    };
+    let opened = ctx.decoder().open_as(codec).map_err(Error::Ffmpeg)?;
 
     // Validate codec_type as a raw integer — never construct AVMediaType
     // from an unvalidated runtime value.
@@ -935,7 +934,8 @@ impl VideoDecoder {
     if codec_type_int != video_type_int {
       // Not a video codec context — surface the same error
       // `Opened::video()` would have, without going through enum
-      // construction. Cleanup runs via `opened`'s Drop.
+      // construction. `opened`'s Drop releases the codec context; the
+      // guard releases the first hw_device_ref and the callback state.
       return Err(Error::Ffmpeg(ffmpeg_next::Error::InvalidData));
     }
     // SAFETY of construction: `decoder::Video` is `pub struct Video(pub Opened)`.
@@ -943,6 +943,9 @@ impl VideoDecoder {
     // `Opened::video()` does on success, just without the enum read.
     let opened = ffmpeg_next::decoder::Video(opened);
 
+    // Disarm the guard and transfer ownership of both resources into the
+    // returned DecoderState (whose own Drop handles their lifetime).
+    let (hw_device_ref, callback_state) = guard.into_owned();
     Ok(DecoderState {
       inner: ManuallyDrop::new(opened),
       backend,
@@ -952,6 +955,55 @@ impl VideoDecoder {
   }
 }
 
+/// RAII guard for the partially-owned FFmpeg state that
+/// [`VideoDecoder::build_state`] holds between the
+/// `av_hwdevice_ctx_create` and `Box::into_raw(CallbackState)`
+/// allocations and the final `DecoderState` construction.
+///
+/// If `build_state` returns `Err` for any reason in that window
+/// (`av_buffer_ref` ENOMEM, `open_as` failure, codec_type mismatch, or
+/// any future error path), this guard's `Drop` releases
+/// `hw_device_ref` — the first ref returned by `av_hwdevice_ctx_create`,
+/// distinct from the second ref FFmpeg unrefs when the codec context
+/// drops — and the boxed `CallbackState`, which FFmpeg never touches
+/// because `AVCodecContext::opaque` is purely user-owned.
+///
+/// Successful construction calls [`Self::into_owned`] to disarm the
+/// guard and hand both pointers to the new `DecoderState`.
+struct PartialBuildState {
+  hw_device_ref: *mut AVBufferRef,
+  callback_state: *mut CallbackState,
+}
+
+impl PartialBuildState {
+  /// Disarm the guard: return the owned pointers and replace the guard's
+  /// fields with null so its Drop is a no-op.
+  fn into_owned(mut self) -> (*mut AVBufferRef, *mut CallbackState) {
+    let hw = std::mem::replace(&mut self.hw_device_ref, ptr::null_mut());
+    let cb = std::mem::replace(&mut self.callback_state, ptr::null_mut());
+    (hw, cb)
+  }
+}
+
+impl Drop for PartialBuildState {
+  fn drop(&mut self) {
+    // SAFETY: pointers are either freshly allocated by `build_state` (via
+    // `av_hwdevice_ctx_create` and `Box::into_raw`) or null after
+    // `into_owned`. `Box::from_raw` must never see null, hence its guard;
+    // `av_buffer_unref` tolerates a null ref, so its guard is defensive.
+    // Both calls are otherwise sound on resources we own.
+    unsafe {
+      if !self.hw_device_ref.is_null() {
+        let mut hw = self.hw_device_ref;
+        av_buffer_unref(&mut hw);
+      }
+      if !self.callback_state.is_null() {
+        drop(Box::from_raw(self.callback_state));
+      }
+    }
+  }
+}
+
 /// Download a HW frame into a CPU [`Frame`]. Always unrefs the destination
 /// first so reuse across resolution changes is safe.
 unsafe fn transfer_hw_frame(
@@ -1611,6 +1663,59 @@ mod tests {
     );
   }
 
+  /// `PartialBuildState`'s `Drop` must be a no-op when both pointers are
+  /// null — the disarmed-by-`into_owned` post-state. A panic / double-free
+  /// here would break the success path of every `build_state` call.
+  #[test]
+  fn partial_build_state_drop_is_no_op_on_null_pointers() {
+    let _g = PartialBuildState {
+      hw_device_ref: ptr::null_mut(),
+      callback_state: ptr::null_mut(),
+    };
+    // Drops at end of scope. Test passes if it doesn't panic / crash.
+  }
+
+  /// `into_owned` must return the original pointers and disarm the guard
+  /// (so the guard's Drop becomes a no-op and the caller can safely
+  /// transfer ownership to `DecoderState` without double-freeing).
+  #[test]
+  fn partial_build_state_into_owned_disarms_and_returns_originals() {
+    use ffmpeg_next::ffi::{av_buffer_alloc, av_buffer_unref, AVPixelFormat};
+
+    // SAFETY: av_buffer_alloc returns a fresh AVBufferRef* with refcount
+    // 1, or NULL on OOM. We free it ourselves below (after into_owned
+    // disarms the guard).
+    let hw_ptr = unsafe { av_buffer_alloc(64) };
+    assert!(!hw_ptr.is_null(), "av_buffer_alloc(64) returned NULL");
+    let cb_ptr = Box::into_raw(Box::new(CallbackState {
+      wanted: AVPixelFormat::AV_PIX_FMT_NONE,
+      wanted_int: AVPixelFormat::AV_PIX_FMT_NONE as i32,
+    }));
+
+    let g = PartialBuildState {
+      hw_device_ref: hw_ptr,
+      callback_state: cb_ptr,
+    };
+    let (hw_back, cb_back) = g.into_owned();
+    assert_eq!(
+      hw_back, hw_ptr,
+      "into_owned must return the original device ref"
+    );
+    assert_eq!(
+      cb_back, cb_ptr,
+      "into_owned must return the original callback box"
+    );
+
+    // Guard is now disarmed (its Drop ran with null pointers as soon as
+    // into_owned consumed it). We own the pointers and must free them.
+    // SAFETY: hw_ptr and cb_ptr are still the freshly-allocated values.
+    unsafe {
+      let mut hw = hw_back;
+      av_buffer_unref(&mut hw);
+      drop(Box::from_raw(cb_back));
+    }
+  }
+
   /// Probe-abandon paths in `send_packet` (cap exceeded, packet clone
   /// failed) must not drop frames already queued in `pending_frames`.
   /// Those frames belong to the currently active backend — possibly a
diff --git a/src/ffi.rs b/src/ffi.rs
index 794d474..04aa50f 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -84,15 +84,28 @@ pub(crate) unsafe extern "C" fn get_hw_format(
 }
 
 /// Walk the codec's `AVCodecHWConfig` table and return whether the codec
-/// advertises support for `device_type` via the `HW_DEVICE_CTX` setup method.
+/// advertises support for `device_type` **with** `wanted_pix_fmt` via the
+/// `HW_DEVICE_CTX` setup method.
 ///
-/// We do not return the codec's advertised `pix_fmt` — we know it already
-/// from [`crate::backend::Backend::hw_pixel_format`] (a hardcoded constant
-/// from our bindings). All reads from the FFmpeg-supplied `AVCodecHWConfig`
-/// are performed as raw integers via `addr_of!` + `ptr::read::<i32>` to
-/// avoid copying or interpreting enum-typed fields whose runtime values
-/// might not match our build's discriminant set.
-pub(crate) fn codec_supports_hwaccel(codec: *const AVCodec, device_type: AVHWDeviceType) -> bool {
+/// FFmpeg's HW config table is keyed per (device_type, pix_fmt) pair: a
+/// codec can advertise the same device with several different hardware
+/// pixel formats (e.g. VAAPI codecs that offer both `AV_PIX_FMT_VAAPI`
+/// and `AV_PIX_FMT_DRM_PRIME`). Matching only on `device_type` would let
+/// us proceed to install a strict `get_format` callback for a format the
+/// codec never advertises, and the failure would surface deep inside the
+/// probe / decode path instead of up front. Requiring the codec to
+/// advertise the **exact** pix_fmt our `Backend` uses keeps the strict
+/// `get_format` honest and gives `open_with` a clean rejection signal.
+///
+/// All reads from the FFmpeg-supplied `AVCodecHWConfig` are performed as
+/// raw integers via `addr_of!` + `ptr::read::<i32>` to avoid copying or
+/// interpreting enum-typed fields whose runtime values might not match
+/// our build's discriminant set.
+pub(crate) fn codec_supports_hwaccel(
+  codec: *const AVCodec,
+  device_type: AVHWDeviceType,
+  wanted_pix_fmt: i32,
+) -> bool {
   debug_assert!(!codec.is_null());
   let device_type_int = device_type as i32;
   let mut i = 0;
@@ -106,15 +119,18 @@ pub(crate) fn codec_supports_hwaccel(codec: *const AVCodec, device_type: AVHWDev
     // (which would interpret `pix_fmt` and `device_type` as their enum types).
     // SAFETY: `cfg` is non-null and points to a valid `AVCodecHWConfig` for
     // the lifetime of the call; `addr_of!` projects to a sized field; the
-    // `*const i32` cast is sound because `methods` is `c_int` (i32) and
+    // `*const i32` cast is sound because `methods` is `c_int` (i32),
     // `device_type` is `AVHWDeviceType` (`#[repr(u32)]`, but FFmpeg's
-    // assigned values fit in i32 and the runtime layout is i32-sized).
+    // assigned values fit in i32 and the runtime layout is i32-sized),
+    // and `pix_fmt` is `AVPixelFormat` (`#[repr(i32)]`).
     let methods: i32 = unsafe { ptr::read(ptr::addr_of!((*cfg).methods)) };
     let cfg_device_type_int: i32 =
       unsafe { ptr::read(ptr::addr_of!((*cfg).device_type) as *const i32) };
+    let cfg_pix_fmt_int: i32 = unsafe { ptr::read(ptr::addr_of!((*cfg).pix_fmt) as *const i32) };
 
     if methods & (AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX as i32) != 0
       && cfg_device_type_int == device_type_int
+      && cfg_pix_fmt_int == wanted_pix_fmt
     {
       return true;
     }
@@ -217,4 +233,40 @@ mod tests {
     );
     assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
   }
+
+  /// `codec_supports_hwaccel` must reject a (device_type, pix_fmt) pair
+  /// that the codec does not advertise — even if the device alone is
+  /// listed. Without this check, the strict `get_format` callback would
+  /// be wired up for a HW pix_fmt the codec never offers and the failure
+  /// would surface deep inside the probe / decode path instead of at
+  /// `open_with` / probe-build time.
+  ///
+  /// macOS-only: the test relies on FFmpeg's H.264 decoder advertising
+  /// `(AV_HWDEVICE_TYPE_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX)`, which is
+  /// only present in builds with VideoToolbox compiled in.
+  #[cfg(target_os = "macos")]
+  #[test]
+  fn codec_supports_hwaccel_requires_matching_pix_fmt() {
+    use ffmpeg_next::ffi::{avcodec_find_decoder, AVCodecID, AVHWDeviceType, AVPixelFormat};
+
+    // SAFETY: AV_CODEC_ID_H264 is a known constant in our build's
+    // `AVCodecID` discriminant set; constructing it does not invoke the
+    // bindgen-enum UB we worry about for runtime-derived ids.
+    let codec_ptr = unsafe { avcodec_find_decoder(AVCodecID::AV_CODEC_ID_H264) };
+    assert!(!codec_ptr.is_null(), "H.264 decoder must be present");
+
+    let device = AVHWDeviceType::AV_HWDEVICE_TYPE_VIDEOTOOLBOX;
+    let videotoolbox = AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX as i32;
+    let nv12 = AVPixelFormat::AV_PIX_FMT_NV12 as i32;
+
+    assert!(
+      codec_supports_hwaccel(codec_ptr, device, videotoolbox),
+      "VideoToolbox + AV_PIX_FMT_VIDEOTOOLBOX must be advertised by FFmpeg's H.264 decoder"
+    );
+    assert!(
+      !codec_supports_hwaccel(codec_ptr, device, nv12),
+      "VideoToolbox + AV_PIX_FMT_NV12 must NOT match the codec's HW config — \
+       the strict get_format would have no offered HW format to return"
+    );
+  }
 }

From 65ae604a74556ee89a249e2e4c5125735a6006bb Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 16:51:15 +1200
Subject: [PATCH 25/27] update

---
 src/decoder.rs | 326 +++++++++++++++++++++++++++----------------------
 1 file changed, 179 insertions(+), 147 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index bac87b2..81db18a 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -7,10 +7,10 @@ use ffmpeg_next::{
     Context,
   },
   ffi::{
-    av_buffer_ref, av_buffer_unref, av_frame_copy_props, av_frame_move_ref, av_frame_unref,
-    av_hwdevice_ctx_create, av_hwframe_transfer_data, av_packet_ref, avcodec_alloc_context3,
-    avcodec_free_context, avcodec_parameters_alloc, avcodec_parameters_copy,
-    avcodec_parameters_free, avcodec_parameters_to_context, AVBufferRef, AVCodec, AVMediaType,
+    av_buffer_ref, av_buffer_unref, av_frame_move_ref, av_frame_unref, av_hwdevice_ctx_create,
+    av_hwframe_transfer_data, av_packet_ref, avcodec_alloc_context3, avcodec_free_context,
+    avcodec_parameters_alloc, avcodec_parameters_copy, avcodec_parameters_free,
+    avcodec_parameters_to_context, AVBufferRef, AVCodec, AVFrame, AVMediaType,
   },
   frame, Codec, Packet, Rational,
 };
@@ -109,6 +109,29 @@ const MAX_PROBE_PACKETS: usize = 256;
 /// gives untrusted media a hard ceiling.
 const MAX_PROBE_PACKET_BYTES: usize = 64 * 1024 * 1024;
 
+/// Hard cap on the number of side-data entries we tolerate per buffered
+/// packet. `av_packet_ref` allocates an `AVPacketSideData` descriptor and
+/// a padded payload copy per entry, so a packet stuffed with many tiny or
+/// zero-sized entries can consume significant memory in descriptor /
+/// allocator overhead even after [`packet_side_data_bytes`] charges
+/// [`SIDE_DATA_ENTRY_OVERHEAD`] bytes per entry. Refusing to clone such
+/// packets short-circuits the descriptor explosion path.
+///
+/// Sized for legitimate streams (typical video packets carry 0-5 side-
+/// data entries; SEI-heavy HEVC/AV1 maybe a dozen) while comfortably
+/// rejecting weaponised input.
+const MAX_PROBE_PACKET_SIDE_DATA_ENTRIES: usize = 64;
+
+/// Conservative per-side-data-entry overhead estimate used by both
+/// [`packet_side_data_bytes`] and the budget accounting in
+/// [`VideoDecoder::send_packet`]. Counts the `AVPacketSideData`
+/// descriptor (24 bytes per the FFmpeg 8.x bindings), the padded payload
+/// buffer FFmpeg allocates per entry, and a margin for malloc bookkeeping
+/// (header bytes, alignment slack). Setting it on the high side keeps
+/// the byte cap a true upper bound on retained memory; under-charging
+/// would let many tiny entries slip past the cap.
+const SIDE_DATA_ENTRY_OVERHEAD: usize = 80;
+
 /// Maximum number of CPU frames we are willing to queue from a candidate
 /// during probe replay. Each frame is a fully-allocated CPU buffer
 /// (~3 MiB for 1080p NV12, ~24 MiB for 4K P010, ~96 MiB for 8K P010), so
@@ -389,18 +412,32 @@ impl VideoDecoder {
           if let Some(probe) = self.probe.as_mut() {
             // `try_clone_packet` calls `av_packet_ref`, which deep-copies
             // side data via `av_packet_copy_props`. The probe budget must
-            // include side-data bytes or a malicious stream can keep
-            // `packet.size()` tiny while attaching megabytes of side data
-            // per packet and inflate retention beyond the advertised cap.
+            // include both descriptor + ref overhead per side-data entry
+            // (via `packet_side_data_bytes`) and a hard cap on the entry
+            // count itself — without the count cap, a packet stuffed with
+            // many tiny entries can dominate retained memory before the
+            // byte cap is even close to firing.
+            let side_count = packet_side_data_count(packet);
             let pkt_size = packet.size().saturating_add(packet_side_data_bytes(packet));
             let new_count = probe.buffered_packets.len() + 1;
             let new_bytes = probe.buffered_bytes.saturating_add(pkt_size);
-            if new_count > MAX_PROBE_PACKETS || new_bytes > MAX_PROBE_PACKET_BYTES {
+            let entry_cap_exceeded = side_count > MAX_PROBE_PACKET_SIDE_DATA_ENTRIES;
+            if new_count > MAX_PROBE_PACKETS
+              || new_bytes > MAX_PROBE_PACKET_BYTES
+              || entry_cap_exceeded
+            {
               tracing::warn!(
                 packets = new_count,
                 bytes = new_bytes,
+                side_data_entries = side_count,
                 max_packets = MAX_PROBE_PACKETS,
                 max_bytes = MAX_PROBE_PACKET_BYTES,
+                max_side_data_entries = MAX_PROBE_PACKET_SIDE_DATA_ENTRIES,
+                trigger = if entry_cap_exceeded {
+                  "side_data_entry_cap"
+                } else {
+                  "byte_or_packet_cap"
+                },
                 "hwdecode: probe window exceeded caps without first frame; \
                  abandoning fallback safety net"
               );
@@ -1006,6 +1043,20 @@ impl Drop for PartialBuildState {
 
 /// Download a HW frame into a CPU [`Frame`]. Always unrefs the destination
 /// first so reuse across resolution changes is safe.
+///
+/// Deliberately does **not** call `av_frame_copy_props`. That FFmpeg
+/// helper deep-copies AVFrame side data (SEI, mastering display, ICC
+/// profiles, dynamic HDR, etc.), the metadata dict, and bumps both
+/// `opaque_ref` and `private_ref` on every receive — none of which
+/// `Frame` exposes via its public accessors. On a crafted stream with
+/// megabytes of per-frame metadata that would mean an unbounded
+/// allocation per receive, with no caller-visible benefit. We instead
+/// copy only the scalar fields the public API can read (today: `pts`);
+/// pixel layout (`width`, `height`, `format`, `linesize`, `data`) is
+/// already set by `av_hwframe_transfer_data`. If `Frame` ever grows
+/// accessors for timing extras (`duration`, `time_base`, `pkt_dts`) or
+/// color metadata, add those to `copy_frame_props_minimal` at the same
+/// time.
 unsafe fn transfer_hw_frame(
   dst: &mut Frame,
   src: &mut frame::Video,
@@ -1016,14 +1067,27 @@ unsafe fn transfer_hw_frame(
     if ret < 0 {
       return Err(ffmpeg_next::Error::from(ret));
     }
-    let ret = av_frame_copy_props(dst.as_inner_mut().as_mut_ptr(), src.as_ptr());
-    if ret < 0 {
-      return Err(ffmpeg_next::Error::from(ret));
-    }
+    copy_frame_props_minimal(dst.as_inner_mut().as_mut_ptr(), src.as_ptr());
   }
   Ok(())
 }
 
+/// Bounded substitute for `av_frame_copy_props`. Copies only the scalar
+/// AVFrame fields the public `Frame` API needs from `src` to `dst` —
+/// today just `pts`. Skips every allocating field (`av_dict_copy` for
+/// `metadata`, `av_frame_new_side_data` + memcpy for each `side_data[i]`,
+/// `av_buffer_replace` for `opaque_ref` / `private_ref`) so the cost is
+/// O(1) per frame regardless of what the source attaches.
+///
+/// # Safety
+/// Both pointers must be valid `AVFrame` pointers we own; field
+/// projection touches only POD scalars, no enums or buffer refs.
+unsafe fn copy_frame_props_minimal(dst: *mut AVFrame, src: *const AVFrame) {
+  unsafe {
+    (*dst).pts = (*src).pts;
+  }
+}
+
 /// `EAGAIN` and `EOF` are normal flow signals from `avcodec_receive_frame`
 /// and must not be treated as backend failures.
 fn is_transient(e: &ffmpeg_next::Error) -> bool {
@@ -1181,8 +1245,12 @@ fn packet_side_data_bytes(packet: &Packet) -> usize {
     if arr.is_null() || nel <= 0 {
       return 0;
     }
-    let mut total: usize = 0;
-    for i in 0..(nel as usize) {
+    let count = nel as usize;
+    // Descriptor + AVBufferRef + allocator overhead per entry — without
+    // this, a packet stuffed with many zero-size entries could slip past
+    // `MAX_PROBE_PACKET_BYTES` purely on descriptor cost.
+    let mut total = count.saturating_mul(SIDE_DATA_ENTRY_OVERHEAD);
+    for i in 0..count {
       let entry = arr.add(i);
       total = total.saturating_add((*entry).size);
     }
@@ -1190,6 +1258,20 @@ fn packet_side_data_bytes(packet: &Packet) -> usize {
   }
 }
 
+/// Number of `AVPacketSideData` entries on `packet`. The probe buffer
+/// uses this to enforce [`MAX_PROBE_PACKET_SIDE_DATA_ENTRIES`] before
+/// cloning, so a packet whose entry count alone would dominate retained
+/// memory is rejected up front.
+fn packet_side_data_count(packet: &Packet) -> usize {
+  // SAFETY: side_data_elems is `c_int`, safe to read; clamp negatives to 0.
+  let nel = unsafe { (*packet.as_ptr()).side_data_elems };
+  if nel <= 0 {
+    0
+  } else {
+    nel as usize
+  }
+}
+
 /// Just `EAGAIN` (separate from EOF — the FFmpeg send/receive state machine
 /// distinguishes "drain output and retry" from "stream over").
 fn is_eagain(e: &ffmpeg_next::Error) -> bool {
@@ -1266,25 +1348,20 @@ fn drain_into_pending(
           });
         }
         let mut cpu = alloc_av_frame()?;
-        // SAFETY: hw_buf is a freshly-decoded HW frame; av_hwframe_transfer_data
-        // allocates buffers on `cpu`. We deliberately defer
-        // `av_frame_copy_props` until *after* the cap check below — that
-        // call deep-copies every AVFrameSideData entry, which a malicious
-        // stream can size in megabytes; allocating then discarding the
-        // copies on cap rejection is wasted work and a real allocator
-        // pressure source.
+        // SAFETY: hw_buf is a freshly-decoded HW frame;
+        // `av_hwframe_transfer_data` allocates pixel buffers on `cpu`.
+        // We use `copy_frame_props_minimal` (only `pts`) instead of
+        // `av_frame_copy_props` for the same reason as
+        // `transfer_hw_frame`: the public `Frame` API does not expose
+        // side data / metadata / opaque refs, so deep-copying them per
+        // frame is pure cost and an unbounded allocation source on
+        // attacker-controlled streams.
         unsafe {
           let r1 = av_hwframe_transfer_data(cpu.as_mut_ptr(), hw_buf.as_ptr(), 0);
           if r1 < 0 {
             return Err(ffmpeg_next::Error::from(r1));
           }
         }
-        // Pre-copy_props accounting: size the frame's pixel storage and
-        // its (yet-to-be-copied) side data. Both cap-hit and inability to
-        // size the pixel layout are treated as candidate failures, so the
-        // byte budget is *strict* — we never queue a frame we can't fully
-        // account for, and we never pay the side-data deep copy on a
-        // frame we'd immediately drop.
         let pixel_bytes = match cpu_frame_bytes(&cpu) {
           Some(b) => b,
           None => {
@@ -1305,30 +1382,25 @@ fn drain_into_pending(
             });
           }
         };
-        let side_bytes = frame_side_data_bytes(hw_buf);
-        let frame_bytes = pixel_bytes.saturating_add(side_bytes);
-        let new_total = pending_bytes.saturating_add(frame_bytes);
+        let new_total = pending_bytes.saturating_add(pixel_bytes);
         if new_total > max_bytes {
           tracing::warn!(
             pending_bytes = *pending_bytes,
             pixel_bytes,
-            side_bytes,
             max_bytes,
-            "hwdecode: queueing this frame (pixels + side data) would exceed byte cap; \
+            "hwdecode: queueing this frame would exceed byte cap; \
              failing candidate replay"
           );
-          // cpu drops here without paying av_frame_copy_props.
+          // cpu drops here without ever paying a metadata deep copy.
           return Err(ffmpeg_next::Error::Other {
             errno: libc::ENOMEM,
           });
         }
-        // Cap check passed — now safe to pay the side-data deep copy.
-        // SAFETY: cpu and hw_buf are both valid AVFrames we own.
+        // Cap check passed — copy only the scalar AVFrame fields the
+        // public API needs. SAFETY: cpu and hw_buf are both valid
+        // AVFrames we own.
         unsafe {
-          let r2 = av_frame_copy_props(cpu.as_mut_ptr(), hw_buf.as_ptr());
-          if r2 < 0 {
-            return Err(ffmpeg_next::Error::from(r2));
-          }
+          copy_frame_props_minimal(cpu.as_mut_ptr(), hw_buf.as_ptr());
         }
         *pending_bytes = new_total;
         pending.push_back(cpu);
@@ -1339,42 +1411,6 @@ fn drain_into_pending(
   }
 }
 
-/// Sum of `AVFrameSideData[i].size` across every entry attached to
-/// `frame`. `av_frame_copy_props` performs a deep copy of every side
-/// data buffer (allocates a fresh `AVBufferRef` per entry), so a
-/// candidate decoder that produces large per-frame metadata (HDR
-/// mastering display info, A53 closed captions, ICC profiles, dynamic
-/// HDR, motion vectors, …) would otherwise bypass the
-/// `max_probe_pending_bytes` cap because [`cpu_frame_bytes`] only
-/// accounts for pixel-plane storage.
-///
-/// Reads only the `size` field of each `AVFrameSideData` — never
-/// constructs the bindgen `AVFrameSideDataType` enum, so unknown side-
-/// data types from a future FFmpeg do not invoke UB.
-fn frame_side_data_bytes(frame: &frame::Video) -> usize {
-  // SAFETY: AVFrame.side_data is `*mut *mut AVFrameSideData` and
-  // nb_side_data is `c_int`; both are raw struct fields safe to read.
-  // Field projection through the indirected pointer touches only the
-  // primitive `usize` `.size` field (never `type_`).
-  unsafe {
-    let raw = frame.as_ptr();
-    let nb = (*raw).nb_side_data;
-    let arr = (*raw).side_data;
-    if arr.is_null() || nb <= 0 {
-      return 0;
-    }
-    let mut total: usize = 0;
-    for i in 0..(nb as usize) {
-      let entry = *arr.add(i);
-      if entry.is_null() {
-        continue;
-      }
-      total = total.saturating_add((*entry).size);
-    }
-    total
-  }
-}
-
 /// Approximate resident size of a CPU frame: sum of `linesize[plane] *
 /// plane_height` across populated planes.
 ///
@@ -1530,6 +1566,74 @@ mod tests {
   fn packet_side_data_is_zero_when_no_side_data() {
     let packet = Packet::new(64);
     assert_eq!(packet_side_data_bytes(&packet), 0);
+    assert_eq!(packet_side_data_count(&packet), 0);
+  }
+
+  /// Packets with many tiny side-data entries must be charged the
+  /// per-entry descriptor + ref overhead, even when each entry's payload
+  /// `size` is zero. Without `SIDE_DATA_ENTRY_OVERHEAD`, a packet stuffed
+  /// with N zero-byte entries would charge 0 bytes against the budget
+  /// while `av_packet_ref` still allocates ~`N * 80` bytes of descriptor,
+  /// AVBufferRef, and allocator overhead per cloned copy.
+  #[test]
+  fn packet_side_data_bytes_charges_descriptor_overhead_for_zero_size_entries() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    let mut packet = Packet::new(0);
+    // Attach two zero-byte entries of distinct types so neither call
+    // replaces the other.
+    let p1 = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA,
+        0,
+      )
+    };
+    let p2 = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_PALETTE,
+        0,
+      )
+    };
+    assert!(
+      !p1.is_null() && !p2.is_null(),
+      "av_packet_new_side_data NULL"
+    );
+
+    assert_eq!(packet_side_data_count(&packet), 2);
+    let bytes = packet_side_data_bytes(&packet);
+    assert!(
+      bytes >= 2 * SIDE_DATA_ENTRY_OVERHEAD,
+      "must charge descriptor overhead per entry even at zero payload; got {bytes}"
+    );
+  }
+
+  /// `MAX_PROBE_PACKET_SIDE_DATA_ENTRIES` is the cliff above which a
+  /// packet is rejected from the probe buffer regardless of byte total —
+  /// pure descriptor inflation is its own attack vector. Sanity-check
+  /// that `packet_side_data_count` reports the value the cap is checked
+  /// against.
+  #[test]
+  fn packet_side_data_count_reports_attached_entries() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    let mut packet = Packet::new(0);
+    let _p1 = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA,
+        4,
+      )
+    };
+    let _p2 = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_PALETTE,
+        4,
+      )
+    };
+    assert_eq!(packet_side_data_count(&packet), 2);
   }
 
   /// `cpu_frame_bytes` must refuse to size a frame whose first plane has
@@ -1591,78 +1695,6 @@ mod tests {
     assert_eq!(cpu_frame_bytes(&f), Some(0));
   }
 
-  /// `av_frame_copy_props` deep-copies every AVFrameSideData attached
-  /// to the source frame. `frame_side_data_bytes` must surface that
-  /// retention so `drain_into_pending` can charge it against
-  /// `max_probe_pending_bytes` — otherwise a stream with megabytes of
-  /// per-frame metadata can queue up to `MAX_PROBE_PENDING_FRAMES`
-  /// frames and overshoot the configured cap by orders of magnitude.
-  #[test]
-  fn frame_side_data_bytes_counts_attached_buffers() {
-    use ffmpeg_next::ffi::{av_frame_new_side_data, AVFrameSideDataType};
-
-    const SIDE_DATA_SIZE: usize = 1024 * 1024; // 1 MiB
-
-    let mut f = frame::Video::empty();
-    // SAFETY: f is freshly allocated; av_frame_new_side_data attaches a
-    // fresh `SIDE_DATA_SIZE`-byte buffer of the requested type and returns
-    // a pointer to the entry (or NULL on OOM).
-    let p = unsafe {
-      av_frame_new_side_data(
-        f.as_mut_ptr(),
-        AVFrameSideDataType::AV_FRAME_DATA_SEI_UNREGISTERED,
-        SIDE_DATA_SIZE,
-      )
-    };
-    assert!(!p.is_null(), "av_frame_new_side_data returned NULL");
-
-    let bytes = frame_side_data_bytes(&f);
-    assert!(
-      bytes >= SIDE_DATA_SIZE,
-      "side-data accounting must include the attached buffer; got {bytes}"
-    );
-  }
-
-  #[test]
-  fn frame_side_data_bytes_is_zero_for_bare_frame() {
-    let f = frame::Video::empty();
-    assert_eq!(frame_side_data_bytes(&f), 0);
-  }
-
-  /// Multiple side-data entries must be summed, not just the first.
-  #[test]
-  fn frame_side_data_bytes_sums_all_entries() {
-    use ffmpeg_next::ffi::{av_frame_new_side_data, AVFrameSideDataType};
-
-    const ENTRY_A: usize = 256 * 1024; // 256 KiB
-    const ENTRY_B: usize = 512 * 1024; // 512 KiB
-
-    let mut f = frame::Video::empty();
-    // Two distinct types so neither call replaces the other.
-    let p1 = unsafe {
-      av_frame_new_side_data(
-        f.as_mut_ptr(),
-        AVFrameSideDataType::AV_FRAME_DATA_SEI_UNREGISTERED,
-        ENTRY_A,
-      )
-    };
-    let p2 = unsafe {
-      av_frame_new_side_data(
-        f.as_mut_ptr(),
-        AVFrameSideDataType::AV_FRAME_DATA_A53_CC,
-        ENTRY_B,
-      )
-    };
-    assert!(!p1.is_null() && !p2.is_null());
-
-    let bytes = frame_side_data_bytes(&f);
-    assert!(
-      bytes >= ENTRY_A + ENTRY_B,
-      "must sum across all side-data entries; got {bytes}, expected at least {}",
-      ENTRY_A + ENTRY_B
-    );
-  }
-
   /// `PartialBuildState`'s `Drop` must be a no-op when both pointers are
   /// null — the disarmed-by-`into_owned` post-state. A panic / double-free
   /// here would break the success path of every `build_state` call.

From 1dffd6fd82156316159b1bac94e0a56d613f1f0e Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 17:09:52 +1200
Subject: [PATCH 26/27] decoder: check side-data entry cap before byte accounting

---
 src/decoder.rs | 210 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 145 insertions(+), 65 deletions(-)

diff --git a/src/decoder.rs b/src/decoder.rs
index 81db18a..46a099a 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -410,63 +410,84 @@ impl VideoDecoder {
       match self.state.inner.send_packet(packet) {
         Ok(()) => {
           if let Some(probe) = self.probe.as_mut() {
-            // `try_clone_packet` calls `av_packet_ref`, which deep-copies
-            // side data via `av_packet_copy_props`. The probe budget must
-            // include both descriptor + ref overhead per side-data entry
-            // (via `packet_side_data_bytes`) and a hard cap on the entry
-            // count itself — without the count cap, a packet stuffed with
-            // many tiny entries can dominate retained memory before the
-            // byte cap is even close to firing.
+            // Step 1: reject by side-data entry count BEFORE walking the
+            // side-data array for byte accounting. `packet_side_data_bytes`
+            // dereferences each `AVPacket.side_data[i]` based on the
+            // FFmpeg-supplied `side_data_elems`; if that integer is
+            // corrupt or weaponised we don't want to walk it from the
+            // safe `send_packet` path. The byte helper still clamps its
+            // own walk to the cap as defense-in-depth, but checking the
+            // count first short-circuits the descriptor-explosion case
+            // entirely.
             let side_count = packet_side_data_count(packet);
-            let pkt_size = packet.size().saturating_add(packet_side_data_bytes(packet));
-            let new_count = probe.buffered_packets.len() + 1;
-            let new_bytes = probe.buffered_bytes.saturating_add(pkt_size);
-            let entry_cap_exceeded = side_count > MAX_PROBE_PACKET_SIDE_DATA_ENTRIES;
-            if new_count > MAX_PROBE_PACKETS
-              || new_bytes > MAX_PROBE_PACKET_BYTES
-              || entry_cap_exceeded
-            {
+            if side_count > MAX_PROBE_PACKET_SIDE_DATA_ENTRIES {
               tracing::warn!(
-                packets = new_count,
-                bytes = new_bytes,
                 side_data_entries = side_count,
-                max_packets = MAX_PROBE_PACKETS,
-                max_bytes = MAX_PROBE_PACKET_BYTES,
                 max_side_data_entries = MAX_PROBE_PACKET_SIDE_DATA_ENTRIES,
-                trigger = if entry_cap_exceeded {
-                  "side_data_entry_cap"
-                } else {
-                  "byte_or_packet_cap"
-                },
-                "hwdecode: probe window exceeded caps without first frame; \
-                 abandoning fallback safety net"
+                trigger = "side_data_entry_cap",
+                "hwdecode: packet side-data entry count exceeds cap; \
+                 abandoning fallback safety net without byte accounting"
               );
-              // Abandon the *future* probe-buffering only. `pending_frames`
-              // belong to the currently active backend (possibly the
-              // candidate `advance_probe` committed earlier in this same
-              // `send_packet` call) and are valid output the caller will
-              // dequeue via `receive_frame`. Clearing them here would
-              // silently drop initial frames at exactly the cap-overflow /
-              // OOM-stress paths.
+              // Abandon the *future* probe-buffering only — see the byte/
+              // packet cap branch below for why `pending_frames` survives.
               self.probe = None;
             } else {
-              // Use the checked clone — ffmpeg-next's `Packet::clone`
-              // discards av_packet_ref's return value and would silently
-              // store an empty packet on ENOMEM, corrupting future replay.
-              match try_clone_packet(packet) {
-                Ok(cloned) => {
-                  probe.buffered_packets.push(cloned);
-                  probe.buffered_bytes = new_bytes;
-                }
-                Err(e) => {
-                  tracing::warn!(
-                    error = %e,
-                    "hwdecode: packet clone failed for probe history; abandoning fallback safety net"
-                  );
-                  // Same reasoning as the cap-overflow branch above:
-                  // `pending_frames` are owned by the active backend, not
-                  // the probe buffer, so they survive abandonment.
-                  self.probe = None;
+              // Step 2: now safe to compute byte budget — `side_count`
+              // is bounded.
+              //
+              // `try_clone_packet` calls `av_packet_ref`, which deep-copies
+              // side data via `av_packet_copy_props`. The probe budget
+              // must include descriptor + ref overhead per side-data
+              // entry (via `packet_side_data_bytes`); without it, a
+              // packet stuffed with many tiny entries can dominate
+              // retained memory before the byte cap is even close to
+              // firing.
+              let pkt_size = packet.size().saturating_add(packet_side_data_bytes(
+                packet,
+                MAX_PROBE_PACKET_SIDE_DATA_ENTRIES,
+              ));
+              let new_count = probe.buffered_packets.len() + 1;
+              let new_bytes = probe.buffered_bytes.saturating_add(pkt_size);
+              if new_count > MAX_PROBE_PACKETS || new_bytes > MAX_PROBE_PACKET_BYTES {
+                tracing::warn!(
+                  packets = new_count,
+                  bytes = new_bytes,
+                  side_data_entries = side_count,
+                  max_packets = MAX_PROBE_PACKETS,
+                  max_bytes = MAX_PROBE_PACKET_BYTES,
+                  trigger = "byte_or_packet_cap",
+                  "hwdecode: probe window exceeded caps without first frame; \
+                   abandoning fallback safety net"
+                );
+                // Abandon the *future* probe-buffering only.
+                // `pending_frames` belong to the currently active backend
+                // (possibly the candidate `advance_probe` committed
+                // earlier in this same `send_packet` call) and are valid
+                // output the caller will dequeue via `receive_frame`.
+                // Clearing them here would silently drop initial frames
+                // at exactly the cap-overflow / OOM-stress paths.
+                self.probe = None;
+              } else {
+                // Use the checked clone — ffmpeg-next's `Packet::clone`
+                // discards av_packet_ref's return value and would
+                // silently store an empty packet on ENOMEM, corrupting
+                // future replay.
+                match try_clone_packet(packet) {
+                  Ok(cloned) => {
+                    probe.buffered_packets.push(cloned);
+                    probe.buffered_bytes = new_bytes;
+                  }
+                  Err(e) => {
+                    tracing::warn!(
+                      error = %e,
+                      "hwdecode: packet clone failed for probe history; \
+                       abandoning fallback safety net"
+                    );
+                    // Same reasoning as the cap-overflow branch above:
+                    // `pending_frames` are owned by the active backend,
+                    // not the probe buffer, so they survive abandonment.
+                    self.probe = None;
+                  }
                 }
               }
             }
@@ -1222,18 +1243,27 @@ fn try_clone_packet(src: &Packet) -> std::result::Result<Packet, ffmpeg_next::Er
   Ok(dst)
 }
 
-/// Sum of `AVPacket.side_data[i].size` across every entry. `av_packet_ref`
-/// performs a deep copy of side data via `av_packet_copy_props`, so each
-/// probe-buffered clone retains its own copy of every side-data byte.
-/// The probe budget must include this so a stream that keeps payload
-/// small while attaching arbitrarily large side data per packet cannot
-/// blow past `MAX_PROBE_PACKET_BYTES`.
+/// Sum of `AVPacket.side_data[i].size` across every entry, plus
+/// `nb_entries * SIDE_DATA_ENTRY_OVERHEAD` (descriptor + AVBufferRef +
+/// allocator bookkeeping per entry). `av_packet_ref` performs a deep
+/// copy of side data via `av_packet_copy_props`, so each probe-buffered
+/// clone retains every one of these bytes. Charging both keeps
+/// `MAX_PROBE_PACKET_BYTES` a true upper bound — without the overhead,
+/// many zero-size entries slip past the cap on pure descriptor cost.
+///
+/// Walks at most `max_entries` entries even when `side_data_elems`
+/// reports a larger count. Defense-in-depth against a corrupt or hostile
+/// packet whose `side_data_elems` lies about the actual array length:
+/// the caller is expected to also reject any packet whose count exceeds
+/// the cap (so the inflated clone is never created), but bounding the
+/// walk here means a stale or weaponised value can never trigger an
+/// unbounded raw-pointer scan from the safe API.
 ///
 /// Reads only the `size` field of each `AVPacketSideData` entry — never
 /// touches the bindgen `AVPacketSideDataType` enum, so no UB even if a
 /// future FFmpeg adds a side-data type discriminant our build doesn't
 /// know.
-fn packet_side_data_bytes(packet: &Packet) -> usize {
+fn packet_side_data_bytes(packet: &Packet, max_entries: usize) -> usize {
   // SAFETY: AVPacket.side_data is `*mut AVPacketSideData` and
   // side_data_elems is `c_int`; both are raw struct fields safe to read.
   // Field projection (`.size`) does not reconstruct the enum-typed `type_`
@@ -1242,13 +1272,10 @@ fn packet_side_data_bytes(packet: &Packet) -> usize {
     let raw = packet.as_ptr();
     let nel = (*raw).side_data_elems;
     let arr = (*raw).side_data;
-    if arr.is_null() || nel <= 0 {
+    if arr.is_null() || nel <= 0 || max_entries == 0 {
       return 0;
     }
-    let count = nel as usize;
-    // Descriptor + AVBufferRef + allocator overhead per entry — without
-    // this, a packet stuffed with many zero-size entries could slip past
-    // `MAX_PROBE_PACKET_BYTES` purely on descriptor cost.
+    let count = (nel as usize).min(max_entries);
     let mut total = count.saturating_mul(SIDE_DATA_ENTRY_OVERHEAD);
     for i in 0..count {
       let entry = arr.add(i);
@@ -1550,7 +1577,7 @@ mod tests {
     assert!(!p.is_null(), "av_packet_new_side_data returned NULL");
 
     assert_eq!(packet.size(), PAYLOAD_SIZE);
-    let side = packet_side_data_bytes(&packet);
+    let side = packet_side_data_bytes(&packet, MAX_PROBE_PACKET_SIDE_DATA_ENTRIES);
     assert!(
       side >= SIDE_DATA_SIZE,
       "side-data accounting must include the attached buffer; got {side}"
@@ -1565,7 +1592,10 @@ mod tests {
   #[test]
   fn packet_side_data_is_zero_when_no_side_data() {
     let packet = Packet::new(64);
-    assert_eq!(packet_side_data_bytes(&packet), 0);
+    assert_eq!(
+      packet_side_data_bytes(&packet, MAX_PROBE_PACKET_SIDE_DATA_ENTRIES),
+      0
+    );
     assert_eq!(packet_side_data_count(&packet), 0);
   }
 
@@ -1602,13 +1632,63 @@ mod tests {
     );
 
     assert_eq!(packet_side_data_count(&packet), 2);
-    let bytes = packet_side_data_bytes(&packet);
+    let bytes = packet_side_data_bytes(&packet, MAX_PROBE_PACKET_SIDE_DATA_ENTRIES);
     assert!(
       bytes >= 2 * SIDE_DATA_ENTRY_OVERHEAD,
       "must charge descriptor overhead per entry even at zero payload; got {bytes}"
     );
   }
 
+  /// `packet_side_data_bytes` must clamp its walk to `max_entries`
+  /// regardless of `side_data_elems`. Defense-in-depth: the caller is
+  /// expected to short-circuit packets whose count exceeds the cap, but
+  /// if a corrupt or weaponised packet ever does reach the helper, the
+  /// internal cap prevents an unbounded raw-pointer walk.
+  ///
+  /// This test attaches 5 entries of distinct types and asks the helper
+  /// to walk only the first 2. Result must equal exactly `2 * overhead +
+  /// (size_a + size_b)`, confirming entries 3-5 were left out of the sum.
+  #[test]
+  fn packet_side_data_bytes_respects_max_entries_cap() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    let mut packet = Packet::new(0);
+    // Five distinct side-data types so each `av_packet_new_side_data`
+    // call appends rather than replaces.
+    let types_and_sizes: [(AVPacketSideDataType, usize); 5] = [
+      (AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA, 100),
+      (AVPacketSideDataType::AV_PKT_DATA_PALETTE, 200),
+      (AVPacketSideDataType::AV_PKT_DATA_REPLAYGAIN, 300),
+      (AVPacketSideDataType::AV_PKT_DATA_DISPLAYMATRIX, 400),
+      (AVPacketSideDataType::AV_PKT_DATA_STEREO3D, 500),
+    ];
+    for (ty, size) in types_and_sizes {
+      let p = unsafe { av_packet_new_side_data(packet.as_mut_ptr(), ty, size) };
+      assert!(!p.is_null(), "av_packet_new_side_data returned NULL");
+    }
+    assert_eq!(packet_side_data_count(&packet), 5);
+
+    let walked_2 = packet_side_data_bytes(&packet, 2);
+    let walked_5 = packet_side_data_bytes(&packet, 5);
+
+    assert_eq!(
+      walked_2,
+      2 * SIDE_DATA_ENTRY_OVERHEAD + 100 + 200,
+      "max_entries=2 must walk exactly the first two entries"
+    );
+    assert_eq!(
+      walked_5,
+      5 * SIDE_DATA_ENTRY_OVERHEAD + 100 + 200 + 300 + 400 + 500,
+      "max_entries=5 must walk all five entries"
+    );
+    // max_entries=0 short-circuits to 0.
+    assert_eq!(packet_side_data_bytes(&packet, 0), 0);
+    // max_entries larger than the actual count clamps to the actual count
+    // (no out-of-bounds walk past `side_data_elems`).
+    let walked_huge = packet_side_data_bytes(&packet, 1_000_000);
+    assert_eq!(walked_huge, walked_5);
+  }
+
   /// `MAX_PROBE_PACKET_SIDE_DATA_ENTRIES` is the cliff above which a
   /// packet is rejected from the probe buffer regardless of byte total —
   /// pure descriptor inflation is its own attack vector. Sanity-check

From 62c6ff139bff15823cd7fa8db6803d4354835203 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 17:30:03 +1200
Subject: [PATCH 27/27] decoder: add pre-transfer size guard to probe replay

---
 src/decoder.rs | 169 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)

diff --git a/src/decoder.rs b/src/decoder.rs
index 46a099a..a241b68 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -132,6 +132,29 @@ const MAX_PROBE_PACKET_SIDE_DATA_ENTRIES: usize = 64;
 /// would let many tiny entries slip past the cap.
 const SIDE_DATA_ENTRY_OVERHEAD: usize = 80;
 
+/// Conservative upper-bound bytes-per-pixel multiplier used to estimate
+/// the size of a CPU frame **before** `av_hwframe_transfer_data`
+/// allocates its pixel buffers. Covers every HW download format this
+/// crate produces (worst case is `P416LE` / `P412LE` at 6 bytes/pixel
+/// for 16-bit 4:4:4 semi-planar) plus a margin for FFmpeg's per-row
+/// stride alignment (typically 32-byte aligned, ~5% extra at HD widths
+/// and below).
+///
+/// Used by [`drain_into_pending`] as a pre-transfer guard: if the
+/// product `width * height * WORST_CASE_BYTES_PER_PIXEL` would already
+/// push `pending_bytes` past `max_probe_pending_bytes`, the candidate
+/// replay refuses the frame *before* allocating. Without this, FFmpeg
+/// would perform the full HW→CPU download (potentially ~100 MiB for
+/// 8K HDR) and we would only reject the frame after RSS had already
+/// spiked. The post-transfer accounting via [`cpu_frame_bytes`] stays in
+/// place as a backstop using the frame's actual stride/format.
+///
+/// Over-charges 4:2:0 frames (which dominate real workloads) by a wide
+/// margin — NV12 is 1.5 and P010 is 3 bytes/pixel — but that is the
+/// right side to err on. Callers feeding 8K+ workloads through the probe
+/// path can raise [`VideoDecoder::with_max_probe_pending_bytes`] to compensate.
+const WORST_CASE_BYTES_PER_PIXEL: usize = 8;
+
 /// Maximum number of CPU frames we are willing to queue from a candidate
 /// during probe replay. Each frame is a fully-allocated CPU buffer
 /// (~3 MiB for 1080p NV12, ~24 MiB for 4K P010, ~96 MiB for 8K P010), so
@@ -1374,6 +1397,54 @@ fn drain_into_pending(
             errno: libc::ENOMEM,
           });
         }
+        // Pre-transfer size guard: `av_hwframe_transfer_data` will
+        // allocate the CPU buffer based on `hw_buf`'s dimensions. If this
+        // frame's worst-case footprint would push `pending_bytes` past the
+        // cap, refuse the candidate **before** allocating so RSS does
+        // not spike on a frame we'd immediately drop. Uses a width *
+        // height * `WORST_CASE_BYTES_PER_PIXEL` upper bound; the
+        // post-transfer accounting via `cpu_frame_bytes` below stays in
+        // place as a backstop using the actual stride/format.
+        let estimated_bytes = match estimate_transfer_bytes(hw_buf) {
+          Some(b) => b,
+          None => {
+            // SAFETY: AVFrame.width/height are c_int reads.
+            let (w, h) = unsafe {
+              let raw = hw_buf.as_ptr();
+              ((*raw).width, (*raw).height)
+            };
+            tracing::warn!(
+              width = w,
+              height = h,
+              "hwdecode: HW frame dimensions invalid for sizing; failing candidate replay"
+            );
+            unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+            return Err(ffmpeg_next::Error::Other {
+              errno: libc::ENOMEM,
+            });
+          }
+        };
+        let estimated_total = pending_bytes.saturating_add(estimated_bytes);
+        if estimated_total > max_bytes {
+          // SAFETY: AVFrame.width/height are c_int reads.
+          let (w, h) = unsafe {
+            let raw = hw_buf.as_ptr();
+            ((*raw).width, (*raw).height)
+          };
+          tracing::warn!(
+            pending_bytes = *pending_bytes,
+            estimated_bytes,
+            width = w,
+            height = h,
+            max_bytes = max_bytes,
+            "hwdecode: pre-transfer size estimate exceeds cap; \
+             refusing candidate replay before allocating CPU frame"
+          );
+          unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+          return Err(ffmpeg_next::Error::Other {
+            errno: libc::ENOMEM,
+          });
+        }
         let mut cpu = alloc_av_frame()?;
         // SAFETY: hw_buf is a freshly-decoded HW frame;
         // `av_hwframe_transfer_data` allocates pixel buffers on `cpu`.
@@ -1438,6 +1509,33 @@ fn drain_into_pending(
   }
 }
 
+/// Conservative upper-bound estimate of the bytes
+/// `av_hwframe_transfer_data` will allocate when downloading `hw_buf` to
+/// a CPU frame. Used by [`drain_into_pending`] as a pre-transfer guard
+/// so a candidate replay can refuse an over-budget frame *without* first
+/// paying the allocation. The estimate is `width * height *
+/// WORST_CASE_BYTES_PER_PIXEL` (see that constant for why we err on the
+/// high side) and saturates at `usize::MAX` rather than wrapping.
+///
+/// Returns `None` when the frame's `width` or `height` is not strictly
+/// positive (caller treats as candidate failure — a HW frame with
+/// non-positive dimensions cannot be transferred meaningfully).
+fn estimate_transfer_bytes(hw_buf: &frame::Video) -> Option<usize> {
+  // SAFETY: AVFrame.width / height are plain c_int reads via `hw_buf`.
+  let (w, h) = unsafe {
+    let raw = hw_buf.as_ptr();
+    ((*raw).width, (*raw).height)
+  };
+  if w <= 0 || h <= 0 {
+    return None;
+  }
+  Some(
+    (w as usize)
+      .saturating_mul(h as usize)
+      .saturating_mul(WORST_CASE_BYTES_PER_PIXEL),
+  )
+}
+
 /// Approximate resident size of a CPU frame: sum of `linesize[plane] *
 /// plane_height` across populated planes.
 ///
@@ -1775,6 +1873,77 @@ mod tests {
     assert_eq!(cpu_frame_bytes(&f), Some(0));
   }
 
+  /// `estimate_transfer_bytes` is the pre-transfer size guard for
+  /// `drain_into_pending`: for a 1920x1080 frame it must return exactly
+  /// `width * height * WORST_CASE_BYTES_PER_PIXEL`, the bound the candidate
+  /// replay checks *before* `av_hwframe_transfer_data` allocates.
+  #[test]
+  fn estimate_transfer_bytes_uses_worst_case_per_pixel() {
+    let mut f = frame::Video::empty();
+    // SAFETY: f is freshly allocated and owned here; only width/height are set.
+    unsafe {
+      let raw = f.as_mut_ptr();
+      (*raw).width = 1920;
+      (*raw).height = 1080;
+    }
+    assert_eq!(
+      estimate_transfer_bytes(&f),
+      Some(1920 * 1080 * WORST_CASE_BYTES_PER_PIXEL),
+    );
+  }
+
+  /// Non-positive dimensions surface as `None` so `drain_into_pending`
+  /// fails the candidate before allocating anything. Under the raw
+  /// multiplication a zero dimension would yield a 0-byte estimate, pass
+  /// the cap check, and expose the allocation path to whatever the actual
+  /// transfer does. Cases: width 0, then height -1 (frame is test-owned).
+  #[test]
+  fn estimate_transfer_bytes_rejects_non_positive_dimensions() {
+    let mut f = frame::Video::empty();
+    unsafe {
+      let raw = f.as_mut_ptr();
+      (*raw).width = 0;
+      (*raw).height = 1080;
+    }
+    assert!(estimate_transfer_bytes(&f).is_none());
+
+    unsafe {
+      (*f.as_mut_ptr()).width = 1920;
+      (*f.as_mut_ptr()).height = -1;
+    }
+    assert!(estimate_transfer_bytes(&f).is_none());
+  }
+
+  /// 8K HDR P010 has actual ~96 MiB resident size; the estimate should
+  /// over-charge it (the right side to err on for a memory cap) while
+  /// still fitting within the default probe cap,
+  /// [`DEFAULT_MAX_PROBE_PENDING_BYTES`] (256 MiB), for a single
+  /// frame so a default-configured decoder is not forced to reject 8K
+  /// streams outright.
+  #[test]
+  fn estimate_transfer_bytes_8k_fits_default_cap() {
+    let mut f = frame::Video::empty();
+    unsafe {
+      let raw = f.as_mut_ptr();
+      (*raw).width = 7680;
+      (*raw).height = 4320;
+    }
+    let estimate = estimate_transfer_bytes(&f).expect("8K is sizable");
+    // Inclusive bound: an estimate exactly at the default cap still fits.
+    assert!(
+      estimate <= DEFAULT_MAX_PROBE_PENDING_BYTES,
+      "8K estimate {estimate} must fit DEFAULT_MAX_PROBE_PENDING_BYTES \
+       {DEFAULT_MAX_PROBE_PENDING_BYTES}; otherwise the default cap rejects \
+       even a single 8K frame at probe time"
+    );
+    // And strictly larger than a typical 8K P010 frame (~96 MiB) so the
+    // guard is actually conservative, not under-charging.
+    assert!(
+      estimate > 96 * 1024 * 1024,
+      "estimate must over-charge real 8K P010 to bound the worst case; got {estimate}"
+    );
+  }
+
   /// `PartialBuildState`'s `Drop` must be a no-op when both pointers are
   /// null — the disarmed-by-`into_owned` post-state. A panic / double-free
   /// here would break the success path of every `build_state` call.