diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index bd7a668..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# UNRELEASED
-
-# 0.1.2 (January 6th, 2022)
-
-FEATURES
-
-
diff --git a/Cargo.toml b/Cargo.toml
index ff7fe91..9a3b19a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,35 +1,37 @@
 [package]
-name = "template-rs"
-version = "0.0.0"
+name = "hwdecode"
+version = "0.1.0"
 edition = "2021"
-repository = "https://github.com/al8n/template-rs"
-homepage = "https://github.com/al8n/template-rs"
-documentation = "https://docs.rs/template-rs"
-description = "A template for creating Rust open-source repo on GitHub"
+rust-version = "1.95"
+description = "Cross-platform hardware-only video decoder built on top of ffmpeg-next, with auto-probe across HW backends. Callers handle software fallback."
+repository = "https://github.com/findit-ai/hwdecode"
+homepage = "https://github.com/findit-ai/hwdecode"
+documentation = "https://docs.rs/hwdecode"
 license = "MIT OR Apache-2.0"
-rust-version = "1.73"
-
-[[bench]]
-path = "benches/foo.rs"
-name = "foo"
-harness = false
-
-[features]
-default = ["std"]
-alloc = []
-std = []
 
 [dependencies]
+ffmpeg-next = { version = "8.1", default-features = false, features = ["codec", "format"] }
+thiserror = "2"
+tracing = "0.1"
+libc = "0.2"
 
 [dev-dependencies]
 criterion = "0.8"
-tempfile = "3"
+
+[[example]]
+name = "decode"
+path = "examples/decode.rs"
+
+[[bench]]
+name = "decode"
+path = "benches/decode.rs"
+harness = false
 
 [profile.bench]
 opt-level = 3
 debug = false
 codegen-units = 1
-lto = 'thin'
+lto = "thin"
 incremental = false
 debug-assertions = false
 overflow-checks = false
@@ -41,8 +43,6 @@ rustdoc-args = ["--cfg", "docsrs"]
 
 [lints.rust]
 rust_2018_idioms = "warn"
-single_use_lifetimes = "warn"
 unexpected_cfgs = { level = "warn", check-cfg = [
-  'cfg(all_tests)',
   'cfg(tarpaulin)',
 ] }
diff --git a/README-zh_CN.md b/README-zh_CN.md
deleted file mode 100644
index 7a07f4d..0000000
--- a/README-zh_CN.md
+++ /dev/null
@@ -1,51 +0,0 @@
-<div align="center">
-<h1>template-rs</h1>
-</div>
-<div align="center">
-
-开源Rust代码库GitHub模版
-
-[<img alt="github" src="https://img.shields.io/badge/github-al8n/template--rs-8da0cb?style=for-the-badge&logo=Github" height="22">][Github-url]
-<img alt="LoC" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fal8n%2F327b2a8aef9003246e45c6e47fe63937%2Fraw%2Ftemplate-rs" height="22">
-[<img alt="Build" src="https://img.shields.io/github/actions/workflow/status/al8n/template-rs/ci.yml?logo=Github-Actions&style=for-the-badge" height="22">][CI-url]
-[<img alt="codecov" src="https://img.shields.io/codecov/c/gh/al8n/template-rs?style=for-the-badge&token=6R3QFWRWHL&logo=codecov" height="22">][codecov-url]
-
-[<img alt="docs.rs" src="https://img.shields.io/badge/docs.rs-template--rs-66c2a5?style=for-the-badge&labelColor=555555&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K" height="20">][doc-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/v/template-rs?style=for-the-badge&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iaXNvLTg4NTktMSI/Pg0KPCEtLSBHZW5lcmF0b3I6IEFkb2JlIElsbHVzdHJhdG9yIDE5LjAuMCwgU1ZHIEV4cG9ydCBQbHVnLUluIC4gU1ZHIFZlcnNpb246IDYuMDAgQnVpbGQgMCkgIC0tPg0KPHN2ZyB2ZXJzaW9uPSIxLjEiIGlkPSJMYXllcl8xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB4PSIwcHgiIHk9IjBweCINCgkgdmlld0JveD0iMCAwIDUxMiA1MTIiIHhtbDpzcGFjZT0icHJlc2VydmUiPg0KPGc+DQoJPGc+DQoJCTxwYXRoIGQ9Ik0yNTYsMEwzMS41MjgsMTEyLjIzNnYyODcuNTI4TDI1Niw1MTJsMjI0LjQ3Mi0xMTIuMjM2VjExMi4yMzZMMjU2LDB6IE0yMzQuMjc3LDQ1Mi41NjRMNzQuOTc0LDM3Mi45MTNWMTYwLjgxDQoJCQlsMTU5LjMwMyw3OS42NTFWNDUyLjU2NHogTTEwMS44MjYsMTI1LjY2MkwyNTYsNDguNTc2bDE1NC4xNzQsNzcuMDg3TDI1NiwyMDIuNzQ5TDEwMS44MjYsMTI1LjY2MnogTTQzNy4wMjYsMzcyLjkxMw0KCQkJbC0xNTkuMzAzLDc5LjY1MVYyNDAuNDYxbDE1OS4zMDMtNzkuNjUxVjM3Mi45MTN6IiBmaWxsPSIjRkZGIi8+DQoJPC9nPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPC9zdmc+DQo=" height="22">][crates-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/d/template-rs?color=critical&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBzdGFuZGFsb25lPSJubyI/PjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+PHN2ZyB0PSIxNjQ1MTE3MzMyOTU5IiBjbGFzcz0iaWNvbiIgdmlld0JveD0iMCAwIDEwMjQgMTAyNCIgdmVyc2lvbj0iMS4xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHAtaWQ9IjM0MjEiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkzIiB3aWR0aD0iNDgiIGhlaWdodD0iNDgiIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIj48ZGVmcz48c3R5bGUgdHlwZT0idGV4dC9jc3MiPjwvc3R5bGU+PC9kZWZzPjxwYXRoIGQ9Ik00NjkuMzEyIDU3MC4yNHYtMjU2aDg1LjM3NnYyNTZoMTI4TDUxMiA3NTYuMjg4IDM0MS4zMTIgNTcwLjI0aDEyOHpNMTAyNCA2NDAuMTI4QzEwMjQgNzgyLjkxMiA5MTkuODcyIDg5NiA3ODcuNjQ4IDg5NmgtNTEyQzEyMy45MDQgODk2IDAgNzYxLjYgMCA1OTcuNTA0IDAgNDUxLjk2OCA5NC42NTYgMzMxLjUyIDIyNi40MzIgMzAyLjk3NiAyODQuMTYgMTk1LjQ1NiAzOTEuODA4IDEyOCA1MTIgMTI4YzE1Mi4zMiAwIDI4Mi4xMTIgMTA4LjQxNiAzMjMuMzkyIDI2MS4xMkM5NDEuODg4IDQxMy40NCAxMDI0IDUxOS4wNCAxMDI0IDY0MC4xOTJ6IG0tMjU5LjItMjA1LjMxMmMtMjQuNDQ4LTEyOS4wMjQtMTI4Ljg5Ni0yMjIuNzItMjUyLjgtMjIyLjcyLTk3LjI4IDAtMTgzLjA0IDU3LjM0NC0yMjQuNjQgMTQ3LjQ1NmwtOS4yOCAyMC4yMjQtMjAuOTI4IDIuOTQ0Yy0xMDMuMzYgMTQuNC0xNzguMzY4IDEwNC4zMi0xNzguMzY4IDIxNC43MiAwIDExNy45NTIgODguODMyIDIxNC40IDE5Ni45MjggMjE0LjRoNTEyYzg4LjMyIDAgMTU3LjUwNC03NS4xMzYgMTU3LjUwNC0xNzEuNzEyIDAtODguMDY0LTY1LjkyLTE2NC45MjgtMTQ0Ljk2LTE3MS43NzZsLTI5LjUwNC0yLjU2LTUuODg4LTMwLjk3NnoiIGZpbGw9IiNmZmZmZmYiIHAtaWQ9IjM0MjIiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkwIiBjbGFzcz0iIj48L3BhdGg+PC9zdmc+&style=for-the-badge" height="22">][crates-url]
-<img alt="license" src="https://img.shields.io/badge/License-Apache%202.0/MIT-blue.svg?style=for-the-badge&fontColor=white&logoColor=f5c076&logo=data:image/svg+xml;base64,PCFET0NUWVBFIHN2ZyBQVUJMSUMgIi0vL1czQy8vRFREIFNWRyAxLjEvL0VOIiAiaHR0cDovL3d3dy53My5vcmcvR3JhcGhpY3MvU1ZHLzEuMS9EVEQvc3ZnMTEuZHRkIj4KDTwhLS0gVXBsb2FkZWQgdG86IFNWRyBSZXBvLCB3d3cuc3ZncmVwby5jb20sIFRyYW5zZm9ybWVkIGJ5OiBTVkcgUmVwbyBNaXhlciBUb29scyAtLT4KPHN2ZyBmaWxsPSIjZmZmZmZmIiBoZWlnaHQ9IjgwMHB4IiB3aWR0aD0iODAwcHgiIHZlcnNpb249IjEuMSIgaWQ9IkNhcGFfMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayIgdmlld0JveD0iMCAwIDI3Ni43MTUgMjc2LjcxNSIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSIgc3Ryb2tlPSIjZmZmZmZmIj4KDTxnIGlkPSJTVkdSZXBvX2JnQ2FycmllciIgc3Ryb2tlLXdpZHRoPSIwIi8+Cg08ZyBpZD0iU1ZHUmVwb190cmFjZXJDYXJyaWVyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiLz4KDTxnIGlkPSJTVkdSZXBvX2ljb25DYXJyaWVyIj4gPGc+IDxwYXRoIGQ9Ik0xMzguMzU3LDBDNjIuMDY2LDAsMCw2Mi4wNjYsMCwxMzguMzU3czYyLjA2NiwxMzguMzU3LDEzOC4zNTcsMTM4LjM1N3MxMzguMzU3LTYyLjA2NiwxMzguMzU3LTEzOC4zNTcgUzIxNC42NDgsMCwxMzguMzU3LDB6IE0xMzguMzU3LDI1OC43MTVDNzEuOTkyLDI1OC43MTUsMTgsMjA0LjcyMywxOCwxMzguMzU3UzcxLjk5MiwxOCwxMzguMzU3LDE4IHMxMjAuMzU3LDUzLjk5MiwxMjAuMzU3LDEyMC4zNTdTMjA0LjcyMywyNTguNzE1LDEzOC4zNTcsMjU4LjcxNXoiLz4gPHBhdGggZD0iTTE5NC43OTgsMTYwLjkwM2MtNC4xODgtMi42NzctOS43NTMtMS40NTQtMTIuNDMyLDIuNzMyYy04LjY5NCwxMy41OTMtMjMuNTAzLDIxLjcwOC0zOS42MTQsMjEuNzA4IGMtMjUuOTA4LDAtNDYuOTg1LTIxLjA3OC00Ni45ODUtNDYuOTg2czIxLjA3Ny00Ni45ODYsNDYuOTg1LTQ2Ljk4NmMxNS42MzMsMCwzMC4yLDcuNzQ3LDM4Ljk2OCwyMC43MjMgYzIuNzgyLDQuMTE3LDguMzc1LDUuMjAxLDEyLjQ5NiwyLjQxOGM0LjExOC0yLjc4Miw1LjIwMS04LjM3NywyLjQxOC0xMi40OTZjLTEyLjExOC0xNy45MzctMzIuMjYyLTI4LjY0NS01My44ODItMjguNjQ1IGMtMzUuODMzLDAtNjQuOTg1LDI5LjE1Mi02NC45ODUsNjQuOTg2czI5LjE1Miw2NC45ODYsNjQuOTg1LDY0Ljk4NmMyMi4yODEsMCw0Mi43NTktMTEuMjE4LDU0Ljc3OC0zMC4wMDkgQzIwMC4yMDgsMTY5LjE0NywxOTguOTg1LDE2My41ODIsMTk0Ljc5OCwxNjAuOTAzeiIvPiA8L2c+IDwvZz4KDTwvc3ZnPg==" height="22">
-
-[English][en-url] | 简体中文
-
-</div>
-
-## Installation
-
-```toml
-[dependencies]
-template_rs = "0.1"
-```
-
-## Features
-
-- [x] 更快的创建GitHub开源Rust代码库
-
-#### License
-
-`Template-rs` is under the terms of both the MIT license and the
-Apache License (Version 2.0).
-
-See [LICENSE-APACHE](LICENSE-APACHE), [LICENSE-MIT](LICENSE-MIT) for details.
-
-Copyright (c) 2021 Al Liu.
-
-[Github-url]: https://github.com/al8n/template-rs/
-[CI-url]: https://github.com/al8n/template/actions/workflows/template.yml
-[doc-url]: https://docs.rs/template-rs
-[crates-url]: https://crates.io/crates/template-rs
-[codecov-url]: https://app.codecov.io/gh/al8n/template-rs/
-[license-url]: https://opensource.org/licenses/Apache-2.0
-[rustc-url]: https://github.com/rust-lang/rust/blob/master/RELEASES.md
-[license-apache-url]: https://opensource.org/licenses/Apache-2.0
-[license-mit-url]: https://opensource.org/licenses/MIT
-[en-url]: https://github.com/al8n/template-rs/tree/main/README.md
diff --git a/README.md b/README.md
index 1af27e2..3da5fba 100644
--- a/README.md
+++ b/README.md
@@ -1,46 +1,115 @@
-<div align="center">
-<h1>template-rs</h1>
-</div>
-<div align="center">
+# hwdecode
 
-A template for creating Rust open-source GitHub repo.
+Cross-platform hardware-accelerated video decoder for Rust, built on top of
+[`ffmpeg-next`](https://crates.io/crates/ffmpeg-next).
 
-[<img alt="github" src="https://img.shields.io/badge/github-al8n/template--rs-8da0cb?style=for-the-badge&logo=Github" height="22">][Github-url]
-<img alt="LoC" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fal8n%2F327b2a8aef9003246e45c6e47fe63937%2Fraw%2Ftemplate-rs" height="22">
-[<img alt="Build" src="https://img.shields.io/github/actions/workflow/status/al8n/template-rs/ci.yml?logo=Github-Actions&style=for-the-badge" height="22">][CI-url]
-[<img alt="codecov" src="https://img.shields.io/codecov/c/gh/al8n/template-rs?style=for-the-badge&token=6R3QFWRWHL&logo=codecov" height="22">][codecov-url]
+`VideoDecoder` mirrors the `send_packet` / `receive_frame` interface of
+`ffmpeg::decoder::Video` and auto-probes the host's hardware backends.
+This crate is **hardware-only** — there is no software fallback inside it.
+If no hardware backend can decode the stream, `Error::AllBackendsFailed`
+surfaces from `VideoDecoder::open` (when no backend opens) or from
+`receive_frame` / `send_packet` / `send_eof` (when the initially-opened
+backend fails at decode time and every remaining backend in the probe order
+also fails — the only way it surfaces on single-backend platforms like macOS).
+The caller decides how to fall back (typically by opening an
+`ffmpeg::decoder::Video` directly). Output frames are CPU-side, downloaded
+with `av_hwframe_transfer_data` (NV12 for 8-bit, P010 for 10-bit).
+Pixel-format conversion is intentionally out of scope; safe per-row access is via
+`Frame::row` / `Frame::rows` (clipped to visible byte width — never includes
+FFmpeg's per-row alignment padding).
 
-[<img alt="docs.rs" src="https://img.shields.io/badge/docs.rs-template--rs-66c2a5?style=for-the-badge&labelColor=555555&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K" height="20">][doc-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/v/template-rs?style=for-the-badge&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iaXNvLTg4NTktMSI/Pg0KPCEtLSBHZW5lcmF0b3I6IEFkb2JlIElsbHVzdHJhdG9yIDE5LjAuMCwgU1ZHIEV4cG9ydCBQbHVnLUluIC4gU1ZHIFZlcnNpb246IDYuMDAgQnVpbGQgMCkgIC0tPg0KPHN2ZyB2ZXJzaW9uPSIxLjEiIGlkPSJMYXllcl8xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB4PSIwcHgiIHk9IjBweCINCgkgdmlld0JveD0iMCAwIDUxMiA1MTIiIHhtbDpzcGFjZT0icHJlc2VydmUiPg0KPGc+DQoJPGc+DQoJCTxwYXRoIGQ9Ik0yNTYsMEwzMS41MjgsMTEyLjIzNnYyODcuNTI4TDI1Niw1MTJsMjI0LjQ3Mi0xMTIuMjM2VjExMi4yMzZMMjU2LDB6IE0yMzQuMjc3LDQ1Mi41NjRMNzQuOTc0LDM3Mi45MTNWMTYwLjgxDQoJCQlsMTU5LjMwMyw3OS42NTFWNDUyLjU2NHogTTEwMS44MjYsMTI1LjY2MkwyNTYsNDguNTc2bDE1NC4xNzQsNzcuMDg3TDI1NiwyMDIuNzQ5TDEwMS44MjYsMTI1LjY2MnogTTQzNy4wMjYsMzcyLjkxMw0KCQkJbC0xNTkuMzAzLDc5LjY1MVYyNDAuNDYxbDE1OS4zMDMtNzkuNjUxVjM3Mi45MTN6IiBmaWxsPSIjRkZGIi8+DQoJPC9nPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPGc+DQo8L2c+DQo8Zz4NCjwvZz4NCjxnPg0KPC9nPg0KPC9zdmc+DQo=" height="22">][crates-url]
-[<img alt="crates.io" src="https://img.shields.io/crates/d/template-rs?color=critical&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBzdGFuZGFsb25lPSJubyI/PjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+PHN2ZyB0PSIxNjQ1MTE3MzMyOTU5IiBjbGFzcz0iaWNvbiIgdmlld0JveD0iMCAwIDEwMjQgMTAyNCIgdmVyc2lvbj0iMS4xIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHAtaWQ9IjM0MjEiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkzIiB3aWR0aD0iNDgiIGhlaWdodD0iNDgiIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIj48ZGVmcz48c3R5bGUgdHlwZT0idGV4dC9jc3MiPjwvc3R5bGU+PC9kZWZzPjxwYXRoIGQ9Ik00NjkuMzEyIDU3MC4yNHYtMjU2aDg1LjM3NnYyNTZoMTI4TDUxMiA3NTYuMjg4IDM0MS4zMTIgNTcwLjI0aDEyOHpNMTAyNCA2NDAuMTI4QzEwMjQgNzgyLjkxMiA5MTkuODcyIDg5NiA3ODcuNjQ4IDg5NmgtNTEyQzEyMy45MDQgODk2IDAgNzYxLjYgMCA1OTcuNTA0IDAgNDUxLjk2OCA5NC42NTYgMzMxLjUyIDIyNi40MzIgMzAyLjk3NiAyODQuMTYgMTk1LjQ1NiAzOTEuODA4IDEyOCA1MTIgMTI4YzE1Mi4zMiAwIDI4Mi4xMTIgMTA4LjQxNiAzMjMuMzkyIDI2MS4xMkM5NDEuODg4IDQxMy40NCAxMDI0IDUxOS4wNCAxMDI0IDY0MC4xOTJ6IG0tMjU5LjItMjA1LjMxMmMtMjQuNDQ4LTEyOS4wMjQtMTI4Ljg5Ni0yMjIuNzItMjUyLjgtMjIyLjcyLTk3LjI4IDAtMTgzLjA0IDU3LjM0NC0yMjQuNjQgMTQ3LjQ1NmwtOS4yOCAyMC4yMjQtMjAuOTI4IDIuOTQ0Yy0xMDMuMzYgMTQuNC0xNzguMzY4IDEwNC4zMi0xNzguMzY4IDIxNC43MiAwIDExNy45NTIgODguODMyIDIxNC40IDE5Ni45MjggMjE0LjRoNTEyYzg4LjMyIDAgMTU3LjUwNC03NS4xMzYgMTU3LjUwNC0xNzEuNzEyIDAtODguMDY0LTY1LjkyLTE2NC45MjgtMTQ0Ljk2LTE3MS43NzZsLTI5LjUwNC0yLjU2LTUuODg4LTMwLjk3NnoiIGZpbGw9IiNmZmZmZmYiIHAtaWQ9IjM0MjIiIGRhdGEtc3BtLWFuY2hvci1pZD0iYTMxM3guNzc4MTA2OS4wLmkwIiBjbGFzcz0iIj48L3BhdGg+PC9zdmc+&style=for-the-badge" height="22">][crates-url]
-<img alt="license" src="https://img.shields.io/badge/License-Apache%202.0/MIT-blue.svg?style=for-the-badge&fontColor=white&logoColor=f5c076&logo=data:image/svg+xml;base64,PCFET0NUWVBFIHN2ZyBQVUJMSUMgIi0vL1czQy8vRFREIFNWRyAxLjEvL0VOIiAiaHR0cDovL3d3dy53My5vcmcvR3JhcGhpY3MvU1ZHLzEuMS9EVEQvc3ZnMTEuZHRkIj4KDTwhLS0gVXBsb2FkZWQgdG86IFNWRyBSZXBvLCB3d3cuc3ZncmVwby5jb20sIFRyYW5zZm9ybWVkIGJ5OiBTVkcgUmVwbyBNaXhlciBUb29scyAtLT4KPHN2ZyBmaWxsPSIjZmZmZmZmIiBoZWlnaHQ9IjgwMHB4IiB3aWR0aD0iODAwcHgiIHZlcnNpb249IjEuMSIgaWQ9IkNhcGFfMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayIgdmlld0JveD0iMCAwIDI3Ni43MTUgMjc2LjcxNSIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSIgc3Ryb2tlPSIjZmZmZmZmIj4KDTxnIGlkPSJTVkdSZXBvX2JnQ2FycmllciIgc3Ryb2tlLXdpZHRoPSIwIi8+Cg08ZyBpZD0iU1ZHUmVwb190cmFjZXJDYXJyaWVyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiLz4KDTxnIGlkPSJTVkdSZXBvX2ljb25DYXJyaWVyIj4gPGc+IDxwYXRoIGQ9Ik0xMzguMzU3LDBDNjIuMDY2LDAsMCw2Mi4wNjYsMCwxMzguMzU3czYyLjA2NiwxMzguMzU3LDEzOC4zNTcsMTM4LjM1N3MxMzguMzU3LTYyLjA2NiwxMzguMzU3LTEzOC4zNTcgUzIxNC42NDgsMCwxMzguMzU3LDB6IE0xMzguMzU3LDI1OC43MTVDNzEuOTkyLDI1OC43MTUsMTgsMjA0LjcyMywxOCwxMzguMzU3UzcxLjk5MiwxOCwxMzguMzU3LDE4IHMxMjAuMzU3LDUzLjk5MiwxMjAuMzU3LDEyMC4zNTdTMjA0LjcyMywyNTguNzE1LDEzOC4zNTcsMjU4LjcxNXoiLz4gPHBhdGggZD0iTTE5NC43OTgsMTYwLjkwM2MtNC4xODgtMi42NzctOS43NTMtMS40NTQtMTIuNDMyLDIuNzMyYy04LjY5NCwxMy41OTMtMjMuNTAzLDIxLjcwOC0zOS42MTQsMjEuNzA4IGMtMjUuOTA4LDAtNDYuOTg1LTIxLjA3OC00Ni45ODUtNDYuOTg2czIxLjA3Ny00Ni45ODYsNDYuOTg1LTQ2Ljk4NmMxNS42MzMsMCwzMC4yLDcuNzQ3LDM4Ljk2OCwyMC43MjMgYzIuNzgyLDQuMTE3LDguMzc1LDUuMjAxLDEyLjQ5NiwyLjQxOGM0LjExOC0yLjc4Miw1LjIwMS04LjM3NywyLjQxOC0xMi40OTZjLTEyLjExOC0xNy45MzctMzIuMjYyLTI4LjY0NS01My44ODItMjguNjQ1IGMtMzUuODMzLDAtNjQuOTg1LDI5LjE1Mi02NC45ODUsNjQuOTg2czI5LjE1Miw2NC45ODYsNjQuOTg1LDY0Ljk4NmMyMi4yODEsMCw0Mi43NTktMTEuMjE4LDU0Ljc3OC0zMC4wMDkgQzIwMC4yMDgsMTY5LjE0NywxOTguOTg1LDE2My41ODIsMTk0Ljc5OCwxNjAuOTAzeiIvPiA8L2c+IDwvZz4KDTwvc3ZnPg==" height="22">
+## Backends
 
-English | [简体中文][zh-cn-url]
+| Target              | Probe order (HW only)             |
+| ------------------- | --------------------------------- |
+| macOS / iOS / tvOS  | VideoToolbox                      |
+| Linux               | VAAPI → CUDA                      |
+| Windows             | D3D11VA → CUDA                    |
+| other               | (none)                            |
 
-</div>
+If `open` returns `Error::AllBackendsFailed`, software fallback is the
+caller's responsibility (this crate intentionally does not include one).
 
-## Installation
+## Usage
 
-```toml
-[dependencies]
-template_rs = "0.1"
+```rust,no_run
+use ffmpeg_next as ffmpeg;
+use ffmpeg::{format, media};
+use hwdecode::{Frame, VideoDecoder};
+
+ffmpeg::init()?;
+
+let mut input = format::input(path)?;
+let stream = input.streams().best(media::Type::Video).unwrap();
+let stream_index = stream.index();
+
+// HW-only open. On AllBackendsFailed, fall back to software yourself.
+let mut decoder = match VideoDecoder::open(stream.parameters()) {
+    Ok(d) => d,
+    Err(hwdecode::Error::AllBackendsFailed { .. }) => {
+        // Caller-side software fallback.
+        let _sw = ffmpeg::codec::Context::from_parameters(stream.parameters())?
+            .decoder()
+            .video()?;
+        // ... drive _sw with send_packet / receive_frame yourself ...
+        return Ok(());
+    }
+    Err(e) => return Err(e.into()),
+};
+println!("backend = {:?}", decoder.backend());
+
+let mut frame = Frame::empty()?;
+for (s, packet) in input.packets() {
+    if s.index() != stream_index { continue; }
+    decoder.send_packet(&packet)?;
+    while decoder.receive_frame(&mut frame).is_ok() {
+        // frame.pix_fmt() is the integer constant — match against
+        // hwdecode::pix_fmt::{NV12, P010LE, ...} and dispatch to your
+        // pixel-format pipeline (e.g. `colconv`).
+        // ... do something with frame ...
+    }
+}
+decoder.send_eof()?;
+while decoder.receive_frame(&mut frame).is_ok() {
+    // ... drain ... (note: `.is_ok()` also ends the loop on real errors — match on `Err` in production code)
+}
 ```
 
-## Features
-- [x] Create a Rust open-source repo fast 
+To force a specific hardware backend (no probe, no fallback):
+
+```rust
+use hwdecode::{Backend, VideoDecoder};
+let decoder = VideoDecoder::open_with(parameters, Backend::VideoToolbox)?;
+```
+
+`hwdecode` is hardware-only: there is no `Backend::Software`. If `open`
+returns `Error::AllBackendsFailed`, fall back to a software decoder
+yourself (typically `ffmpeg::decoder::Video`).
+
+## Running tests and benches
+
+The integration test and benchmark expect a real video file. Set
+`HWDECODE_SAMPLE_VIDEO` to enable them:
+
+```sh
+HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test
+HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test --test hw_smoke -- --ignored
+HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo bench
+```
 
-#### License
+Without the env var the integration test skips with a notice; unit tests run
+unconditionally.
 
-`template-rs` is under the terms of both the MIT license and the
-Apache License (Version 2.0).
+## Build requirements
 
-See [LICENSE-APACHE](LICENSE-APACHE), [LICENSE-MIT](LICENSE-MIT) for details.
+- A system FFmpeg ≥ **5.1** linkable via `pkg-config` (we reference
+  `AV_PIX_FMT_P212LE` / `AV_PIX_FMT_P412LE`, which were added in 5.1).
+  Tested against 8.1. Verify with
+  `ffmpeg -hwaccels` that your build has the backends you expect compiled in
+  (e.g. `videotoolbox` on macOS, `vaapi` / `cuda` on Linux,
+  `d3d11va` / `cuda` on Windows).
+- Rust ≥ 1.95.
 
-Copyright (c) 2021 Al Liu.
+## License
 
-[Github-url]: https://github.com/al8n/template-rs/
-[CI-url]: https://github.com/al8n/template-rs/actions/workflows/ci.yml
-[doc-url]: https://docs.rs/template-rs
-[crates-url]: https://crates.io/crates/template-rs
-[codecov-url]: https://app.codecov.io/gh/al8n/template-rs/
-[zh-cn-url]: https://github.com/al8n/template-rs/tree/main/README-zh_CN.md
+MIT or Apache-2.0, at your option.
diff --git a/benches/decode.rs b/benches/decode.rs
new file mode 100644
index 0000000..9e53f0a
--- /dev/null
+++ b/benches/decode.rs
@@ -0,0 +1,173 @@
+//! Benchmark comparing software-only decode (via `ffmpeg-next` directly,
+//! since `hwdecode` is hardware-only) against `hwdecode`'s auto-probed
+//! hardware backend on the same input file.
+//!
+//! Set `HWDECODE_SAMPLE_VIDEO` to a video file path. The hardware bench is
+//! skipped (with a notice) when no hardware backend is available on the host.
+//!
+//! ```sh
+//! HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo bench
+//! ```
+
+use std::{path::PathBuf, time::Duration};
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use ffmpeg::{codec::Context as CodecContext, format, frame, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::{Frame, VideoDecoder};
+
+const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
+
+fn sample_path() -> Option<PathBuf> {
+  Some(PathBuf::from(std::env::var_os(SAMPLE_ENV)?)) // `None` when the variable is unset
+}
+
+/// Decodes every video frame of `path` through `hwdecode`'s auto-probed hardware backend and returns the frame count.
+fn decode_all_hw(path: &PathBuf) -> Result<usize, hwdecode::Error> {
+  let mut input = format::input(path).map_err(hwdecode::Error::Ffmpeg)?;
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .ok_or(hwdecode::Error::Ffmpeg(ffmpeg::Error::StreamNotFound))?;
+  let stream_index = stream.index();
+
+  let mut decoder = VideoDecoder::open(stream.parameters())?;
+  let mut frame = Frame::empty()?; // single output frame, reused for every `receive_frame` call
+  let mut count = 0_usize;
+
+  let mut drain = |decoder: &mut VideoDecoder, count: &mut usize| -> Result<(), hwdecode::Error> {
+    loop {
+      match decoder.receive_frame(&mut frame) {
+        Ok(()) => *count += 1,
+        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+          if errno == ffmpeg::error::EAGAIN =>
+        {
+          return Ok(()); // EAGAIN is not a failure here: it ends this drain pass and control returns to the caller
+        }
+        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Eof)) => return Ok(()), // EOF also ends the drain cleanly
+        Err(e) => return Err(e),
+      }
+    }
+  };
+
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue; // packet belongs to a stream other than the selected video stream
+    }
+    decoder.send_packet(&packet)?;
+    drain(&mut decoder, &mut count)?;
+  }
+  decoder.send_eof()?;
+  drain(&mut decoder, &mut count)?; // drain once more after `send_eof` so frames received post-EOF are counted
+  Ok(count)
+}
+
+/// Decodes every video frame of `path` with a plain software `ffmpeg-next` decoder and returns the
+/// frame count. Used as the SW baseline since `hwdecode` no longer exposes a software backend.
+fn decode_all_sw(path: &PathBuf) -> Result<usize, ffmpeg::Error> {
+  let mut input = format::input(path)?;
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .ok_or(ffmpeg::Error::StreamNotFound)?;
+  let stream_index = stream.index();
+  let mut decoder = CodecContext::from_parameters(stream.parameters())?
+    .decoder()
+    .video()?;
+
+  let mut frame = frame::Video::empty(); // single output frame, reused for every `receive_frame` call
+  let mut count = 0_usize;
+
+  let mut drain =
+    |decoder: &mut ffmpeg::decoder::Video, count: &mut usize| -> Result<(), ffmpeg::Error> {
+      loop {
+        match decoder.receive_frame(&mut frame) {
+          Ok(()) => *count += 1,
+          Err(ffmpeg::Error::Other { errno }) if errno == ffmpeg::error::EAGAIN => return Ok(()), // EAGAIN ends this drain pass without error
+          Err(ffmpeg::Error::Eof) => return Ok(()), // EOF also ends the drain cleanly
+          Err(e) => return Err(e),
+        }
+      }
+    };
+
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue; // packet belongs to a stream other than the selected video stream
+    }
+    decoder.send_packet(&packet)?;
+    drain(&mut decoder, &mut count)?;
+  }
+  decoder.send_eof()?;
+  drain(&mut decoder, &mut count)?; // drain once more after `send_eof` so frames received post-EOF are counted
+  Ok(count)
+}
+
+fn bench_decode(c: &mut Criterion) { // Criterion entry point: probes for a HW backend, then benches SW decode and (if one opened) HW decode
+  ffmpeg::init().expect("ffmpeg init");
+
+  let Some(path) = sample_path() else {
+    eprintln!("skipping benches: set {SAMPLE_ENV} to a video file path");
+    return; // no sample file configured, so nothing is benchmarked
+  };
+
+  // Probe by decoding one frame so the probe collapses to the backend that
+  // actually produced output. None means no HW backend is available — we
+  // skip the HW arm and bench SW only.
+  let probed_backend = {
+    let mut input = format::input(&path).expect("open input");
+    let stream = input
+      .streams()
+      .best(media::Type::Video)
+      .expect("video stream");
+    let stream_index = stream.index();
+    match VideoDecoder::open(stream.parameters()) {
+      Ok(mut dec) => {
+        let mut frame = Frame::empty().expect("alloc probe frame");
+        'probe: for (s, packet) in input.packets() {
+          if s.index() != stream_index {
+            continue;
+          }
+          dec.send_packet(&packet).expect("probe send_packet");
+          match dec.receive_frame(&mut frame) {
+            Ok(()) => break 'probe, // first frame obtained: the probe is done
+            Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+              if errno == ffmpeg::error::EAGAIN =>
+            {
+              continue; // no frame yet: send the next packet
+            }
+            Err(e) => panic!("probe receive_frame: {e}"),
+          }
+        }
+        Some(dec.backend()) // also reached if the packets run out before any frame is produced
+      }
+      Err(hwdecode::Error::AllBackendsFailed { .. }) => None,
+      Err(e) => panic!("hwdecode probe: {e}"),
+    }
+  };
+  match probed_backend {
+    Some(b) => eprintln!("auto-probe settled on backend: {b:?}"),
+    None => eprintln!("no hardware backend available — hardware bench will be skipped"),
+  }
+
+  let mut group = c.benchmark_group("decode");
+  group.measurement_time(Duration::from_secs(15));
+  group.sample_size(20);
+
+  group.bench_function("software", |b| {
+    b.iter(|| decode_all_sw(&path).expect("software decode")) // each iteration re-opens the file and decodes it end to end
+  });
+
+  if probed_backend.is_some() {
+    group.bench_function("hardware", |b| {
+      b.iter(|| {
+        let n = decode_all_hw(&path).expect("hardware decode"); // likewise a full open + decode per iteration
+        std::hint::black_box(n);
+      })
+    });
+  }
+
+  group.finish();
+}
+
+criterion_group!(benches, bench_decode);
+criterion_main!(benches);
diff --git a/benches/foo.rs b/benches/foo.rs
deleted file mode 100644
index f328e4d..0000000
--- a/benches/foo.rs
+++ /dev/null
@@ -1 +0,0 @@
-fn main() {}
diff --git a/docs/design.md b/docs/design.md
new file mode 100644
index 0000000..521dd49
--- /dev/null
+++ b/docs/design.md
@@ -0,0 +1,120 @@
+# hwdecode — design
+
+Cross-platform **hardware-only** video decoder built on top of `ffmpeg-next` 8.1.
+
+> **Status note.** This document was the original spec from the brainstorm
+> phase and parts have evolved since: the crate is hardware-only (no
+> `Backend::Software`), `Frame` is its own safe wrapper, and several
+> pixel-format / safety details were tightened during review. For the canonical
+> behavior, read `src/lib.rs` and `README.md`. Sections below have been
+> trimmed where they conflicted; the spec is otherwise preserved as
+> historical context.
+
+## Goals
+
+- Drop-in replacement for `ffmpeg::decoder::Video` at the call site (`send_packet` / `receive_frame` / `send_eof` / `flush`).
+- Auto-probe the platform's hardware backends. **No software fallback inside this crate** — callers handle that themselves (e.g. via `ffmpeg::decoder::Video`) when `open` returns `Error::AllBackendsFailed`.
+- Hand back native-format CPU frames (NV12/P010 from the HW path post-transfer). Pixel-format conversion is the caller's responsibility (e.g. via `colconv`).
+- Cross-platform: macOS / iOS / iPadOS / tvOS, Linux (Intel/AMD/NVIDIA), Windows (any GPU + CUDA on NVIDIA).
+
+## Non-goals
+
+- Audio hardware decoding. Out of scope; software AAC/Opus/etc. is fast enough that the complexity isn't justified.
+- Demuxing. Callers open files/streams themselves (e.g. via `findit-demuxer`) and feed packets in.
+- Pixel-format conversion. Done downstream (`colconv`).
+- Encoding.
+
+## Public API
+
+> The original spec listed an inline API surface here. It diverged from the
+> shipping crate (`Backend::Software` was removed; `format() -> Pixel` was
+> removed in favor of `Frame::pix_fmt() -> i32`; the `Frame` wrapper
+> replaced `frame::Video`; `Error` gained / dropped variants). Rather than
+> keep stale signatures here, the canonical reference is `src/lib.rs` and
+> the public docs on each item. See the README for a runnable usage
+> example.
+
+## Behavior
+
+### Probe order
+
+| Target              | Order tried (HW only)                        |
+| ------------------- | -------------------------------------------- |
+| macOS, iOS, tvOS    | `[VideoToolbox]`                             |
+| Linux               | `[Vaapi, Cuda]`                              |
+| Windows             | `[D3d11va, Cuda]`                            |
+| Other               | `[]` → `Error::AllBackendsFailed`            |
+
+A HW backend is a candidate only if **(a)** its `AVHWDeviceType` device can be created via `av_hwdevice_ctx_create`, and **(b)** the codec advertises support via `avcodec_get_hw_config` matching that device type. The first candidate that fully opens wins. Each failure logs `tracing::warn!` with the backend and the underlying error and the loop tries the next. If every backend fails (or the platform has none), `open` returns `Error::AllBackendsFailed`; software fallback is the caller's responsibility.
+
+### Device selection
+
+Always device 0 / system default (`av_hwdevice_ctx_create(.., NULL, ..)`). No env var, no config knob in v1. Add later if the multi-GPU use case appears.
+
+### `get_format` callback
+
+A static `extern "C"` callback. The decoder context's `opaque` field points to a small heap-allocated `CallbackState`. The callback walks the offered `pix_fmts` list as raw `i32` (avoiding bindgen-enum UB on header skew), returns `wanted` if present, else `AV_PIX_FMT_NONE` (which causes the decoder to fail; the caller-side probe loop then tears down and tries the next hardware backend).
+
+### Frame transfer
+
+`receive_frame` always:
+
+1. Reads from the codec into an internal `hw_frame: ffmpeg::frame::Video` (allocated once, reused).
+2. If the frame's format is the HW pix fmt, calls `av_hwframe_transfer_data(out, hw_frame, 0)` into the caller's `&mut frame`. Copies `pts`, `pkt_dts`, `time_base`, `duration` (FFmpeg does not transfer timing).
+3. Otherwise (SW path or decoder fell back mid-stream), clones the frame into the caller's slot.
+
+### Threading
+
+`VideoDecoder: Send + !Sync`. Each instance owns its own `AVCodecContext` and `AVBufferRef*`. Multiple decoders can run on different threads; a single decoder is not concurrent.
+
+### Drop
+
+`Drop` calls `av_buffer_unref(&mut self.hw_device_ref)` if non-null, frees the boxed `CallbackState`, then lets `ffmpeg::decoder::Video`'s own Drop free the codec context.
+
+## Internals
+
+```text
+src/
+├── lib.rs       // re-exports + crate-level docs
+├── error.rs     // Error enum
+├── backend.rs   // Backend enum, probe order, AVHWDeviceType <-> Backend mapping
+├── decoder.rs   // VideoDecoder, open/open_with, send/receive
+└── ffi.rs       // get_format callback, av_hwdevice_ctx_create / transfer wrappers,
+                 // avcodec_get_hw_config probe
+```
+
+No other modules. Keep the surface small.
+
+## Build & dependencies
+
+- `ffmpeg-next = { version = "8.1", default-features = false, features = ["codec", "format"] }`
+- `thiserror = "2"`
+- `tracing = "0.1"`
+- `libc = "0.2"`
+
+No platform-specific Cargo features. `cfg!(target_os = ...)` selects which `AVHWDeviceType` constants we even attempt — the FFI symbols are linked unconditionally via `ffmpeg-sys-next`.
+
+System FFmpeg ≥ **5.1** (we reference `AV_PIX_FMT_P212LE` / `AV_PIX_FMT_P412LE`,
+added upstream in 5.1). Verified against the macOS Homebrew build (FFmpeg 8.1,
+VideoToolbox enabled).
+
+## Testing
+
+1. **Unit tests** (`src/backend.rs`, `src/error.rs`) — pure-Rust: probe-order construction per platform, `Backend` ↔ `AVHWDeviceType` mapping, error formatting.
+2. **Integration** (`tests/decode.rs`) — opens a sample H.264 file via `ffmpeg::format::input`, decodes 30 frames through `VideoDecoder::open` (auto-probe), asserts frame count and dimensions. Sample path comes from env var `HWDECODE_SAMPLE_VIDEO`; test is skipped with a clear `eprintln!` if unset.
+3. **HW smoke** (`tests/hw_smoke.rs`, `#[ignore]`) — same decode, asserts `decoder.backend()` returns one of the hardware variants (the enum no longer has a Software variant; this is a sanity check against accidental no-op selection). CI runs this on platform-matched runners.
+
+Sample-file env var keeps the repo binary-free. Documented in `README.md`.
+
+## Benchmark
+
+`benches/decode.rs` (criterion) — a single `bench_decode` entry point that registers two benchmarks in the `decode` group:
+
+- `software` — drives `ffmpeg::decoder::Video` directly (this crate has no software backend), decodes all frames, measures wall-clock per full-file decode.
+- `hardware` — `VideoDecoder::open(..)` (auto-probe), decodes all frames. Skipped if `open` returns `AllBackendsFailed` (no HW backend available on this host).
+
+Both use the same `HWDECODE_SAMPLE_VIDEO` file. Bench prints which backend the HW run actually used, so results are interpretable across machines.
+
+## Examples
+
+`examples/decode.rs` — opens a path from `argv[1]` with `ffmpeg::format::input`, finds the best video stream, feeds packets through `VideoDecoder`, prints the selected backend and decoder dimensions once after `open`, then the frame number, `pts`, width, height, and `pix_fmt` for each frame.
diff --git a/examples/decode.rs b/examples/decode.rs
new file mode 100644
index 0000000..1d14de1
--- /dev/null
+++ b/examples/decode.rs
@@ -0,0 +1,90 @@
+//! Decode every video frame in `argv[1]`, printing one line per frame.
+//!
+//! ```sh
+//! cargo run --release --example decode -- /path/to/video.mp4
+//! ```
+
+use ffmpeg::{format, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::{Frame, VideoDecoder};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+  let path = std::env::args()
+    .nth(1)
+    .ok_or("usage: decode <video-file>")?;
+
+  ffmpeg::init()?;
+
+  let mut input = format::input(&path)?;
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .ok_or("no video stream")?;
+  let stream_index = stream.index();
+
+  let mut decoder = match VideoDecoder::open(stream.parameters()) {
+    Ok(d) => d,
+    Err(hwdecode::Error::AllBackendsFailed { attempts }) => {
+      eprintln!(
+        "no hardware backend available; tried {} backend(s):",
+        attempts.len()
+      );
+      for (b, e) in &attempts {
+        eprintln!("  {b:?}: {e}");
+      }
+      eprintln!("(callers handle software fallback themselves — see ffmpeg::decoder::Video)");
+      return Ok(());
+    }
+    Err(e) => return Err(e.into()),
+  };
+  println!(
+    "open: backend={:?} {}x{}",
+    decoder.backend(),
+    decoder.width(),
+    decoder.height(),
+  );
+
+  let mut frame = Frame::empty()?;
+  let mut count: u64 = 0;
+
+  let drain = |decoder: &mut VideoDecoder, frame: &mut Frame, count: &mut u64| loop {
+    match decoder.receive_frame(frame) {
+      Ok(()) => {
+        *count += 1;
+        println!(
+          "frame#{count} pts={:?} {}x{} pix_fmt={}",
+          frame.pts(),
+          frame.width(),
+          frame.height(),
+          frame.pix_fmt(),
+        );
+      }
+      Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+        if errno == ffmpeg::error::EAGAIN =>
+      {
+        break
+      }
+      Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Eof)) => break,
+      Err(e) => {
+        eprintln!("decode error: {e}");
+        break;
+      }
+    }
+  };
+
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue;
+    }
+    decoder.send_packet(&packet)?;
+    drain(&mut decoder, &mut frame, &mut count);
+  }
+  decoder.send_eof()?;
+  drain(&mut decoder, &mut frame, &mut count);
+
+  println!(
+    "decoded {count} frames; final backend={:?}",
+    decoder.backend()
+  );
+  Ok(())
+}
diff --git a/examples/foo.rs b/examples/foo.rs
deleted file mode 100644
index f328e4d..0000000
--- a/examples/foo.rs
+++ /dev/null
@@ -1 +0,0 @@
-fn main() {}
diff --git a/src/backend.rs b/src/backend.rs
new file mode 100644
index 0000000..00cf82e
--- /dev/null
+++ b/src/backend.rs
@@ -0,0 +1,118 @@
+use ffmpeg_next::ffi::{AVHWDeviceType, AVPixelFormat};
+
+/// Hardware decoding backend.
+///
+/// `hwdecode` only manages **hardware** decoders — software fallback is
+/// out of scope. If no backend in [`probe_order`] for the current platform
+/// can decode a stream, [`crate::VideoDecoder::open`] returns
+/// [`crate::Error::AllBackendsFailed`] and the caller decides how to fall
+/// back (e.g. by opening an `ffmpeg::decoder::Video` directly).
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum Backend {
+  /// Apple VideoToolbox (macOS, iOS, iPadOS, tvOS, visionOS).
+  VideoToolbox,
+  /// Linux Video Acceleration API (Intel / AMD GPUs).
+  Vaapi,
+  /// NVIDIA NVDEC via CUDA (Linux / Windows on NVIDIA hardware).
+  Cuda,
+  /// Microsoft Direct3D 11 Video Acceleration (Windows).
+  D3d11va,
+}
+
+impl Backend {
+  /// `AVHWDeviceType` corresponding to this backend.
+  pub(crate) fn av_hwdevice_type(self) -> AVHWDeviceType {
+    match self {
+      Self::VideoToolbox => AVHWDeviceType::AV_HWDEVICE_TYPE_VIDEOTOOLBOX,
+      Self::Vaapi => AVHWDeviceType::AV_HWDEVICE_TYPE_VAAPI,
+      Self::Cuda => AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA,
+      Self::D3d11va => AVHWDeviceType::AV_HWDEVICE_TYPE_D3D11VA,
+    }
+  }
+
+  /// Hardware pixel format the codec is expected to produce when this
+  /// backend is in use. (The post-`av_hwframe_transfer_data` CPU format is
+  /// typically `NV12` or `P010LE`; this is the *pre-transfer* sentinel.)
+  ///
+  /// Returns a `AVPixelFormat` value constructed from a hardcoded constant
+  /// in our bindings — never reads an enum value supplied by FFmpeg, so
+  /// no enum-discriminant UB risk.
+  pub(crate) fn hw_pixel_format(self) -> AVPixelFormat {
+    match self {
+      Self::VideoToolbox => AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX,
+      Self::Vaapi => AVPixelFormat::AV_PIX_FMT_VAAPI,
+      Self::Cuda => AVPixelFormat::AV_PIX_FMT_CUDA,
+      Self::D3d11va => AVPixelFormat::AV_PIX_FMT_D3D11,
+    }
+  }
+}
+
+/// Probe order for `VideoDecoder::open` on the current target. Hardware
+/// backends only, in preference order. Empty for platforms with no known
+/// HW backend; on those `open()` returns `AllBackendsFailed` immediately.
+pub(crate) fn probe_order() -> &'static [Backend] {
+  #[cfg(any(
+    target_os = "macos",
+    target_os = "ios",
+    target_os = "tvos",
+    target_os = "visionos",
+  ))]
+  {
+    &[Backend::VideoToolbox]
+  }
+  #[cfg(target_os = "linux")]
+  {
+    &[Backend::Vaapi, Backend::Cuda]
+  }
+  #[cfg(target_os = "windows")]
+  {
+    &[Backend::D3d11va, Backend::Cuda]
+  }
+  #[cfg(not(any(
+    target_os = "macos",
+    target_os = "ios",
+    target_os = "tvos",
+    target_os = "visionos",
+    target_os = "linux",
+    target_os = "windows",
+  )))]
+  {
+    &[]
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn all_backends_have_hwdevice_type_and_pix_fmt() {
+    for b in [
+      Backend::VideoToolbox,
+      Backend::Vaapi,
+      Backend::Cuda,
+      Backend::D3d11va,
+    ] {
+      let _ = b.av_hwdevice_type();
+      let _ = b.hw_pixel_format();
+    }
+  }
+
+  #[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos", target_os = "visionos"))]
+  #[test]
+  fn apple_probe_order() {
+    assert_eq!(probe_order(), &[Backend::VideoToolbox]);
+  }
+
+  #[cfg(target_os = "linux")]
+  #[test]
+  fn linux_probe_order() {
+    assert_eq!(probe_order(), &[Backend::Vaapi, Backend::Cuda]);
+  }
+
+  #[cfg(target_os = "windows")]
+  #[test]
+  fn windows_probe_order() {
+    assert_eq!(probe_order(), &[Backend::D3d11va, Backend::Cuda]);
+  }
+}
diff --git a/src/decoder.rs b/src/decoder.rs
new file mode 100644
index 0000000..a241b68
--- /dev/null
+++ b/src/decoder.rs
@@ -0,0 +1,2084 @@
+use std::{collections::VecDeque, mem::ManuallyDrop, ptr};
+
+use ffmpeg_next::{
+  codec::{
+    self,
+    packet::{Mut as PacketMut, Ref as PacketRef},
+    Context,
+  },
+  ffi::{
+    av_buffer_ref, av_buffer_unref, av_frame_move_ref, av_frame_unref, av_hwdevice_ctx_create,
+    av_hwframe_transfer_data, av_packet_ref, avcodec_alloc_context3, avcodec_free_context,
+    avcodec_parameters_alloc, avcodec_parameters_copy, avcodec_parameters_free,
+    avcodec_parameters_to_context, AVBufferRef, AVCodec, AVFrame, AVMediaType,
+  },
+  frame, Codec, Packet, Rational,
+};
+
+/// Local FFI shim: `avcodec_find_decoder` declared with `c_int` instead of
+/// the bindgen `AVCodecID` enum. Constructing `AVCodecID` from a runtime
+/// integer that isn't in our build's discriminant set is UB; calling the
+/// C function with a raw int avoids that boundary entirely. Both Rust
+/// declarations resolve to the same C symbol at link time.
+mod c_shims {
+  use super::AVCodec;
+  use libc::c_int;
+  extern "C" {
+    pub fn avcodec_find_decoder(id: c_int) -> *const AVCodec;
+  }
+}
+
+use crate::{
+  backend::{self, Backend},
+  error::{Error, Result},
+  ffi::{codec_supports_hwaccel, get_hw_format, CallbackState},
+  frame::Frame,
+};
+
+/// Hardware-accelerated video decoder.
+///
+/// Hardware-only — there is no software fallback inside this crate. If
+/// every hardware backend in the platform's probe order fails to open,
+/// `open` returns [`Error::AllBackendsFailed`] and the caller is
+/// responsible for falling back to a software decoder of their choice
+/// (e.g. `ffmpeg::decoder::Video`).
+///
+/// Mirrors `ffmpeg::decoder::Video`'s `send_packet`/`receive_frame` interface.
+/// Decoded frames are returned through [`crate::Frame`], a CPU-side wrapper
+/// whose accessors avoid the `AVPixelFormat`-enum UB that an unvalidated read
+/// of FFmpeg's raw integer pixel formats can trigger.
+///
+/// `open` does a true probe: each backend opens with a strict `get_format`
+/// callback. On the first non-transient error from a backend the decoder is
+/// torn down and the next backend in probe order is tried, with all packets
+/// seen so far replayed through it. The advance is *transactional* — the
+/// candidate backend must successfully build and accept the replayed packets
+/// before any probe state is consumed, so a failing backend in the middle of
+/// the order does not strand the caller without history. Once the first frame
+/// is delivered the probe collapses and subsequent calls go straight to the
+/// active backend.
+pub struct VideoDecoder {
+  /// Live FFmpeg state for the currently active backend.
+  state: DecoderState,
+  /// Reusable frame buffer used for hw-side decoding before transfer / move.
+  /// Internal use only — never handed to callers.
+  hw_frame: frame::Video,
+  /// Probe state: present until the first frame is received from the active
+  /// backend, then `None`. While `Some`, packets are buffered for replay and
+  /// non-transient errors / decoder failures advance to the next backend.
+  probe: Option<ProbeState>,
+  /// CPU-side frames produced by a candidate decoder during probe replay
+  /// (when its internal queue filled and we had to drain output before the
+  /// next `send_packet`). Already transferred from the candidate's
+  /// `AVHWFramesContext` to a CPU frame, so they remain valid after the
+  /// candidate state is committed. [`Self::receive_frame`] dequeues these
+  /// FIFO before reading from `state.inner`.
+  pending_frames: VecDeque<frame::Video>,
+  /// Per-decoder byte budget for [`Self::pending_frames`] during probe
+  /// replay. Defaults to [`DEFAULT_MAX_PROBE_PENDING_BYTES`]; override via
+  /// [`Self::with_max_probe_pending_bytes`].
+  max_probe_pending_bytes: usize,
+}
+
+/// Owned FFmpeg state for one open codec context. Has its own `Drop` so we
+/// can swap it out cleanly during a probe advance via `mem::replace`.
+struct DecoderState {
+  /// Wrapped FFmpeg decoder. `ManuallyDrop` so we can sequence its drop
+  /// before freeing the callback state.
+  inner: ManuallyDrop<ffmpeg_next::decoder::Video>,
+  /// Backend driving this state.
+  backend: Backend,
+  /// Owned reference produced by `av_hwdevice_ctx_create`.
+  hw_device_ref: *mut AVBufferRef,
+  /// Owned `Box<CallbackState>` raw pointer; `AVCodecContext::opaque`
+  /// aliases it.
+  callback_state: *mut CallbackState,
+}
+
+/// Maximum number of packets we are willing to buffer for probe replay
+/// before abandoning the fallback safety net. Set high enough to absorb
+/// long B-frame GOPs and codec setup latency, low enough to bound memory
+/// against malicious / pathological streams that never produce a first
+/// frame.
+const MAX_PROBE_PACKETS: usize = 256;
+
+/// Maximum total compressed-byte size of buffered probe packets. Each
+/// `Packet` clone holds a refcounted reference to the demuxer's bitstream
+/// data — even though the clone itself is shallow, the underlying buffers
+/// stay alive until we drop them. 64 MiB is generous for normal video and
+/// gives untrusted media a hard ceiling.
+const MAX_PROBE_PACKET_BYTES: usize = 64 * 1024 * 1024;
+
+/// Hard cap on the number of side-data entries we tolerate per buffered
+/// packet. `av_packet_ref` allocates an `AVPacketSideData` descriptor and
+/// an `AVBufferRef` per entry, so a packet stuffed with many tiny or
+/// zero-sized entries can consume significant memory in descriptor /
+/// allocator overhead even after [`packet_side_data_bytes`] charges
+/// [`SIDE_DATA_ENTRY_OVERHEAD`] bytes per entry. Refusing to clone such
+/// packets short-circuits the descriptor explosion path.
+///
+/// Sized for legitimate streams (typical video packets carry 0-5 side-
+/// data entries; SEI-heavy HEVC/AV1 maybe a dozen) while comfortably
+/// rejecting weaponised input.
+const MAX_PROBE_PACKET_SIDE_DATA_ENTRIES: usize = 64;
+
+/// Conservative per-side-data-entry overhead estimate used by both
+/// [`packet_side_data_bytes`] and the budget accounting in
+/// [`VideoDecoder::send_packet`]. Counts the `AVPacketSideData`
+/// descriptor (24 bytes per the FFmpeg 8.x bindings), the `AVBufferRef`
+/// FFmpeg allocates per entry, and a margin for malloc bookkeeping
+/// (header bytes, alignment slack). Setting it on the high side keeps
+/// the byte cap a true upper bound on retained memory; under-charging
+/// would let many tiny entries slip past the cap.
+const SIDE_DATA_ENTRY_OVERHEAD: usize = 80;
+
+/// Conservative upper-bound bytes-per-pixel multiplier used to estimate
+/// the size of a CPU frame **before** `av_hwframe_transfer_data`
+/// allocates its pixel buffers. Covers every HW download format this
+/// crate produces (worst case is `P416LE` / `P412LE` at 6 bytes/pixel
+/// for 16-bit 4:4:4 semi-planar) plus a margin for FFmpeg's per-row
+/// stride alignment (typically 32-byte aligned, ~5% extra at HD widths
+/// and below).
+///
+/// Used by [`drain_into_pending`] as a pre-transfer guard: if the
+/// product `width * height * WORST_CASE_BYTES_PER_PIXEL` would already
+/// push `pending_bytes` past `max_probe_pending_bytes`, the candidate
+/// replay refuses the frame *before* allocating. Without this, FFmpeg
+/// would perform the full HW→CPU download (potentially ~100 MiB for
+/// 8K HDR) and we would only reject the frame after RSS had already
+/// spiked. The post-transfer accounting via [`cpu_frame_bytes`] stays in
+/// place as a backstop using the frame's actual stride/format.
+///
+/// Slightly over-charges true 4:2:0 NV12 / P010 frames (which dominate
+/// real workloads) — that's the right side to err on. Callers feeding
+/// 8K+ workloads through the probe path can tune
+/// [`VideoDecoder::with_max_probe_pending_bytes`] upward to compensate.
+const WORST_CASE_BYTES_PER_PIXEL: usize = 8;
+
+/// Maximum number of CPU frames we are willing to queue from a candidate
+/// during probe replay. Each frame is a fully-allocated CPU buffer
+/// (~3 MiB for 1080p NV12, ~24 MiB for 4K P010, ~96 MiB for 8K P010), so
+/// an unbounded queue would OOM on a candidate with a shallow internal
+/// queue against a deep replay history. This cap, together with
+/// [`DEFAULT_MAX_PROBE_PENDING_BYTES`], is enforced as a hard limit during
+/// replay: once either limit is reached, probe buffering fails for the
+/// candidate (returns `ENOMEM` from `drain_into_pending`) instead of
+/// queueing additional drained frames. The probe loop then advances to
+/// the next backend or returns `Error::AllBackendsFailed` if exhausted.
+const MAX_PROBE_PENDING_FRAMES: usize = 16;
+
+/// Default byte budget for probe-replay drained frames. 256 MiB is less than
+/// the count cap allows at 4K P010 (16 frames × ~24 MiB each ≈ 384 MiB), so
+/// this byte cap is the one that fires first for high-resolution content
+/// (8K P010: ~96 MiB per frame → only ~2 frames fit).
+///
+/// Override per-decoder with [`VideoDecoder::with_max_probe_pending_bytes`]
+/// when targeting 8K+ workloads or memory-constrained environments.
+///
+/// TODO: when frames significantly exceed typical sizes, consider
+/// memmap-backed pending buffers (write transferred frames to a temp file
+/// or shared-memory segment) so the resident set stays bounded even when
+/// the byte cap is raised. Out of scope for v0.1.0.
+pub const DEFAULT_MAX_PROBE_PENDING_BYTES: usize = 256 * 1024 * 1024;
+
+/// State carried only during the probe window (before the first successful
+/// frame). Holds enough information to tear down the current decoder and
+/// retry with the next backend.
+struct ProbeState {
+  parameters: codec::Parameters,
+  codec: Codec,
+  /// Backends still to try, in order. Empty means "no more options after
+  /// the active one fails" — `advance_probe` then surfaces
+  /// [`Error::AllBackendsFailed`] so the contract is the same on
+  /// single-backend platforms (e.g. macOS) as on multi-backend ones.
+  remaining_backends: Vec<Backend>,
+  /// Packets sent so far, kept for replay through any candidate backend.
+  /// Preserved across failed candidates — only cleared when the probe
+  /// collapses on a successful first frame, or when the probe is
+  /// abandoned due to the size caps.
+  buffered_packets: Vec<Packet>,
+  /// Cumulative size (in compressed bytes) of `buffered_packets`. Tracked
+  /// incrementally so we don't have to re-sum on every send.
+  buffered_bytes: usize,
+  /// Whether `send_eof` has been called; replayed alongside packets.
+  eof_sent: bool,
+  /// Per-backend errors captured since the probe window opened. Pushed
+  /// whenever a backend's failure triggers `advance_probe` (the active
+  /// backend that just failed) or a candidate's build / replay rejects
+  /// it. Drained into [`Error::AllBackendsFailed`] when the probe
+  /// exhausts every option.
+  attempts: Vec<(Backend, Box<Error>)>,
+}
+
+// SAFETY: All raw pointers are exclusively owned by `DecoderState` and never
+// shared. `ffmpeg::decoder::Video` is itself `Send` (its `Context` carries an
+// `unsafe impl Send`). The decoder is not safe for concurrent use, hence not
+// `Sync`.
+unsafe impl Send for DecoderState {}
+unsafe impl Send for VideoDecoder {}
+
+impl Drop for DecoderState {
+  fn drop(&mut self) {
+    // Order matters:
+    //  1. Drop the codec context first. While it lives, FFmpeg may invoke
+    //     `get_format`, which dereferences `callback_state` via `opaque`.
+    //  2. Free the callback state heap allocation.
+    //  3. Release our hw device reference (FFmpeg released its own when
+    //     the codec context was freed in step 1).
+    unsafe {
+      ManuallyDrop::drop(&mut self.inner);
+      if !self.callback_state.is_null() {
+        drop(Box::from_raw(self.callback_state));
+        self.callback_state = ptr::null_mut();
+      }
+      if !self.hw_device_ref.is_null() {
+        av_buffer_unref(&mut self.hw_device_ref);
+      }
+    }
+  }
+}
+
+impl VideoDecoder {
+  /// Auto-probe hardware backends in the platform's default order.
+  ///
+  /// Each backend opens with a strict `get_format` callback. The first
+  /// backend whose `avcodec_open2` succeeds becomes active; if its first
+  /// frame is unusable (decode error, transfer failure, or a CPU-format
+  /// frame from a HW context) the decoder is torn down and the next backend
+  /// is tried — packets sent so far are replayed through the new decoder
+  /// transparently. The probe advance is transactional: the next backend
+  /// must build *and* accept the replayed history before any probe state is
+  /// consumed, so a misbehaving middle backend cannot strand the caller.
+  ///
+  /// [`Self::backend`] reflects whichever backend ultimately produced the
+  /// first frame.
+  ///
+  /// [`Error::AllBackendsFailed`] surfaces in two places, with the same
+  /// meaning ("no hardware backend can decode this stream — fall back to
+  /// software yourself"):
+  /// - From `open` itself, when no backend even opens.
+  /// - From [`Self::send_packet`] / [`Self::send_eof`] /
+  ///   [`Self::receive_frame`], when the initially-opened backend fails
+  ///   at decode time and every remaining backend in the probe order
+  ///   either also fails or doesn't exist. On single-backend platforms
+  ///   (e.g. macOS, where the order is `[VideoToolbox]`), this is the
+  ///   only place a HW-only failure surfaces.
+  ///
+  /// In both cases, `attempts` carries the per-backend error log so the
+  /// caller can decide how to proceed with software fallback.
+  pub fn open(parameters: codec::Parameters) -> Result<Self> {
+    let codec = find_decoder(&parameters)?;
+    let order = backend::probe_order();
+
+    let mut attempts: Vec<(Backend, Box<Error>)> = Vec::new();
+    for (i, &backend) in order.iter().enumerate() {
+      // Use the checked clone — ffmpeg-next's `Parameters::clone` does
+      // `avcodec_parameters_alloc` without a null check and ignores the
+      // return of `avcodec_parameters_copy`. Under OOM that path silently
+      // produces a Parameters with a null inner pointer.
+      let cloned_for_build = match try_clone_parameters(&parameters) {
+        Ok(p) => p,
+        Err(e) => {
+          tracing::warn!(?backend, error = %e, "hwdecode: parameters clone failed");
+          attempts.push((backend, Box::new(Error::Ffmpeg(e))));
+          continue;
+        }
+      };
+      match Self::build_state(cloned_for_build, codec, backend) {
+        Ok(state) => {
+          tracing::info!(?backend, "hwdecode: opened video decoder (probing)");
+          let remaining = order[(i + 1)..].to_vec();
+          // Deep-copy the caller's `parameters` before storing in ProbeState.
+          // `codec::Parameters` from `stream.parameters()` carries an Rc
+          // owner pointing at the demuxer; moving that Rc to a worker
+          // thread (when VideoDecoder is sent) would race with the demuxer's
+          // Rc on the original thread. The checked clone copies the bytes
+          // into a fresh allocation with `owner: None`, severing the link.
+          //
+          // We always create ProbeState — even when `remaining` is empty
+          // (single-backend platforms like macOS) — so that a first-frame
+          // failure on the only backend surfaces as
+          // `Error::AllBackendsFailed` from `receive_frame` /
+          // `send_packet` rather than as a raw FFmpeg error. That keeps
+          // the API contract the same regardless of how many HW backends
+          // the platform exposes.
+          //
+          // If the clone fails (ENOMEM), we keep the active `state` but
+          // skip probe setup — caller loses the transactional probe /
+          // fallback safety net but still gets a working decoder.
+          let probe = match try_clone_parameters(&parameters) {
+            Ok(probe_params) => Some(ProbeState {
+              parameters: probe_params,
+              codec,
+              remaining_backends: remaining,
+              buffered_packets: Vec::new(),
+              buffered_bytes: 0,
+              eof_sent: false,
+              attempts: Vec::new(),
+            }),
+            Err(e) => {
+              tracing::warn!(
+                error = %e,
+                "hwdecode: parameters clone failed for probe state; proceeding without fallback"
+              );
+              None
+            }
+          };
+          return Ok(Self {
+            state,
+            hw_frame: alloc_av_frame().map_err(Error::Ffmpeg)?,
+            probe,
+            pending_frames: VecDeque::new(),
+            max_probe_pending_bytes: DEFAULT_MAX_PROBE_PENDING_BYTES,
+          });
+        }
+        Err(e) => {
+          tracing::warn!(?backend, error = %e, "hwdecode: backend open failed");
+          attempts.push((backend, Box::new(e)));
+        }
+      }
+    }
+    Err(Error::AllBackendsFailed { attempts })
+  }
+
+  /// Open the decoder with a specific backend. No probe, no fallback.
+  ///
+  /// If `backend` cannot actually decode this stream, the failure surfaces
+  /// from [`Self::receive_frame`] (the strict `get_format` callback returns
+  /// `AV_PIX_FMT_NONE`, the decoder errors out). The caller is responsible
+  /// for retrying with another hardware backend or falling back to a
+  /// software decoder of their choice (e.g. `ffmpeg::decoder::Video`).
+  pub fn open_with(parameters: codec::Parameters, backend: Backend) -> Result<Self> {
+    let codec = find_decoder(&parameters)?;
+    let state = Self::build_state(parameters, codec, backend)?;
+    Ok(Self {
+      state,
+      hw_frame: alloc_av_frame().map_err(Error::Ffmpeg)?,
+      probe: None,
+      pending_frames: VecDeque::new(),
+      max_probe_pending_bytes: DEFAULT_MAX_PROBE_PENDING_BYTES,
+    })
+  }
+
+  /// Override the byte budget for probe-replay queued frames. Defaults to
+  /// [`DEFAULT_MAX_PROBE_PENDING_BYTES`]. Use a higher value when targeting
+  /// 8K+ workloads where 16 frames at full size could exceed the default;
+  /// use a lower value in memory-constrained services to bound peak
+  /// allocation more tightly.
+  ///
+  /// Setting after the first frame has been delivered is harmless but has
+  /// no observable effect — the probe has already collapsed and the cap
+  /// only applies during replay drain.
+  ///
+  /// Returns `self` for builder-style chaining:
+  /// ```ignore
+  /// let decoder = VideoDecoder::open(params)?
+  ///     .with_max_probe_pending_bytes(1024 * 1024 * 1024); // 1 GiB
+  /// ```
+  pub fn with_max_probe_pending_bytes(mut self, bytes: usize) -> Self {
+    self.max_probe_pending_bytes = bytes;
+    self
+  }
+
+  /// The backend currently producing frames. While the probe is still in
+  /// progress (no frame received yet) this returns the optimistically
+  /// selected backend; after the first frame, it is the backend that
+  /// actually produced it. Once stable, never changes again.
+  pub fn backend(&self) -> Backend {
+    self.state.backend
+  }
+
+  /// Decoder width in pixels.
+  pub fn width(&self) -> u32 {
+    self.state.inner.width()
+  }
+
+  /// Decoder height in pixels.
+  pub fn height(&self) -> u32 {
+    self.state.inner.height()
+  }
+
+  /// Codec context time base.
+  pub fn time_base(&self) -> Rational {
+    self.state.inner.time_base()
+  }
+
+  /// Frame rate from the codec context, if known.
+  pub fn frame_rate(&self) -> Option<Rational> {
+    self.state.inner.frame_rate()
+  }
+
+  /// Submit a packet to the decoder.
+  ///
+  /// On success — and only on success — the packet is buffered for potential
+  /// replay through a fallback backend while the probe is active. EAGAIN
+  /// (decoder needs `receive_frame` to drain output first) propagates as
+  /// normal backpressure; the caller drains then retries.
+  ///
+  /// While the probe is active, a non-transient error (e.g. the active HW
+  /// backend rejecting this stream's geometry on first packet) advances the
+  /// probe to the next candidate and retries the packet there. The caller
+  /// observes only the eventual success or, if the probe is exhausted, the
+  /// final error.
+  ///
+  /// If the probe window grows beyond [`MAX_PROBE_PACKETS`] or
+  /// [`MAX_PROBE_PACKET_BYTES`] without producing a first frame (a stream
+  /// the active backend is silently mishandling, or pathological input),
+  /// the probe is **abandoned**: replay history is dropped by setting
+  /// `self.probe = None`, but frames already queued for delivery are kept.
+  /// The active backend continues serving the caller without fallback. A
+  /// `tracing::warn!` records this so it is visible in production logs.
+  pub fn send_packet(&mut self, packet: &Packet) -> Result<()> {
+    loop {
+      match self.state.inner.send_packet(packet) {
+        Ok(()) => {
+          if let Some(probe) = self.probe.as_mut() {
+            // Step 1: reject by side-data entry count BEFORE walking the
+            // side-data array for byte accounting. `packet_side_data_bytes`
+            // dereferences each `AVPacket.side_data[i]` based on the
+            // FFmpeg-supplied `side_data_elems`; if that integer is
+            // corrupt or weaponised we don't want to walk it from the
+            // safe `send_packet` path. The byte helper still clamps its
+            // own walk to the cap as defense-in-depth, but checking the
+            // count first short-circuits the descriptor-explosion case
+            // entirely.
+            let side_count = packet_side_data_count(packet);
+            if side_count > MAX_PROBE_PACKET_SIDE_DATA_ENTRIES {
+              tracing::warn!(
+                side_data_entries = side_count,
+                max_side_data_entries = MAX_PROBE_PACKET_SIDE_DATA_ENTRIES,
+                trigger = "side_data_entry_cap",
+                "hwdecode: packet side-data entry count exceeds cap; \
+                 abandoning fallback safety net without byte accounting"
+              );
+              // Abandon the *future* probe-buffering only — see the byte/
+              // packet cap branch below for why `pending_frames` survives.
+              self.probe = None;
+            } else {
+              // Step 2: now safe to compute byte budget — `side_count`
+              // is bounded.
+              //
+              // `try_clone_packet` calls `av_packet_ref`, which deep-copies
+              // side data via `av_packet_copy_props`. The probe budget
+              // must include descriptor + ref overhead per side-data
+              // entry (via `packet_side_data_bytes`); without it, a
+              // packet stuffed with many tiny entries can dominate
+              // retained memory before the byte cap is even close to
+              // firing.
+              let pkt_size = packet.size().saturating_add(packet_side_data_bytes(
+                packet,
+                MAX_PROBE_PACKET_SIDE_DATA_ENTRIES,
+              ));
+              let new_count = probe.buffered_packets.len() + 1;
+              let new_bytes = probe.buffered_bytes.saturating_add(pkt_size);
+              if new_count > MAX_PROBE_PACKETS || new_bytes > MAX_PROBE_PACKET_BYTES {
+                tracing::warn!(
+                  packets = new_count,
+                  bytes = new_bytes,
+                  side_data_entries = side_count,
+                  max_packets = MAX_PROBE_PACKETS,
+                  max_bytes = MAX_PROBE_PACKET_BYTES,
+                  trigger = "byte_or_packet_cap",
+                  "hwdecode: probe window exceeded caps without first frame; \
+                   abandoning fallback safety net"
+                );
+                // Abandon the *future* probe-buffering only.
+                // `pending_frames` belong to the currently active backend
+                // (possibly the candidate `advance_probe` committed
+                // earlier in this same `send_packet` call) and are valid
+                // output the caller will dequeue via `receive_frame`.
+                // Clearing them here would silently drop initial frames
+                // at exactly the cap-overflow / OOM-stress paths.
+                self.probe = None;
+              } else {
+                // Use the checked clone — ffmpeg-next's `Packet::clone`
+                // discards av_packet_ref's return value and would
+                // silently store an empty packet on ENOMEM, corrupting
+                // future replay.
+                match try_clone_packet(packet) {
+                  Ok(cloned) => {
+                    probe.buffered_packets.push(cloned);
+                    probe.buffered_bytes = new_bytes;
+                  }
+                  Err(e) => {
+                    tracing::warn!(
+                      error = %e,
+                      "hwdecode: packet clone failed for probe history; \
+                       abandoning fallback safety net"
+                    );
+                    // Same reasoning as the cap-overflow branch above:
+                    // `pending_frames` are owned by the active backend,
+                    // not the probe buffer, so they survive abandonment.
+                    self.probe = None;
+                  }
+                }
+              }
+            }
+          }
+          return Ok(());
+        }
+        Err(e) if is_transient(&e) => {
+          // Normal backpressure / EOF — pass through unchanged.
+          return Err(Error::Ffmpeg(e));
+        }
+        Err(e) => {
+          if self.probe.is_some() {
+            // advance_probe consumes the error into `attempts` and either
+            // installs a candidate (Ok) or surfaces AllBackendsFailed (Err).
+            self.advance_probe(Error::Ffmpeg(e))?;
+            continue;
+          }
+          return Err(Error::Ffmpeg(e));
+        }
+      }
+    }
+  }
+
+  /// Tell the active decoder that no further packets will arrive.
+  ///
+  /// On success the EOF marker is recorded in the probe state (while a
+  /// probe is open) so it can be replayed into a later candidate. Transient
+  /// errors are returned untouched; any other error advances the probe and
+  /// retries while a probe is open, and is returned as-is otherwise.
+  pub fn send_eof(&mut self) -> Result<()> {
+    loop {
+      let err = match self.state.inner.send_eof() {
+        Ok(()) => {
+          if let Some(probe) = self.probe.as_mut() {
+            probe.eof_sent = true;
+          }
+          return Ok(());
+        }
+        Err(err) => err,
+      };
+      // Backpressure / EOF, or no probe left to fall back on: hand the
+      // error straight to the caller.
+      if is_transient(&err) || self.probe.is_none() {
+        return Err(Error::Ffmpeg(err));
+      }
+      self.advance_probe(Error::Ffmpeg(err))?;
+    }
+  }
+
+  /// Receive a CPU-side decoded frame.
+  ///
+  /// The frame is downloaded with `av_hwframe_transfer_data`; only `pts`
+  /// is carried over from the hardware frame (`av_frame_copy_props` is
+  /// deliberately not used). The caller's frame is always unref'd first,
+  /// so reuse across resolution changes or different decoders is safe.
+  ///
+  /// While the probe window is open, *any* failure other than EAGAIN
+  /// (decode error, EOF before the first frame, transfer error, or a
+  /// CPU-format frame from a HW-opened context) tears down the current
+  /// decoder and advances to the next hardware backend in probe order,
+  /// replaying buffered packets through it. Frames the candidate produced
+  /// during replay are queued and delivered FIFO via this method, so the
+  /// caller never loses initial frames after a fallback.
+  ///
+  /// This crate is hardware-only: there is no software fallback inside the
+  /// decoder. When every backend in the probe order has been exhausted —
+  /// including the case of a single-backend platform whose only backend
+  /// failed — this returns [`Error::AllBackendsFailed`] with the per-
+  /// backend attempt log so the caller can branch into a software
+  /// decoder of their choice.
+  ///
+  /// Returns the same transient signals as `ffmpeg::decoder::Video`:
+  /// `Error::Ffmpeg(Other { errno: EAGAIN })` when no frame is ready and
+  /// more packets must be sent, and `Error::Ffmpeg(Eof)` once fully drained.
+  pub fn receive_frame(&mut self, frame: &mut Frame) -> Result<()> {
+    // Pre-drain frames queued during probe replay. They are already CPU-side
+    // (transferred at drain time, when the candidate's HW context was alive)
+    // so we just move them into the caller's slot.
+    if self.try_pop_pending(frame) {
+      return Ok(());
+    }
+
+    loop {
+      let res = self.state.inner.receive_frame(&mut self.hw_frame);
+      match res {
+        Err(e) => {
+          // EAGAIN is normal backpressure — pass through unconditionally.
+          if is_eagain(&e) {
+            return Err(Error::Ffmpeg(e));
+          }
+          // EOF and every other non-EAGAIN error: while the probe is still
+          // open this counts as a candidate failure — a backend that hits
+          // EOF before producing a single frame must not look like "stream
+          // over" to the caller. Advance and retry; once no backend is left,
+          // advance_probe returns AllBackendsFailed and `?` propagates it
+          // (its attempt log includes the error recorded here).
+          if self.probe.is_some() {
+            self.advance_probe(Error::Ffmpeg(e))?;
+            // Probe advance may have populated `pending_frames`; deliver
+            // one of those before reading more from the new candidate.
+            if self.try_pop_pending(frame) {
+              return Ok(());
+            }
+            continue;
+          }
+          // Probe collapsed already — surface the error, including the
+          // ordinary end-of-stream EOF.
+          return Err(Error::Ffmpeg(e));
+        }
+        Ok(()) => {
+          // Always attempt the HW→CPU transfer. With strict `get_format`,
+          // libavcodec should only deliver frames in the wired-up HW
+          // format; if a CPU-side frame shows up anyway, the transfer fails
+          // and is handled by the error arm below.
+          // SAFETY: both arguments are exclusive borrows of frame wrappers;
+          // assumes each wraps a non-null AVFrame (TODO confirm).
+          match unsafe { transfer_hw_frame(frame, &mut self.hw_frame) } {
+            Ok(()) => {
+              self.probe = None;
+              return Ok(());
+            }
+            Err(e) => {
+              if self.probe.is_some() {
+                self.advance_probe(Error::Ffmpeg(e))?;
+                unsafe { av_frame_unref(frame.as_inner_mut().as_mut_ptr()) };
+                if self.try_pop_pending(frame) {
+                  return Ok(());
+                }
+                continue;
+              }
+              return Err(Error::Ffmpeg(e));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Move the oldest frame queued during probe replay into `frame`.
+  /// Returns `true` if a frame was delivered, `false` if the queue was empty.
+  fn try_pop_pending(&mut self, frame: &mut Frame) -> bool {
+    if let Some(mut queued) = self.pending_frames.pop_front() {
+      // SAFETY: `queued` is a CPU-side AVFrame transferred before it was
+      // queued, and `frame` is the caller's live frame.
+      unsafe {
+        let slot = frame.as_inner_mut().as_mut_ptr();
+        av_frame_unref(slot);
+        av_frame_move_ref(slot, queued.as_mut_ptr());
+      }
+      // Delivering a frame collapses the probe.
+      self.probe = None;
+      return true;
+    }
+    false
+  }
+
+  /// Reset the decoder after a discontinuity such as a seek.
+  ///
+  /// Drops everything in flight: the decoder's own buffered frames, the
+  /// frames queued during probe replay (`pending_frames`), and whatever is
+  /// left in the `hw_frame` scratch buffer. An open probe stays open but
+  /// its replay history (buffered packets, byte count, EOF marker) is
+  /// reset, because post-seek packets do not line up with the history
+  /// captured before the seek.
+  pub fn flush(&mut self) {
+    self.state.inner.flush();
+    self.pending_frames.clear();
+    // SAFETY: hw_frame is a valid AVFrame we own; av_frame_unref is a no-op
+    // for an already-empty frame.
+    unsafe { av_frame_unref(self.hw_frame.as_mut_ptr()) };
+    // Without an open probe there is no replay history left to reset.
+    let Some(probe) = self.probe.as_mut() else { return };
+    probe.eof_sent = false;
+    probe.buffered_bytes = 0;
+    probe.buffered_packets.clear();
+  }
+
+  /// Try the next backend in `remaining_backends`. Transactional: a
+  /// candidate must build and accept the replayed history before it is
+  /// installed; the packet history itself is never consumed. Backends
+  /// that fail to build or reject the replay are popped, recorded into
+  /// `probe.attempts`, and the loop continues to the next one.
+  ///
+  /// `last_error` is the error that triggered this advance — i.e. the
+  /// failure of the currently active backend on `send_packet` /
+  /// `send_eof` / `receive_frame`. It is recorded against the active
+  /// backend before any candidate is tried so that a final
+  /// `AllBackendsFailed` carries the full attempt log including the
+  /// initially-opened backend's runtime failure.
+  ///
+  /// Returns:
+  /// - `Ok(())` when a candidate is installed and replay completed —
+  ///   caller should retry the operation.
+  /// - `Err(Error::AllBackendsFailed { attempts })` when every remaining
+  ///   backend has been exhausted (including the just-failed active one).
+  ///   This is what the documented `open` contract promises, surfaced at
+  ///   runtime so the caller can branch into a software fallback. On a
+  ///   single-backend platform (e.g. macOS), this fires after the only
+  ///   backend's first-frame failure; on multi-backend platforms it
+  ///   fires after the last candidate's failure.
+  /// - `Err(_)` for other fatal conditions surfaced by probe machinery
+  ///   itself (e.g. `alloc_av_frame` ENOMEM for the replay scratch frame).
+  fn advance_probe(&mut self, last_error: Error) -> Result<()> {
+    // Record the failure that triggered this advance against the active
+    // backend. If the probe is already gone (shouldn't happen — call sites
+    // guard with `self.probe.is_some()`), hand the error back unchanged
+    // rather than fabricating an attempt log.
+    let active_backend = self.state.backend;
+    match self.probe.as_mut() {
+      Some(probe) => probe.attempts.push((active_backend, Box::new(last_error))),
+      None => return Err(last_error),
+    }
+
+    // Drop frames previously queued from the backend we're now abandoning.
+    // They came from a candidate that just failed for cause and cannot be
+    // trusted alongside frames we may queue from the next candidate. (If
+    // this method is called repeatedly via chained probe advances, this
+    // also keeps `pending_frames` from accumulating frames from multiple
+    // rejected backends.)
+    self.pending_frames.clear();
+
+    loop {
+      // Snapshot inputs without mutating probe state. Use the checked
+      // clone helper rather than `Parameters::clone` (which masks ENOMEM).
+      let (next_backend, parameters, codec) = match self.probe.as_ref() {
+        Some(probe) if !probe.remaining_backends.is_empty() => {
+          let parameters = match try_clone_parameters(&probe.parameters) {
+            Ok(p) => p,
+            Err(e) => {
+              tracing::warn!(
+                error = %e,
+                "hwdecode: parameters clone failed during probe advance; popping backend and trying next"
+              );
+              let popped = self
+                .probe
+                .as_mut()
+                .expect("probe state present")
+                .remaining_backends
+                .remove(0);
+              self
+                .probe
+                .as_mut()
+                .expect("probe state present")
+                .attempts
+                .push((popped, Box::new(Error::Ffmpeg(e))));
+              continue;
+            }
+          };
+          (probe.remaining_backends[0], parameters, probe.codec)
+        }
+        // No more candidates — surface the accumulated attempt log as
+        // AllBackendsFailed so single- and multi-backend platforms have
+        // the same contract for "every HW backend failed."
+        _ => {
+          let attempts = self.probe.take().map(|p| p.attempts).unwrap_or_default();
+          return Err(Error::AllBackendsFailed { attempts });
+        }
+      };
+
+      let prev_backend = self.state.backend;
+      tracing::warn!(from = ?prev_backend, to = ?next_backend, "hwdecode: advancing probe");
+
+      // Build candidate. On failure, record into attempts and continue
+      // without touching the packet buffer.
+      let mut candidate_state = match Self::build_state(parameters, codec, next_backend) {
+        Ok(s) => s,
+        Err(e) => {
+          tracing::warn!(?next_backend, error = %e, "hwdecode: candidate build failed");
+          self
+            .probe
+            .as_mut()
+            .expect("probe state present")
+            .remaining_backends
+            .remove(0);
+          self
+            .probe
+            .as_mut()
+            .expect("probe state present")
+            .attempts
+            .push((next_backend, Box::new(e)));
+          continue;
+        }
+      };
+
+      // Replay buffered history through the candidate WITHOUT installing it.
+      // We borrow the buffer immutably; if replay fails the candidate's Drop
+      // releases the FFmpeg state and the buffer is preserved for the next
+      // attempt.
+      //
+      // EAGAIN handling: `avcodec_send_packet` may return EAGAIN when its
+      // internal queue is full and the user is expected to drain output
+      // first (B-frame buffering, candidate-specific queue depth, etc.).
+      // This is normal flow — `drain_into_pending` pulls frames out of the
+      // candidate into `local_pending` (it is passed `max_pending_bytes`,
+      // presumably its byte cap) and the same packet is retried. After
+      // commit the frames move to `self.pending_frames` for FIFO delivery.
+      let mut local_pending: VecDeque<frame::Video> = VecDeque::new();
+      let mut local_pending_bytes: usize = 0;
+      let max_pending_bytes = self.max_probe_pending_bytes;
+      let replay_result: std::result::Result<(), ffmpeg_next::Error> = {
+        let probe = self.probe.as_ref().expect("probe state present");
+        let mut hw_buf = match alloc_av_frame() {
+          Ok(f) => f,
+          Err(e) => return Err(Error::Ffmpeg(e)),
+        };
+        let mut r: std::result::Result<(), ffmpeg_next::Error> = Ok(());
+
+        'replay: for pkt in &probe.buffered_packets {
+          loop {
+            match candidate_state.inner.send_packet(pkt) {
+              Ok(()) => break,
+              Err(e) if is_eagain(&e) => {
+                // Drain candidate output (transferring + queueing each frame)
+                // and retry the same packet.
+                if let Err(de) = drain_into_pending(
+                  &mut candidate_state.inner,
+                  &mut hw_buf,
+                  &mut local_pending,
+                  &mut local_pending_bytes,
+                  max_pending_bytes,
+                ) {
+                  r = Err(de);
+                  break 'replay;
+                }
+              }
+              Err(e) => {
+                r = Err(e);
+                break 'replay;
+              }
+            }
+          }
+        }
+        if r.is_ok() && probe.eof_sent {
+          // `avcodec_send_packet(NULL)` (which `send_eof` becomes) can
+          // return EAGAIN with the same drain-output-first semantics as
+          // a regular send_packet. Loop drain+retry instead of failing
+          // the candidate on backpressure.
+          loop {
+            match candidate_state.inner.send_eof() {
+              Ok(()) => break,
+              Err(e) if is_eagain(&e) => {
+                if let Err(de) = drain_into_pending(
+                  &mut candidate_state.inner,
+                  &mut hw_buf,
+                  &mut local_pending,
+                  &mut local_pending_bytes,
+                  max_pending_bytes,
+                ) {
+                  r = Err(de);
+                  break;
+                }
+              }
+              Err(e) => {
+                r = Err(e);
+                break;
+              }
+            }
+          }
+        }
+        r
+      };
+
+      if let Err(e) = replay_result {
+        tracing::warn!(?next_backend, error = %e, "hwdecode: candidate replay failed");
+        // Drop the candidate explicitly so its FFI cleanup runs now, and
+        // discard the frames drained from it — they belong to a decoder
+        // that is being thrown away.
+        drop(candidate_state);
+        drop(local_pending);
+        self
+          .probe
+          .as_mut()
+          .expect("probe state present")
+          .remaining_backends
+          .remove(0);
+        self
+          .probe
+          .as_mut()
+          .expect("probe state present")
+          .attempts
+          .push((next_backend, Box::new(Error::Ffmpeg(e))));
+        continue;
+      }
+
+      // Commit: install the candidate, unref the hw_frame we own (dropping
+      // the old backend's residue), queue drained frames, pop the backend.
+      self.state = candidate_state;
+      unsafe { av_frame_unref(self.hw_frame.as_mut_ptr()) };
+      self.pending_frames.append(&mut local_pending);
+      self
+        .probe
+        .as_mut()
+        .expect("probe state present")
+        .remaining_backends
+        .remove(0);
+      return Ok(());
+    }
+  }
+
+  /// Build and open the raw FFmpeg decoder state for one hardware backend.
+  /// Strict `get_format`, no fallback here: trying other backends is the caller's job.
+  fn build_state(
+    parameters: codec::Parameters,
+    codec: Codec,
+    backend: Backend,
+  ) -> Result<DecoderState> {
+    // Use our checked allocator instead of Context::from_parameters, which
+    // does not null-check avcodec_alloc_context3 and would feed a null
+    // AVCodecContext into FFmpeg under OOM.
+    let mut ctx = build_codec_context(&parameters)?;
+    let av_type = backend.av_hwdevice_type();
+
+    // Verify the codec advertises this hwaccel **with the exact HW pix_fmt
+    // we're about to wire up in `get_format`**. FFmpeg's HW config table
+    // is keyed per (device_type, pix_fmt); a codec can advertise the same
+    // device with several HW pix_fmts, so matching only on device_type
+    // would accept a backend whose pix_fmt the codec never offers and the
+    // failure would only surface deep inside the probe/decode loop.
+    // SAFETY (for the `as_ptr` below): `codec` is a live `Codec` handle and
+    // only its raw pointer is handed to the capability lookup.
+    let hw_pix_fmt = backend.hw_pixel_format();
+    if !codec_supports_hwaccel(unsafe { codec.as_ptr() }, av_type, hw_pix_fmt as i32) {
+      return Err(Error::BackendUnsupportedByCodec(backend));
+    }
+
+    // Create the device context.
+    let mut hw_device_ref: *mut AVBufferRef = ptr::null_mut();
+    // SAFETY: `hw_device_ref` is a stack ptr we hand FFmpeg to fill.
+    let ret = unsafe {
+      av_hwdevice_ctx_create(&mut hw_device_ref, av_type, ptr::null(), ptr::null_mut(), 0)
+    };
+    if ret < 0 {
+      return Err(Error::HwDeviceInitFailed {
+        backend,
+        source: ffmpeg_next::Error::from(ret),
+      });
+    }
+
+    let callback_state = Box::into_raw(Box::new(CallbackState {
+      wanted: hw_pix_fmt,
+      wanted_int: hw_pix_fmt as i32,
+    }));
+    // RAII guard: from now until the end-of-function `into_owned()`, every
+    // early return — `av_buffer_ref` failure, `open_as` failure, codec_type
+    // mismatch, or any error path added later between here and the
+    // `DecoderState` construction — frees `hw_device_ref` and
+    // `callback_state` via the guard's Drop. Centralising the cleanup
+    // means no individual error site has to release these two FFI-owned
+    // resources by hand, which is easy to forget (one leaked device ref
+    // plus one heap allocation per failed build).
+    let guard = PartialBuildState {
+      hw_device_ref,
+      callback_state,
+    };
+
+    // SAFETY: ctx is a freshly-constructed AVCodecContext we own;
+    // av_buffer_ref bumps the refcount of the device buffer for FFmpeg's
+    // use (we keep our own ref in `hw_device_ref` for cleanup).
+    // av_buffer_ref returns NULL on allocation failure; we must check it
+    // before assigning, otherwise the codec context would be opened with a
+    // HW-flagged setup but no actual device reference.
+    let device_ref_for_ctx = unsafe { av_buffer_ref(hw_device_ref) };
+    if device_ref_for_ctx.is_null() {
+      // guard's Drop frees hw_device_ref (the first ref) and callback_state.
+      return Err(Error::Ffmpeg(ffmpeg_next::Error::Other {
+        errno: libc::ENOMEM,
+      }));
+    }
+    // SAFETY: device_ref_for_ctx is a valid AVBufferRef* from av_buffer_ref;
+    // ctx is freshly built and owned by us. After this point ctx aliases
+    // `callback_state` via `opaque` (FFmpeg never frees opaque, so
+    // `callback_state` ownership stays with us / the guard) and aliases
+    // `device_ref_for_ctx` (the second ref) via `hw_device_ctx` (FFmpeg
+    // unrefs that on codec context drop, independent of the guard's first
+    // ref).
+    unsafe {
+      let raw = ctx.as_mut_ptr();
+      (*raw).hw_device_ctx = device_ref_for_ctx;
+      (*raw).opaque = callback_state.cast();
+      (*raw).get_format = Some(get_hw_format);
+    }
+
+    // Open the decoder. On failure `ctx`/`opened` Drop releases the codec
+    // context (and via that the second device ref); the guard releases the
+    // first device ref and the callback state.
+    //
+    // We deliberately bypass `Opened::video()` because it calls
+    // `Context::medium()`, which reads `AVCodecContext.codec_type` as the
+    // bindgen `AVMediaType` enum — undefined behaviour if the runtime
+    // value is not a known discriminant. Instead: validate `codec_type`
+    // as a raw `c_int` ourselves, then construct the `decoder::Video`
+    // wrapper directly via its public tuple field.
+    let opened = ctx.decoder().open_as(codec).map_err(Error::Ffmpeg)?;
+
+    // Validate codec_type as a raw integer — never construct AVMediaType
+    // from an unvalidated runtime value.
+    // SAFETY: codec_type is bound as AVMediaType (`#[repr(i32)]`), same
+    // size and alignment as i32; reading the bytes as i32 cannot be UB.
+    let codec_type_int: i32 =
+      unsafe { ptr::read(ptr::addr_of!((*opened.as_ptr()).codec_type) as *const i32) };
+    let video_type_int: i32 = AVMediaType::AVMEDIA_TYPE_VIDEO as i32;
+    if codec_type_int != video_type_int {
+      // Not a video codec context — surface the same error
+      // `Opened::video()` would have, without going through enum
+      // construction. `opened`'s Drop releases the codec context; the
+      // guard releases the first hw_device_ref and the callback state.
+      return Err(Error::Ffmpeg(ffmpeg_next::Error::InvalidData));
+    }
+    // Construction: `decoder::Video` is `pub struct Video(pub Opened)`.
+    // We construct via the public field; this is the same wrapping
+    // `Opened::video()` does on success, just without the enum read.
+    let opened = ffmpeg_next::decoder::Video(opened);
+
+    // Disarm the guard and transfer ownership of both resources into the
+    // returned DecoderState (whose own Drop handles their lifetime).
+    let (hw_device_ref, callback_state) = guard.into_owned();
+    Ok(DecoderState {
+      inner: ManuallyDrop::new(opened),
+      backend,
+      hw_device_ref,
+      callback_state,
+    })
+  }
+}
+
+/// Cleanup guard used by [`VideoDecoder::build_state`] while a
+/// `DecoderState` is only partially assembled.
+///
+/// It holds the two resources `build_state` has acquired by hand at that
+/// point: `hw_device_ref`, the first reference returned by
+/// `av_hwdevice_ctx_create` (distinct from the second reference, which
+/// FFmpeg unrefs when the codec context drops), and the boxed
+/// `CallbackState` stored in `AVCodecContext::opaque`, which FFmpeg never
+/// frees because `opaque` is purely user-owned.
+///
+/// Any early `Err` return from `build_state` (`av_buffer_ref` ENOMEM,
+/// `open_as` failure, codec_type mismatch, ...) drops the guard and
+/// thereby releases both. On success [`Self::into_owned`] hands the
+/// pointers to the new `DecoderState` and nothing is released here.
+struct PartialBuildState {
+  hw_device_ref: *mut AVBufferRef,
+  callback_state: *mut CallbackState,
+}
+
+impl PartialBuildState {
+  /// Consume the guard without running its cleanup and return the two
+  /// pointers; ownership of both passes to the caller.
+  fn into_owned(self) -> (*mut AVBufferRef, *mut CallbackState) {
+    let owned = (self.hw_device_ref, self.callback_state);
+    // Skip `Drop` entirely: the pointers now belong to the caller.
+    std::mem::forget(self);
+    owned
+  }
+}
+
+impl Drop for PartialBuildState {
+  fn drop(&mut self) {
+    let (mut device, callback) = (self.hw_device_ref, self.callback_state);
+    // SAFETY: each pointer is either null or a resource `build_state`
+    // obtained (`av_hwdevice_ctx_create` / `Box::into_raw`) and still
+    // owns. Null pointers are skipped, so neither `av_buffer_unref` nor
+    // `Box::from_raw` ever sees one.
+    unsafe {
+      if !device.is_null() {
+        // Unref through a local copy; the field itself is left as-is.
+        av_buffer_unref(&mut device);
+      }
+      if !callback.is_null() {
+        drop(Box::from_raw(callback));
+      }
+    }
+  }
+}
+
+/// Copy a decoded hardware frame into the CPU-side [`Frame`] `dst`.
+///
+/// `dst` is unref'd before the transfer, so a destination still holding
+/// an older frame (possibly of another resolution) can be reused.
+///
+/// `av_frame_copy_props` is intentionally not called: it would also
+/// duplicate side data, the metadata dictionary and the opaque/private
+/// buffer refs on every receive — stream-driven per-frame allocations
+/// for data `Frame` does not expose. Only the scalars handled by
+/// `copy_frame_props_minimal` (currently `pts`) are carried over; extend
+/// that helper when `Frame` gains accessors for more fields.
+///
+/// # Safety
+/// Both frames must wrap valid, non-null `AVFrame`s; `src` is expected to
+/// hold a hardware frame (otherwise the transfer returns an error).
+unsafe fn transfer_hw_frame(
+  dst: &mut Frame,
+  src: &mut frame::Video,
+) -> std::result::Result<(), ffmpeg_next::Error> {
+  unsafe {
+    let out = dst.as_inner_mut().as_mut_ptr();
+    av_frame_unref(out);
+    let status = av_hwframe_transfer_data(out, src.as_ptr(), 0);
+    if status >= 0 {
+      copy_frame_props_minimal(out, src.as_ptr());
+      return Ok(());
+    }
+    Err(ffmpeg_next::Error::from(status))
+  }
+}
+
+/// Constant-cost stand-in for `av_frame_copy_props`.
+///
+/// Copies just `pts` from `src` to `dst` — the only scalar the public
+/// `Frame` API needs today. Everything `av_frame_copy_props` would have
+/// to allocate for (metadata, side data, `opaque_ref` / `private_ref`) is
+/// skipped, so the work is O(1) per frame whatever the source carries.
+///
+/// # Safety
+/// `dst` and `src` must be valid `AVFrame` pointers; only `pts` is touched.
+unsafe fn copy_frame_props_minimal(dst: *mut AVFrame, src: *const AVFrame) {
+  unsafe {
+    let presentation_ts = (*src).pts;
+    (*dst).pts = presentation_ts;
+  }
+}
+
+/// True for `EAGAIN` and `Eof`: flow-control results that the send paths
+/// pass through to the caller rather than counting as a backend failure.
+fn is_transient(e: &ffmpeg_next::Error) -> bool {
+  matches!(e, ffmpeg_next::Error::Eof) || is_eagain(e)
+}
+
+/// Guard against a `codec::Parameters` whose inner pointer is null.
+///
+/// Such a value can reach us from safe code (ffmpeg-next does not check
+/// `avcodec_parameters_alloc`), and the next field read through it would
+/// be a null dereference. It is reported as `ENOMEM` instead.
+fn ensure_parameters_non_null(parameters: &codec::Parameters) -> Result<()> {
+  // SAFETY: only the pointer value is inspected; nothing is dereferenced.
+  let raw = unsafe { parameters.as_ptr() };
+  if !raw.is_null() {
+    return Ok(());
+  }
+  let oom = ffmpeg_next::Error::Other {
+    errno: libc::ENOMEM,
+  };
+  Err(Error::Ffmpeg(oom))
+}
+
+/// Checked replacement for `frame::Video::empty()`.
+///
+/// ffmpeg-next does not report an `av_frame_alloc` failure, leaving a null
+/// frame pointer behind; that case is turned into `ENOMEM` here.
+fn alloc_av_frame() -> std::result::Result<frame::Video, ffmpeg_next::Error> {
+  let fresh = frame::Video::empty();
+  // SAFETY: the pointer is only compared against null, never dereferenced.
+  match unsafe { fresh.as_ptr() }.is_null() {
+    false => Ok(fresh),
+    true => Err(ffmpeg_next::Error::Other {
+      errno: libc::ENOMEM,
+    }),
+  }
+}
+
+/// Allocate a codec `Context` and load `parameters` into it, with every
+/// FFmpeg return value checked.
+///
+/// ffmpeg-next's `Context::from_parameters` does not null-check
+/// `avcodec_alloc_context3`; here an allocation failure becomes `ENOMEM`,
+/// and a failing `avcodec_parameters_to_context` frees the context before
+/// its error code is returned.
+fn build_codec_context(parameters: &codec::Parameters) -> Result<Context> {
+  ensure_parameters_non_null(parameters)?;
+  // SAFETY: avcodec_alloc_context3(NULL) yields either a fresh
+  // AVCodecContext or NULL when allocation fails.
+  let mut raw_ctx = unsafe { avcodec_alloc_context3(ptr::null()) };
+  if raw_ctx.is_null() {
+    let oom = ffmpeg_next::Error::Other {
+      errno: libc::ENOMEM,
+    };
+    return Err(Error::Ffmpeg(oom));
+  }
+  // SAFETY: `raw_ctx` is non-null and freshly allocated, and `parameters`
+  // was checked for a null inner pointer above.
+  let status = unsafe { avcodec_parameters_to_context(raw_ctx, parameters.as_ptr()) };
+  if status >= 0 {
+    // SAFETY: `raw_ctx` is valid; with `owner: None` the wrapper owns it
+    // and `Context::drop` will run `avcodec_free_context`.
+    return Ok(unsafe { Context::wrap(raw_ctx, None) });
+  }
+  // SAFETY: `raw_ctx` came from the allocation above and was never shared.
+  unsafe { avcodec_free_context(&mut raw_ctx) };
+  Err(Error::Ffmpeg(ffmpeg_next::Error::from(status)))
+}
+
+/// Deep-copy a `codec::Parameters`, checking every FFmpeg call.
+///
+/// ffmpeg-next's `Parameters::clone` checks neither the
+/// `avcodec_parameters_alloc` result nor the `avcodec_parameters_copy`
+/// return code, so under `ENOMEM` it can yield a `Parameters` with a null
+/// inner pointer — UB once that reaches FFmpeg. This version refuses a
+/// null source, reports allocation failure as `ENOMEM`, and frees the
+/// half-built copy before returning a copy error.
+///
+/// The result is wrapped with `owner: None`, so it carries no `Rc` link
+/// back to the caller's demuxer (the reason for deep-cloning at all — see
+/// the Send-safety notes on `VideoDecoder::open`).
+///
+/// Returns the AVERROR from FFmpeg, or `ENOMEM` for either null case.
+fn try_clone_parameters(
+  src: &codec::Parameters,
+) -> std::result::Result<codec::Parameters, ffmpeg_next::Error> {
+  let out_of_memory = || ffmpeg_next::Error::Other {
+    errno: libc::ENOMEM,
+  };
+  // SAFETY: only the pointer value is inspected here. A null source must
+  // be refused up front because `avcodec_parameters_copy` would
+  // dereference it.
+  let src_ptr = unsafe { src.as_ptr() };
+  if src_ptr.is_null() {
+    return Err(out_of_memory());
+  }
+  // SAFETY: returns a fresh AVCodecParameters or NULL on allocation failure.
+  let mut copy_ptr = unsafe { avcodec_parameters_alloc() };
+  if copy_ptr.is_null() {
+    return Err(out_of_memory());
+  }
+  // SAFETY: both pointers are non-null; `copy_ptr` is freshly allocated
+  // and exclusively ours.
+  let status = unsafe { avcodec_parameters_copy(copy_ptr, src_ptr) };
+  if status < 0 {
+    // SAFETY: `copy_ptr` was allocated above and never handed out.
+    unsafe { avcodec_parameters_free(&mut copy_ptr) };
+    return Err(ffmpeg_next::Error::from(status));
+  }
+  // SAFETY: `copy_ptr` is a valid AVCodecParameters; with `owner: None`
+  // the wrapper owns it and `Parameters::drop` frees it.
+  Ok(unsafe { codec::Parameters::wrap(copy_ptr, None) })
+}
+
+/// Checked counterpart to `Packet::clone()`.
+///
+/// ffmpeg-next's `clone_from` ignores the return value of `av_packet_ref`,
+/// so on `ENOMEM` the caller ends up holding an empty packet it believes
+/// is a copy — which would corrupt later probe replay. Here the AVERROR is
+/// returned instead. The clone references `src`'s refcounted payload
+/// buffer rather than deep-copying it, which is sufficient because replay
+/// only feeds packets to `avcodec_send_packet` and never needs a writable
+/// buffer.
+fn try_clone_packet(src: &Packet) -> std::result::Result<Packet, ffmpeg_next::Error> {
+  let mut copy = Packet::empty();
+  // SAFETY: `copy` is a freshly created empty packet and `src` is a live
+  // one; `av_packet_ref` either fills `copy` from `src` or returns a
+  // negative AVERROR.
+  match unsafe { av_packet_ref(copy.as_mut_ptr(), src.as_ptr()) } {
+    code if code < 0 => Err(ffmpeg_next::Error::from(code)),
+    _ => Ok(copy),
+  }
+}
+
+/// Bytes a probe-buffered clone of `packet` retains for side data: the
+/// `size` of every walked `AVPacket.side_data` entry plus
+/// `SIDE_DATA_ENTRY_OVERHEAD` per entry (descriptor + AVBufferRef +
+/// allocator bookkeeping). `av_packet_ref` deep-copies side data via
+/// `av_packet_copy_props`, so every clone pays for all of it. Charging
+/// the overhead as well keeps `MAX_PROBE_PACKET_BYTES` a true upper bound;
+/// otherwise many zero-size entries slip past on pure descriptor cost.
+///
+/// At most `max_entries` entries are walked, even if `side_data_elems`
+/// claims more, and `max_entries == 0` yields 0. This is defense-in-depth
+/// against a corrupt or hostile packet whose `side_data_elems` overstates
+/// the real array length. Callers are still expected to reject packets
+/// whose count exceeds the cap (so the inflated clone is never created);
+/// the bound here just guarantees that a stale or weaponised count cannot
+/// turn a safe call into an unbounded raw-pointer scan.
+///
+/// Only the `size` field of each `AVPacketSideData` is read. The bindgen
+/// `AVPacketSideDataType` enum is never touched, so there is no UB even
+/// if a future FFmpeg introduces a side-data type discriminant this build
+/// does not know.
+fn packet_side_data_bytes(packet: &Packet, max_entries: usize) -> usize {
+  if max_entries == 0 {
+    return 0;
+  }
+  // SAFETY: `side_data` (`*mut AVPacketSideData`) and `side_data_elems`
+  // (`c_int`) are raw struct fields, safe to read. Projecting `.size` does
+  // not reconstruct the enum-typed `type_` field, so the bindgen-enum UB
+  // hazard does not apply here.
+  unsafe {
+    let pkt = packet.as_ptr();
+    let entries = (*pkt).side_data;
+    let reported = (*pkt).side_data_elems;
+    if entries.is_null() || reported <= 0 {
+      return 0;
+    }
+    let walk = (reported as usize).min(max_entries);
+    (0..walk).fold(walk.saturating_mul(SIDE_DATA_ENTRY_OVERHEAD), |acc, i| {
+      acc.saturating_add((*entries.add(i)).size)
+    })
+  }
+}
+
+/// How many `AVPacketSideData` entries `packet` reports. The probe buffer
+/// checks this against [`MAX_PROBE_PACKET_SIDE_DATA_ENTRIES`] before
+/// cloning, so a packet whose entry count alone would dominate retained
+/// memory is turned away up front.
+fn packet_side_data_count(packet: &Packet) -> usize {
+  // SAFETY: side_data_elems is a `c_int` field, safe to read.
+  let reported = unsafe { (*packet.as_ptr()).side_data_elems };
+  // Zero and negative counts both report as zero entries.
+  match reported {
+    n if n > 0 => n as usize,
+    _ => 0,
+  }
+}
+
+/// `true` only for `EAGAIN`; EOF is excluded because FFmpeg's send/receive
+/// state machine separates "drain output and retry" from "stream over".
+fn is_eagain(e: &ffmpeg_next::Error) -> bool {
+  matches!(*e, ffmpeg_next::Error::Other { errno } if errno == ffmpeg_next::error::EAGAIN)
+}
+
+/// Resolve the decoder for `parameters` while bypassing the bindgen
+/// `AVCodecID` Rust enum. The codec_id field is read as a raw `u32`
+/// through `addr_of!`, so a value outside our build's discriminant set
+/// never invokes UB.
+fn find_decoder(parameters: &codec::Parameters) -> Result<Codec> {
+  ensure_parameters_non_null(parameters)?;
+  // SAFETY: the inner pointer is non-null (checked above). `addr_of!`
+  // projects to the codec_id field, and the cast to `*const u32` is sound
+  // because AVCodecID is `#[repr(u32)]` (same size and alignment as u32).
+  // Reading as u32 cannot be UB regardless of the value FFmpeg wrote.
+  let id_bits: u32 =
+    unsafe { ptr::addr_of!((*parameters.as_ptr()).codec_id).cast::<u32>().read() };
+
+  // Go through our local `c_int`-typed shim for C `avcodec_find_decoder`,
+  // so that no `AVCodecID` enum value is ever constructed from `id_bits`.
+  // SAFETY: avcodec_find_decoder is a pure FFmpeg lookup; passing any
+  // c_int is sound (it returns NULL for unknown ids).
+  let found = unsafe { c_shims::avcodec_find_decoder(id_bits as libc::c_int) };
+  if found.is_null() {
+    Err(Error::NoCodec(id_bits))
+  } else {
+    // SAFETY: `found` is a non-null *const AVCodec into FFmpeg's static
+    // codec table; it lives for the duration of the program.
+    Ok(unsafe { Codec::wrap(found) })
+  }
+}
+
+/// Drain output frames from a candidate decoder during probe replay,
+/// transferring each one from the candidate's HW context to a fresh CPU
+/// frame and queueing it. Returns `Ok(())` once the candidate signals
+/// EAGAIN/EOF. The transfer happens while the candidate is still alive
+/// (its `AVHWFramesContext` is reachable); the resulting CPU frames remain
+/// valid after the candidate is committed because they hold their own
+/// buffer references with no dependency on the original device context.
+fn drain_into_pending(
+  decoder: &mut ffmpeg_next::decoder::Video,
+  hw_buf: &mut frame::Video,
+  pending: &mut VecDeque<frame::Video>,
+  pending_bytes: &mut usize,
+  max_bytes: usize,
+) -> std::result::Result<(), ffmpeg_next::Error> {
+  loop {
+    match decoder.receive_frame(hw_buf) {
+      Ok(()) => {
+        // Pre-transfer cap check: if we are already at or over either cap,
+        // the candidate is producing more than we can hold. Treat as an
+        // explicit candidate failure so `advance_probe` can try the next
+        // backend instead of committing a stream with silently-dropped
+        // frames in the middle.
+        //
+        // TODO: at very large frame sizes (8K HDR P010, > ~96 MiB each)
+        // even a single retained frame is significant. Future direction:
+        // memmap-backed pending frames (write to a temp file or shared
+        // memory segment) so the resident set stays bounded even when the
+        // byte cap is raised. Out of scope for now.
+        if pending.len() >= MAX_PROBE_PENDING_FRAMES || *pending_bytes >= max_bytes {
+          tracing::warn!(
+            frames = pending.len(),
+            bytes = *pending_bytes,
+            max_frames = MAX_PROBE_PENDING_FRAMES,
+            max_bytes = max_bytes,
+            "hwdecode: probe pending cap reached; failing candidate replay"
+          );
+          // SAFETY: hw_buf is owned and valid; unref of an empty frame is a no-op.
+          unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+          return Err(ffmpeg_next::Error::Other {
+            errno: libc::ENOMEM,
+          });
+        }
+        // Pre-transfer size guard: `av_hwframe_transfer_data` will
+        // allocate the CPU buffer based on `hw_buf`'s dimensions. If a
+        // single frame's worst-case footprint already pushes past the
+        // cap, refuse the candidate **before** allocating so RSS does
+        // not spike on a frame we'd immediately drop. Uses a width *
+        // height * `WORST_CASE_BYTES_PER_PIXEL` upper bound; the
+        // post-transfer accounting via `cpu_frame_bytes` below stays in
+        // place as a backstop using the actual stride/format.
+        // SAFETY: AVFrame.width/height are c_int reads.
+        let (w, h) = unsafe {
+          let raw = hw_buf.as_ptr();
+          ((*raw).width, (*raw).height)
+        };
+        let estimated_bytes = match estimate_transfer_bytes(hw_buf) {
+          Some(b) => b,
+          None => {
+            tracing::warn!(
+              width = w,
+              height = h,
+              "hwdecode: HW frame dimensions invalid for sizing; failing candidate replay"
+            );
+            // SAFETY: hw_buf is owned and valid.
+            unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+            return Err(ffmpeg_next::Error::Other {
+              errno: libc::ENOMEM,
+            });
+          }
+        };
+        let estimated_total = pending_bytes.saturating_add(estimated_bytes);
+        if estimated_total > max_bytes {
+          tracing::warn!(
+            pending_bytes = *pending_bytes,
+            estimated_bytes,
+            width = w,
+            height = h,
+            max_bytes = max_bytes,
+            "hwdecode: pre-transfer size estimate exceeds cap; \
+             refusing candidate replay before allocating CPU frame"
+          );
+          // SAFETY: hw_buf is owned and valid.
+          unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+          return Err(ffmpeg_next::Error::Other {
+            errno: libc::ENOMEM,
+          });
+        }
+        // SAFETY (closure): hw_buf is owned and valid; release it on failure.
+        let mut cpu = alloc_av_frame()
+          .inspect_err(|_| unsafe { av_frame_unref(hw_buf.as_mut_ptr()) })?;
+        // SAFETY: hw_buf is a freshly-decoded HW frame; `av_hwframe_transfer_data`
+        // allocates pixel buffers on `cpu`. hw_buf is unref'd if the transfer fails.
+        unsafe {
+          let r1 = av_hwframe_transfer_data(cpu.as_mut_ptr(), hw_buf.as_ptr(), 0);
+          if r1 < 0 {
+            av_frame_unref(hw_buf.as_mut_ptr());
+            return Err(ffmpeg_next::Error::from(r1));
+          }
+        }
+        let pixel_bytes = match cpu_frame_bytes(&cpu) {
+          Some(b) => b,
+          None => {
+            // Unknown pix_fmt or vertically-flipped layout — we cannot
+            // bound this frame's contribution against the byte cap, so up
+            // to MAX_PROBE_PENDING_FRAMES of them could exhaust memory.
+            // Fail the candidate so probing tries the next backend
+            // rather than queueing untracked allocations.
+            // SAFETY: AVFrame.format is c_int, safe to read.
+            let pix_fmt: i32 = unsafe { (*cpu.as_ptr()).format };
+            tracing::warn!(
+              pix_fmt,
+              "hwdecode: cannot size unknown CPU pix_fmt during replay; failing candidate"
+            );
+            // SAFETY: hw_buf is owned and valid. `cpu` drops on return.
+            unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+            return Err(ffmpeg_next::Error::Other {
+              errno: libc::ENOMEM,
+            });
+          }
+        };
+        let new_total = pending_bytes.saturating_add(pixel_bytes);
+        if new_total > max_bytes {
+          tracing::warn!(
+            pending_bytes = *pending_bytes,
+            pixel_bytes,
+            max_bytes,
+            "hwdecode: queueing this frame would exceed byte cap; \
+             failing candidate replay"
+          );
+          // SAFETY: hw_buf is owned and valid. `cpu` drops on return
+          // without ever paying a metadata deep copy.
+          unsafe { av_frame_unref(hw_buf.as_mut_ptr()) };
+          return Err(ffmpeg_next::Error::Other {
+            errno: libc::ENOMEM,
+          });
+        }
+        // Cap check passed — copy only the scalar fields the public API needs
+        // (`copy_frame_props_minimal`: just `pts`), not `av_frame_copy_props`,
+        // for the same reason as `transfer_hw_frame`: `Frame` exposes no side
+        // data / metadata / opaque refs, so deep-copying them per frame is pure
+        // cost and an unbounded allocation source on attacker-controlled
+        // streams. SAFETY: cpu and hw_buf are both valid AVFrames we own.
+        unsafe {
+          copy_frame_props_minimal(cpu.as_mut_ptr(), hw_buf.as_ptr());
+        }
+        *pending_bytes = new_total;
+        pending.push_back(cpu);
+      }
+      Err(e) if is_transient(&e) => return Ok(()),
+      Err(e) => return Err(e),
+    }
+  }
+}
+
+/// Deliberately generous upper bound on what `av_hwframe_transfer_data`
+/// will allocate when `hw_buf` is downloaded into a CPU frame.
+/// [`drain_into_pending`] consults it as a pre-transfer guard, so that a
+/// candidate replay can turn down a frame whose footprint would blow the
+/// byte budget *without* paying for the allocation first. The figure is
+/// `width * height * WORST_CASE_BYTES_PER_PIXEL`; that constant documents
+/// why the estimate errs on the high side.
+///
+/// Yields `None` unless both `width` and `height` are strictly positive.
+/// The caller treats that as a candidate failure, since a HW frame with
+/// non-positive dimensions cannot be transferred meaningfully.
+fn estimate_transfer_bytes(hw_buf: &frame::Video) -> Option<usize> {
+  // SAFETY: AVFrame.width / height are c_int reads.
+  let (width, height) = unsafe {
+    let av = hw_buf.as_ptr();
+    ((*av).width, (*av).height)
+  };
+  // Both dimensions are known positive inside the branch, so the `as`
+  // casts cannot wrap; saturation guards the multiplications.
+  if width > 0 && height > 0 {
+    let pixels = (width as usize).saturating_mul(height as usize);
+    Some(pixels.saturating_mul(WORST_CASE_BYTES_PER_PIXEL))
+  } else {
+    None
+  }
+}
+
+/// Approximate resident size of a CPU frame, computed as the sum of
+/// `linesize[plane] * plane_height` over its populated planes.
+///
+/// `None` is returned for pixel formats missing from our
+/// chroma-subsampling table and for frames with a negative `linesize`.
+/// Either case is an allocation we cannot account for, so the caller
+/// refuses to queue the frame; answering 0 instead would silently bypass
+/// the byte cap and admit unbounded large frames into `pending_frames`.
+///
+/// `linesize == 0` is FFmpeg's sentinel for "no more populated planes"
+/// and merely ends the scan. `linesize < 0` is FFmpeg's vertically-flipped
+/// layout: `Frame::row` rejects it as unusable, so queueing such a frame
+/// during probe replay would only postpone the failure to the consumer
+/// while wasting `|linesize| * plane_h` bytes of unaccounted memory.
+fn cpu_frame_bytes(frame: &frame::Video) -> Option<usize> {
+  // SAFETY: AVFrame.height / format / linesize are c_int reads.
+  let (rows, format, strides) = unsafe {
+    let av = frame.as_ptr();
+    ((*av).height as usize, (*av).format, (*av).linesize)
+  };
+  // Fold over the planes in order, starting from zero bytes. Any plane
+  // that cannot be sized aborts the whole fold with `None`, exactly like
+  // an early return would.
+  strides
+    .iter()
+    .enumerate()
+    // A zero linesize ends the populated planes — FFmpeg zeroes the
+    // trailing entries — so nothing after it is visited. A genuinely
+    // empty frame (zero in plane 0) therefore folds to `Some(0)`: there
+    // is nothing to account for.
+    .take_while(|(_, linesize)| **linesize != 0)
+    .try_fold(0usize, |total, (plane, linesize)| {
+      if *linesize < 0 {
+        // Vertically-flipped layout — refuse to size so
+        // `drain_into_pending` fails the candidate. Answering `Some(0)`
+        // here would let up to MAX_PROBE_PENDING_FRAMES frames of
+        // unaccounted memory through.
+        return None;
+      }
+      let stride = *linesize as usize;
+      // A populated plane we cannot size means the format is outside our
+      // table — refuse to size the frame at all (conservative; discarding
+      // is safer than under-counting against the byte cap).
+      let plane_h = crate::frame::plane_height_for(format, plane, rows)?;
+      let plane_bytes = stride.saturating_mul(plane_h);
+      Some(total.saturating_add(plane_bytes))
+    })
+}
+
+#[allow(dead_code)] // compile-time Send check; the test module calls it explicitly
+fn _assert_send() {
+  fn requires_send<T: Send>() {}
+  requires_send::<VideoDecoder>();
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn no_codec_for_unknown_id() {
+    let rendered = Error::NoCodec(0).to_string();
+    assert!(rendered.contains("no decoder"));
+  }
+
+  #[test]
+  fn videodecoder_is_send() {
+    _assert_send(); // the `T: Send` bound inside is enforced when this compiles
+  }
+
+  #[test]
+  fn is_transient_recognises_eagain_and_eof() {
+    let retry_later = ffmpeg_next::Error::Other {
+      errno: ffmpeg_next::error::EAGAIN,
+    };
+    let hard_error = ffmpeg_next::Error::InvalidData;
+    assert!(is_transient(&ffmpeg_next::Error::Eof));
+    assert!(is_transient(&retry_later));
+    assert!(!is_transient(&hard_error));
+  }
+
+  /// Regression: `codec::Parameters` wrapping a null inner pointer has to
+  /// be refused at the entrypoint rather than deref'd. ffmpeg-next's
+  /// `Parameters::new()` does not check `avcodec_parameters_alloc()`, so
+  /// under OOM a safe caller can hand us exactly such a value.
+  #[test]
+  fn open_rejects_null_parameters() {
+    // SAFETY: Parameters::wrap takes any pointer, and a null inner one is
+    // built here on purpose. avcodec_parameters_free is null-safe on Drop.
+    let params = unsafe { codec::Parameters::wrap(std::ptr::null_mut(), None) };
+    match VideoDecoder::open(params) {
+      Err(Error::Ffmpeg(ffmpeg_next::Error::Other { errno })) => {
+        assert_eq!(errno, libc::ENOMEM, "expected ENOMEM, got {errno}");
+      }
+      Err(other) => panic!("expected Ffmpeg(Other {{ ENOMEM }}), got {other:?}"),
+      Ok(_) => panic!("open should fail on null parameters"),
+    }
+  }
+
+  #[test]
+  fn open_with_rejects_null_parameters() {
+    // SAFETY: see open_rejects_null_parameters.
+    let params = unsafe { codec::Parameters::wrap(std::ptr::null_mut(), None) };
+    match VideoDecoder::open_with(params, Backend::VideoToolbox) {
+      Err(Error::Ffmpeg(ffmpeg_next::Error::Other { errno })) => {
+        assert_eq!(errno, libc::ENOMEM, "expected ENOMEM, got {errno}");
+      }
+      Err(other) => panic!("expected Ffmpeg(Other {{ ENOMEM }}), got {other:?}"),
+      Ok(_) => panic!("open_with should fail on null parameters"),
+    }
+  }
+
+  /// `try_clone_packet` goes through `av_packet_ref`, which deep-copies
+  /// side data via `av_packet_copy_props`, so the probe budget has to
+  /// include side-data bytes. Otherwise a stream pairing a 16-byte payload
+  /// with a 1 MiB side-data attachment would consume only 16 bytes of the
+  /// 64 MiB budget per packet, and 256 buffered clones would retain
+  /// ~256 MiB of side data while logs claim a few KiB.
+  #[test]
+  fn packet_side_data_counts_against_probe_budget() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    const PAYLOAD_SIZE: usize = 16;
+    const SIDE_DATA_SIZE: usize = 1024 * 1024; // 1 MiB
+
+    let mut packet = Packet::new(PAYLOAD_SIZE);
+    // SAFETY: `packet` was allocated just above. av_packet_new_side_data
+    // attaches a new `SIDE_DATA_SIZE`-byte buffer of the requested type and
+    // returns a writable pointer to it (or NULL on OOM).
+    let attached = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA,
+        SIDE_DATA_SIZE,
+      )
+    };
+    assert!(!attached.is_null(), "av_packet_new_side_data returned NULL");
+    assert_eq!(packet.size(), PAYLOAD_SIZE);
+
+    let side = packet_side_data_bytes(&packet, MAX_PROBE_PACKET_SIDE_DATA_ENTRIES);
+    let total = packet.size().saturating_add(side);
+    assert!(
+      side >= SIDE_DATA_SIZE,
+      "side-data accounting must include the attached buffer; got {side}"
+    );
+    assert!(
+      total >= PAYLOAD_SIZE + SIDE_DATA_SIZE,
+      "probe budget must charge payload + side data; got {total}"
+    );
+  }
+
+  #[test]
+  fn packet_side_data_is_zero_when_no_side_data() {
+    let bare = Packet::new(64);
+    assert_eq!(packet_side_data_count(&bare), 0);
+    assert_eq!(
+      packet_side_data_bytes(&bare, MAX_PROBE_PACKET_SIDE_DATA_ENTRIES),
+      0
+    );
+  }
+
+  /// A packet carrying many tiny side-data entries has to be charged the
+  /// per-entry descriptor + ref overhead even when every entry's payload
+  /// `size` is zero. Without `SIDE_DATA_ENTRY_OVERHEAD`, a packet stuffed
+  /// with N zero-byte entries would charge 0 bytes against the budget
+  /// while `av_packet_ref` still allocates ~`N * 80` bytes of descriptor
+  /// + AVBufferRef + allocator overhead per cloned copy.
+  #[test]
+  fn packet_side_data_bytes_charges_descriptor_overhead_for_zero_size_entries() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    let mut packet = Packet::new(0);
+    // Two zero-byte entries with distinct types, so that the second call
+    // appends rather than replacing the first.
+    // SAFETY: `packet` is a live AVPacket we hold exclusively.
+    let first = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA,
+        0,
+      )
+    };
+    // SAFETY: as above.
+    let second = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_PALETTE,
+        0,
+      )
+    };
+    assert!(!first.is_null(), "av_packet_new_side_data NULL");
+    assert!(!second.is_null(), "av_packet_new_side_data NULL");
+
+    assert_eq!(packet_side_data_count(&packet), 2);
+    let bytes = packet_side_data_bytes(&packet, MAX_PROBE_PACKET_SIDE_DATA_ENTRIES);
+    assert!(
+      bytes >= 2 * SIDE_DATA_ENTRY_OVERHEAD,
+      "must charge descriptor overhead per entry even at zero payload; got {bytes}"
+    );
+  }
+
+  /// `packet_side_data_bytes` has to clamp its walk to `max_entries` no
+  /// matter what `side_data_elems` says. Defense-in-depth: callers are
+  /// expected to short-circuit packets whose count exceeds the cap, but
+  /// should a corrupt or weaponised packet reach the helper anyway, the
+  /// internal cap prevents an unbounded raw-pointer walk.
+  ///
+  /// Five entries of distinct types are attached and the helper is asked
+  /// to walk only the first 2. The result must be exactly `2 * overhead +
+  /// (size_a + size_b)`, i.e. entries 3-5 contributed nothing.
+  #[test]
+  fn packet_side_data_bytes_respects_max_entries_cap() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    let mut packet = Packet::new(0);
+    // Distinct types make every call append rather than replace.
+    let entries: [(AVPacketSideDataType, usize); 5] = [
+      (AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA, 100),
+      (AVPacketSideDataType::AV_PKT_DATA_PALETTE, 200),
+      (AVPacketSideDataType::AV_PKT_DATA_REPLAYGAIN, 300),
+      (AVPacketSideDataType::AV_PKT_DATA_DISPLAYMATRIX, 400),
+      (AVPacketSideDataType::AV_PKT_DATA_STEREO3D, 500),
+    ];
+    for (ty, size) in entries {
+      // SAFETY: `packet` is a live AVPacket we hold exclusively.
+      let p = unsafe { av_packet_new_side_data(packet.as_mut_ptr(), ty, size) };
+      assert!(!p.is_null(), "av_packet_new_side_data returned NULL");
+    }
+    assert_eq!(packet_side_data_count(&packet), 5);
+
+    let first_two = packet_side_data_bytes(&packet, 2);
+    assert_eq!(
+      first_two,
+      2 * SIDE_DATA_ENTRY_OVERHEAD + 100 + 200,
+      "max_entries=2 must walk exactly the first two entries"
+    );
+
+    let all_five = packet_side_data_bytes(&packet, 5);
+    assert_eq!(
+      all_five,
+      5 * SIDE_DATA_ENTRY_OVERHEAD + 100 + 200 + 300 + 400 + 500,
+      "max_entries=5 must walk all five entries"
+    );
+    // max_entries=0 short-circuits to 0.
+    assert_eq!(packet_side_data_bytes(&packet, 0), 0);
+    // A cap above the actual count clamps to the actual count (no
+    // out-of-bounds walk past `side_data_elems`).
+    let oversized_cap = packet_side_data_bytes(&packet, 1_000_000);
+    assert_eq!(oversized_cap, all_five);
+  }
+
+  /// `MAX_PROBE_PACKET_SIDE_DATA_ENTRIES` is the cliff above which the
+  /// probe buffer rejects a packet regardless of byte total, because pure
+  /// descriptor inflation is its own attack vector. Sanity-check that
+  /// `packet_side_data_count` reports the value the cap is checked against.
+  #[test]
+  fn packet_side_data_count_reports_attached_entries() {
+    use ffmpeg_next::ffi::{av_packet_new_side_data, AVPacketSideDataType};
+
+    let mut packet = Packet::new(0);
+    // SAFETY (both calls): `packet` is a live AVPacket we hold exclusively.
+    let _first = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_NEW_EXTRADATA,
+        4,
+      )
+    };
+    let _second = unsafe {
+      av_packet_new_side_data(
+        packet.as_mut_ptr(),
+        AVPacketSideDataType::AV_PKT_DATA_PALETTE,
+        4,
+      )
+    };
+    assert_eq!(packet_side_data_count(&packet), 2);
+  }
+
+  /// `cpu_frame_bytes` has to refuse a frame whose first plane carries a
+  /// negative `linesize`. Pre-fix, the loop break handled negative exactly
+  /// like zero (FFmpeg's "no more populated planes" sentinel), so a
+  /// vertically-flipped frame came back as `Some(0)` and
+  /// `drain_into_pending` queued it as a 0-byte allocation — which let up
+  /// to `MAX_PROBE_PENDING_FRAMES` such frames bypass the configured byte
+  /// budget entirely.
+  #[test]
+  fn cpu_frame_bytes_rejects_negative_first_plane_linesize() {
+    let mut flipped = frame::Video::empty();
+    // SAFETY: `flipped` is freshly allocated; only plain integer fields are
+    // written: `format` becomes NV12 and both linesizes go negative
+    // (FFmpeg's vertical-flip convention). No data buffer is attached, so
+    // cpu_frame_bytes must reject before any pointer dereference.
+    unsafe {
+      let av = flipped.as_mut_ptr();
+      (*av).linesize[0] = -1920;
+      (*av).linesize[1] = -1920;
+      (*av).width = 1920;
+      (*av).height = 1080;
+      (*av).format = crate::pix_fmt::NV12;
+    }
+    assert!(
+      cpu_frame_bytes(&flipped).is_none(),
+      "negative linesize must be unsizeable, not Some(0)"
+    );
+  }
+
+  /// Positive-path sanity check: for a synthesized NV12 frame with valid
+  /// linesizes the result is the sum over populated planes (Y at full
+  /// height + UV at half height).
+  #[test]
+  fn cpu_frame_bytes_sums_populated_planes() {
+    let stride = 1920usize;
+    let height = 1080usize;
+    let expected = stride * height + stride * (height / 2);
+    let mut nv12 = frame::Video::empty();
+    // SAFETY: freshly allocated frame; only primitive struct fields change.
+    unsafe {
+      let av = nv12.as_mut_ptr();
+      (*av).format = crate::pix_fmt::NV12;
+      (*av).width = 1920;
+      (*av).height = height as i32;
+      (*av).linesize[0] = stride as i32;
+      (*av).linesize[1] = stride as i32;
+    }
+    assert_eq!(cpu_frame_bytes(&nv12), Some(expected));
+  }
+
+  /// A zero linesize in plane 0 means "no populated planes", so the
+  /// answer must be `Some(0)` rather than `None`. This separates the
+  /// FFmpeg sentinel from the vertically-flipped layout.
+  #[test]
+  fn cpu_frame_bytes_zero_first_plane_returns_zero() {
+    // A default-allocated empty AVFrame already has every linesize at zero.
+    let untouched = frame::Video::empty();
+    assert_eq!(cpu_frame_bytes(&untouched), Some(0));
+  }
+
+  /// `estimate_transfer_bytes` is the pre-transfer size guard used by
+  /// `drain_into_pending`. It has to yield `width * height *
+  /// WORST_CASE_BYTES_PER_PIXEL`, so that a candidate replay can refuse a
+  /// frame *before* `av_hwframe_transfer_data` allocates.
+  #[test]
+  fn estimate_transfer_bytes_uses_worst_case_per_pixel() {
+    let mut probe = frame::Video::empty();
+    // SAFETY: `probe` is freshly allocated and only its width/height
+    // integer fields are written; no pixel buffer is attached and none is
+    // needed, because the estimate looks at the dimensions alone.
+    unsafe {
+      let av = probe.as_mut_ptr();
+      (*av).height = 1080;
+      (*av).width = 1920;
+    }
+    let expected = 1920 * 1080 * WORST_CASE_BYTES_PER_PIXEL;
+    assert_eq!(estimate_transfer_bytes(&probe), Some(expected));
+  }
+
+  /// Non-positive dimensions come back as `None`, so `drain_into_pending`
+  /// fails the candidate before anything is allocated. Under the raw
+  /// multiplication a zero-width or zero-height frame would silently give
+  /// a 0-byte estimate, letting the cap check pass and exposing the
+  /// allocation path to whatever the actual transfer would do.
+  #[test]
+  fn estimate_transfer_bytes_rejects_non_positive_dimensions() {
+    let mut sample = frame::Video::empty();
+    // SAFETY: freshly allocated frame; only integer fields are written.
+    unsafe {
+      (*sample.as_mut_ptr()).width = 0;
+      (*sample.as_mut_ptr()).height = 1080;
+    }
+    assert!(estimate_transfer_bytes(&sample).is_none());
+    // SAFETY: as above.
+    unsafe {
+      (*sample.as_mut_ptr()).width = 1920;
+      (*sample.as_mut_ptr()).height = -1;
+    }
+    assert!(estimate_transfer_bytes(&sample).is_none());
+  }
+
+  /// An 8K HDR P010 frame is really ~96 MiB resident. The estimate should
+  /// over-charge it (the right side to err on for a memory cap) while a
+  /// single frame still fits within the configurable
+  /// [`DEFAULT_MAX_PROBE_PENDING_BYTES`] cap (256 MiB), so that a
+  /// default-configured decoder is not forced to reject 8K streams
+  /// outright.
+  #[test]
+  fn estimate_transfer_bytes_8k_fits_default_cap() {
+    let mut uhd = frame::Video::empty();
+    // SAFETY: freshly allocated frame; only width/height are written.
+    unsafe {
+      let av = uhd.as_mut_ptr();
+      (*av).height = 4320;
+      (*av).width = 7680;
+    }
+    let estimate = estimate_transfer_bytes(&uhd).expect("8K is sizable");
+    assert!(
+      estimate <= DEFAULT_MAX_PROBE_PENDING_BYTES,
+      "8K estimate {estimate} must fit DEFAULT_MAX_PROBE_PENDING_BYTES \
+       {DEFAULT_MAX_PROBE_PENDING_BYTES}; otherwise the default cap rejects \
+       even a single 8K frame at probe time"
+    );
+    // It must also be strictly larger than a typical 8K P010 frame
+    // (~96 MiB), so that the guard is conservative, not under-charging.
+    assert!(
+      estimate > 96 * 1024 * 1024,
+      "estimate must over-charge real 8K P010 to bound the worst case; got {estimate}"
+    );
+  }
+
+  /// Dropping a `PartialBuildState` whose two pointers are null (the state
+  /// `into_owned` leaves behind) has to be a no-op. A panic / double-free
+  /// here would break the success path of every `build_state` call.
+  #[test]
+  fn partial_build_state_drop_is_no_op_on_null_pointers() {
+    drop(PartialBuildState {
+      hw_device_ref: ptr::null_mut(),
+      callback_state: ptr::null_mut(),
+    });
+    // Reaching this line means Drop neither panicked nor crashed.
+  }
+
+  /// `into_owned` has to hand back the original pointers and disarm the
+  /// guard, so that the guard's Drop becomes a no-op and the caller can
+  /// move ownership into `DecoderState` without a double free.
+  #[test]
+  fn partial_build_state_into_owned_disarms_and_returns_originals() {
+    use ffmpeg_next::ffi::{av_buffer_alloc, av_buffer_unref, AVPixelFormat};
+
+    // SAFETY: av_buffer_alloc hands back a fresh AVBufferRef* with refcount
+    // 1, or NULL on OOM. It is released by hand at the end of the test,
+    // once into_owned has disarmed the guard.
+    let hw_ptr = unsafe { av_buffer_alloc(64) };
+    assert!(!hw_ptr.is_null(), "av_buffer_alloc(64) returned NULL");
+    let cb_ptr = Box::into_raw(Box::new(CallbackState {
+      wanted: AVPixelFormat::AV_PIX_FMT_NONE,
+      wanted_int: AVPixelFormat::AV_PIX_FMT_NONE as i32,
+    }));
+
+    let guard = PartialBuildState {
+      hw_device_ref: hw_ptr,
+      callback_state: cb_ptr,
+    };
+    let (hw_back, cb_back) = guard.into_owned();
+    assert_eq!(
+      cb_back, cb_ptr,
+      "into_owned must return the original callback box"
+    );
+    assert_eq!(
+      hw_back, hw_ptr,
+      "into_owned must return the original device ref"
+    );
+
+    // The guard is disarmed at this point (its Drop ran with null pointers
+    // as soon as into_owned consumed it), so both pointers are ours to free.
+    // SAFETY: hw_back and cb_back are still the freshly-allocated values.
+    unsafe {
+      let mut owned_ref = hw_back;
+      av_buffer_unref(&mut owned_ref);
+      drop(Box::from_raw(cb_back));
+    }
+  }
+
+  /// The probe-abandon paths in `send_packet` (cap exceeded, packet clone
+  /// failed) must leave frames already queued in `pending_frames` alone.
+  /// Those frames belong to the currently active backend — possibly a
+  /// candidate that `advance_probe` committed earlier in the very same
+  /// `send_packet` call — and are valid output the caller will dequeue
+  /// via `receive_frame`.
+  ///
+  /// Pre-fix, both abandon branches called `pending_frames.clear()`
+  /// next to `self.probe = None;`, silently dropping initial frames on
+  /// exactly the cap-overflow / OOM-stress paths.
+  ///
+  /// Live HW required: only a real `VideoDecoder` yields a valid
+  /// `DecoderState` (its `Drop` invokes FFmpeg cleanup), and
+  /// `send_packet` has to reach the Ok branch on a real decoder for the
+  /// cap check to fire.
+  #[test]
+  #[ignore = "requires HWDECODE_SAMPLE_VIDEO and a working hardware backend"]
+  fn cap_overflow_preserves_pending_frames_from_active_backend() {
+    use ffmpeg_next::{format, media};
+
+    let path = std::env::var_os("HWDECODE_SAMPLE_VIDEO")
+      .expect("HWDECODE_SAMPLE_VIDEO must be set for this test");
+
+    ffmpeg_next::init().expect("ffmpeg init");
+    let mut input = format::input(&path).expect("open input");
+    // Look the best video stream up once and keep only the values derived
+    // from it, so that the shared borrow of `input` has ended by the time
+    // packets are read below.
+    let (stream_index, stream_params) = {
+      let video = input
+        .streams()
+        .best(media::Type::Video)
+        .expect("video stream");
+      (video.index(), video.parameters())
+    };
+
+    let mut decoder = VideoDecoder::open(stream_params).expect("open decoder");
+    assert!(
+      decoder.probe.is_some(),
+      "probe must be active immediately after open"
+    );
+
+    // Plant sentinel frames, as though `advance_probe` had drained them
+    // from a freshly-committed candidate during this same send_packet call.
+    decoder.pending_frames.push_back(frame::Video::empty());
+    decoder.pending_frames.push_back(frame::Video::empty());
+    let pending_before = decoder.pending_frames.len();
+
+    // Push the probe state straight to the byte cap, so that the next
+    // successful send_packet trips the cap-overflow branch.
+    decoder
+      .probe
+      .as_mut()
+      .expect("probe present")
+      .buffered_bytes = MAX_PROBE_PACKET_BYTES;
+
+    // Feed video packets until one is accepted. Whether the underlying
+    // decoder takes it cleanly is irrelevant; the Ok branch's cap-overflow
+    // accounting only has to run at least once.
+    let mut accepted = false;
+    let video_packets = input
+      .packets()
+      .filter(|(stream, _)| stream.index() == stream_index);
+    for (_, packet) in video_packets {
+      if decoder.send_packet(&packet).is_ok() {
+        accepted = true;
+        break;
+      }
+    }
+    assert!(
+      accepted,
+      "expected at least one send_packet to succeed and trigger the cap-overflow branch"
+    );
+
+    assert!(
+      decoder.probe.is_none(),
+      "probe must be abandoned after cap overflow"
+    );
+    assert_eq!(
+      decoder.pending_frames.len(),
+      pending_before,
+      "pending_frames belong to the active backend; abandon must not drop them"
+    );
+  }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..955d215
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,43 @@
+use crate::backend::Backend;
+
+/// Crate-wide result alias: `Ok(T)` on success, [`Error`] on failure.
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Errors returned from [`crate::VideoDecoder`].
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+  /// An underlying FFmpeg error; `#[from]` lets `?` convert it directly.
+  #[error("ffmpeg error: {0}")]
+  Ffmpeg(#[from] ffmpeg_next::Error),
+
+  /// `avcodec_find_decoder` returned null for the input codec id. The id
+  /// is reported as the raw integer (`AVCodecID` discriminant) — we do not
+  /// construct the bindgen `AVCodecID` enum from a runtime value, since
+  /// values outside our build's discriminant set would invoke UB.
+  #[error("no decoder for codec id {0}")]
+  NoCodec(u32),
+
+  /// The codec does not advertise a hardware configuration matching the
+  /// requested backend (via `avcodec_get_hw_config`).
+  #[error("codec does not support backend {0:?}")]
+  BackendUnsupportedByCodec(Backend),
+
+  /// `av_hwdevice_ctx_create` failed for the requested backend.
+  #[error("hardware device init failed for {backend:?}: {source}")]
+  HwDeviceInitFailed {
+    /// Backend that failed to initialise.
+    backend: Backend,
+    /// Underlying FFmpeg error; interpolated as `{source}` in the message.
+    source: ffmpeg_next::Error,
+  },
+
+  /// Auto-probe exhausted every backend in the platform's order. Empty
+  /// `attempts` means the platform has no hardware backends listed in
+  /// [`crate::Backend`] for the current `target_os` — callers must
+  /// fall back to a software decoder of their choice.
+  #[error("all hardware backends failed; attempts: {attempts:?}")]
+  AllBackendsFailed {
+    /// Per-backend errors in the order tried; boxed as `Error` is recursive.
+    attempts: Vec<(Backend, Box<Error>)>,
+  },
+}
diff --git a/src/ffi.rs b/src/ffi.rs
new file mode 100644
index 0000000..04aa50f
--- /dev/null
+++ b/src/ffi.rs
@@ -0,0 +1,272 @@
+//! FFI shims used by the decoder. Kept in one place so the unsafe surface is
+//! easy to audit.
+//!
+//! All reads of `AVPixelFormat` / `AVHWDeviceType` values returned by FFmpeg
+//! at runtime go through `ptr::read::<i32>` after a pointer cast, never
+//! through the bindgen-generated Rust enum. The enums are 32-bit `#[repr]`
+//! types, and constructing one from a value not in the listed discriminants
+//! is undefined behavior — exactly what header/library skew can create.
+//! See the doc comments on individual functions for what is read as raw
+//! integer vs. constructed from a known constant.
+
+use std::ptr;
+
+use ffmpeg_next::ffi::{
+  avcodec_get_hw_config, AVCodec, AVCodecContext, AVHWDeviceType, AVPixelFormat,
+  AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX,
+};
+
+/// State pointed to by `AVCodecContext::opaque` so [`get_hw_format`] can pick
+/// the correct hardware pixel format without globals. One instance per
+/// decoder; freed by [`crate::VideoDecoder`] after the codec context is
+/// dropped.
+///
+/// `wanted` is set from a hardcoded `AVPixelFormat` constant in our bindings
+/// (via `Backend::hw_pixel_format`), so it is always a valid enum value. We
+/// also store its raw `i32` so the callback can compare against the offered
+/// list without going through enum reads.
+#[repr(C)]
+pub(crate) struct CallbackState {
+  /// Hardware pixel format we want the decoder to produce. Constructed
+  /// from a known constant; safe to use as the callback's return value.
+  pub(crate) wanted: AVPixelFormat,
+  /// Must equal `wanted as i32`; cached so the callback's `pix_fmts`
+  /// walk compares raw integers without converting per iteration.
+  pub(crate) wanted_int: i32,
+}
+
+/// `get_format` hook installed on `AVCodecContext`. FFmpeg calls it with the
+/// `AV_PIX_FMT_NONE`-terminated list of pixel formats the codec can emit
+/// for the current stream.
+///
+/// The list is traversed through a `*const i32` view of `pix_fmts`, so no
+/// bindgen `AVPixelFormat` is ever materialized from a value our build may
+/// not know. Only two values can be returned — the stored `wanted` format
+/// or `AV_PIX_FMT_NONE` — and both originate from known constants.
+///
+/// # Safety
+/// `ctx` must point to a live `AVCodecContext` and `pix_fmts` to a list
+/// terminated by `AV_PIX_FMT_NONE`.
+pub(crate) unsafe extern "C" fn get_hw_format(
+  ctx: *mut AVCodecContext,
+  pix_fmts: *const AVPixelFormat,
+) -> AVPixelFormat {
+  debug_assert!(!ctx.is_null());
+  debug_assert!(!pix_fmts.is_null());
+
+  let none_int = AVPixelFormat::AV_PIX_FMT_NONE as i32;
+  // SAFETY: `try_open` stores a valid `Box<CallbackState>` pointer in
+  // `opaque`, and that allocation is only released after the codec
+  // context has been dropped, so a non-null value is dereferenceable here.
+  let state = unsafe { ((*ctx).opaque as *const CallbackState).as_ref() };
+  // A null `opaque` is handled strictly: the target becomes the sentinel
+  // itself, so a stray invocation can never select any offered format.
+  let (wanted, wanted_int) = match state {
+    Some(s) => (s.wanted, s.wanted_int),
+    None => (AVPixelFormat::AV_PIX_FMT_NONE, none_int),
+  };
+
+  // `AVPixelFormat` is `#[repr(i32)]`, so it shares size and alignment
+  // with `i32` and the cast below is sound. An `i32` load is defined for
+  // every bit pattern FFmpeg might have written.
+  let mut cursor = pix_fmts as *const i32;
+  loop {
+    // SAFETY: FFmpeg terminates the list with AV_PIX_FMT_NONE and we stop
+    // at that sentinel, so every load up to and including it is in-bounds.
+    let offered = unsafe { ptr::read(cursor) };
+    match offered {
+      v if v == none_int => return AVPixelFormat::AV_PIX_FMT_NONE,
+      v if v == wanted_int => return wanted,
+      // SAFETY: the sentinel is not reached yet; the next slot is in-list.
+      _ => cursor = unsafe { cursor.add(1) },
+    }
+  }
+}
+
+/// Walk the codec's `AVCodecHWConfig` table and return whether the codec
+/// advertises support for `device_type` **with** `wanted_pix_fmt` via the
+/// `HW_DEVICE_CTX` setup method. A null `codec` yields `false`.
+///
+/// FFmpeg's HW config table is keyed per (device_type, pix_fmt) pair: a
+/// codec can advertise the same device with several different hardware
+/// pixel formats (e.g. VAAPI codecs that offer both `AV_PIX_FMT_VAAPI`
+/// and `AV_PIX_FMT_DRM_PRIME`). Matching only on `device_type` would let
+/// us proceed to install a strict `get_format` callback for a format the
+/// codec never advertises, and the failure would surface deep inside the
+/// probe / decode path instead of up front. Requiring the codec to
+/// advertise the **exact** pix_fmt our `Backend` uses keeps the strict
+/// `get_format` honest and gives `open_with` a clean rejection signal.
+///
+/// All reads from the FFmpeg-supplied `AVCodecHWConfig` are performed as
+/// raw integers via `addr_of!` + `ptr::read::<i32>` to avoid copying or
+/// interpreting enum-typed fields whose runtime values might not match
+/// our build's discriminant set.
+pub(crate) fn codec_supports_hwaccel(
+  codec: *const AVCodec,
+  device_type: AVHWDeviceType,
+  wanted_pix_fmt: i32,
+) -> bool {
+  debug_assert!(!codec.is_null());
+  // Safe fn: never pass null to FFmpeg once the assert is compiled out.
+  if codec.is_null() {
+    return false;
+  }
+  let device_type_int = device_type as i32;
+  let mut i = 0;
+  loop {
+    // SAFETY: `codec` is non-null (checked above); `avcodec_get_hw_config`
+    // returns null past the end of the table, which ends the walk.
+    let cfg = unsafe { avcodec_get_hw_config(codec, i) };
+    if cfg.is_null() {
+      return false;
+    }
+    // SAFETY: `cfg` is non-null and points to a valid `AVCodecHWConfig`.
+    // Each field is read as a raw `i32` via `addr_of!`, never as an enum:
+    // `methods` is `c_int`, `pix_fmt` is `#[repr(i32)]`, and `device_type`
+    // is `#[repr(u32)]` but i32-sized with values that fit in i32.
+    let methods: i32 = unsafe { ptr::read(ptr::addr_of!((*cfg).methods)) };
+    let cfg_device_type_int: i32 =
+      unsafe { ptr::read(ptr::addr_of!((*cfg).device_type) as *const i32) };
+    let cfg_pix_fmt_int: i32 = unsafe { ptr::read(ptr::addr_of!((*cfg).pix_fmt) as *const i32) };
+    if methods & (AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX as i32) != 0
+      && cfg_device_type_int == device_type_int
+      && cfg_pix_fmt_int == wanted_pix_fmt
+    {
+      return true;
+    }
+    i += 1;
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  // The callback derefs `(*ctx).opaque`, so we need a real-looking
+  // AVCodecContext. We construct a zeroed one (the callback only reads opaque).
+  struct FakeCtx(*mut AVCodecContext);
+  impl FakeCtx {
+    fn new(state: *mut CallbackState) -> Self {
+      let boxed: Box<AVCodecContext> = unsafe { Box::new(std::mem::zeroed()) };
+      let raw = Box::into_raw(boxed);
+      unsafe { (*raw).opaque = state.cast() };
+      Self(raw)
+    }
+  }
+  impl Drop for FakeCtx {
+    fn drop(&mut self) {
+      unsafe { drop(Box::from_raw(self.0)) };
+    }
+  }
+
+  fn make_state(wanted: AVPixelFormat) -> CallbackState {
+    CallbackState {
+      wanted,
+      wanted_int: wanted as i32,
+    }
+  }
+
+  fn run(state: &CallbackState, mut offered: Vec<i32>) -> AVPixelFormat {
+    // Build the offered list as raw i32, terminated by AV_PIX_FMT_NONE.
+    offered.push(AVPixelFormat::AV_PIX_FMT_NONE as i32);
+    let ctx = FakeCtx::new(state as *const _ as *mut _);
+    // SAFETY: we cast the i32 buffer pointer to *const AVPixelFormat
+    // because that's the function's declared signature. The callback only
+    // ever reads through *const i32 internally, so this transit through
+    // *const AVPixelFormat is purely a type system formality.
+    unsafe { get_hw_format(ctx.0, offered.as_ptr() as *const AVPixelFormat) }
+  }
+
+  #[test]
+  fn returns_wanted_hw_format_when_offered() {
+    let state = make_state(AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
+    let got = run(
+      &state,
+      vec![
+        AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX as i32,
+        AVPixelFormat::AV_PIX_FMT_NV12 as i32,
+      ],
+    );
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
+  }
+
+  #[test]
+  fn returns_none_when_wanted_absent() {
+    let state = make_state(AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
+    let got = run(
+      &state,
+      vec![
+        AVPixelFormat::AV_PIX_FMT_NV12 as i32,
+        AVPixelFormat::AV_PIX_FMT_YUV420P as i32,
+      ],
+    );
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
+  }
+
+  #[test]
+  fn null_opaque_is_treated_as_strict() {
+    // Reuse `FakeCtx` so the context is released by `Drop` even when the
+    // assertion below panics, instead of hand-rolling alloc/free here.
+    let ctx = FakeCtx::new(ptr::null_mut());
+    let offered = [
+      AVPixelFormat::AV_PIX_FMT_NV12 as i32,
+      AVPixelFormat::AV_PIX_FMT_NONE as i32,
+    ];
+    // SAFETY: `ctx.0` is a live context; `offered` ends with the sentinel.
+    let got = unsafe { get_hw_format(ctx.0, offered.as_ptr() as *const AVPixelFormat) };
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
+  }
+
+  #[test]
+  fn unknown_offered_value_is_skipped_without_ub() {
+    // Simulate a header-skewed FFmpeg that offers a pixel-format value we
+    // don't have a binding constant for (e.g. some future format). The
+    // callback walks the list as i32 — no enum is constructed from that
+    // value, so this read is sound.
+    let state = make_state(AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX);
+    let got = run(
+      &state,
+      vec![
+        99_999_i32, // imaginary unknown
+        AVPixelFormat::AV_PIX_FMT_NV12 as i32,
+      ],
+    );
+    assert_eq!(got, AVPixelFormat::AV_PIX_FMT_NONE);
+  }
+
+  /// `codec_supports_hwaccel` must reject a (device_type, pix_fmt) pair
+  /// that the codec does not advertise — even if the device alone is
+  /// listed. Without this check, the strict `get_format` callback would
+  /// be wired up for a HW pix_fmt the codec never offers and the failure
+  /// would surface deep inside the probe / decode path instead of at
+  /// `open_with` / probe-build time.
+  ///
+  /// macOS-only: the test relies on FFmpeg's H.264 decoder advertising
+  /// `(AV_HWDEVICE_TYPE_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX)`, which is
+  /// only present in builds with VideoToolbox compiled in.
+  #[cfg(target_os = "macos")]
+  #[test]
+  fn codec_supports_hwaccel_requires_matching_pix_fmt() {
+    use ffmpeg_next::ffi::{avcodec_find_decoder, AVCodecID, AVHWDeviceType, AVPixelFormat};
+
+    // SAFETY: AV_CODEC_ID_H264 is a known constant in our build's
+    // `AVCodecID` discriminant set; constructing it does not invoke the
+    // bindgen-enum UB we worry about for runtime-derived ids.
+    let codec_ptr = unsafe { avcodec_find_decoder(AVCodecID::AV_CODEC_ID_H264) };
+    assert!(!codec_ptr.is_null(), "H.264 decoder must be present");
+
+    let device = AVHWDeviceType::AV_HWDEVICE_TYPE_VIDEOTOOLBOX;
+    let videotoolbox = AVPixelFormat::AV_PIX_FMT_VIDEOTOOLBOX as i32;
+    let nv12 = AVPixelFormat::AV_PIX_FMT_NV12 as i32;
+
+    assert!(
+      codec_supports_hwaccel(codec_ptr, device, videotoolbox),
+      "VideoToolbox + AV_PIX_FMT_VIDEOTOOLBOX must be advertised by FFmpeg's H.264 decoder"
+    );
+    assert!(
+      !codec_supports_hwaccel(codec_ptr, device, nv12),
+      "VideoToolbox + AV_PIX_FMT_NV12 must NOT match the codec's HW config — \
+       the strict get_format would have no offered HW format to return"
+    );
+  }
+}
diff --git a/src/frame.rs b/src/frame.rs
new file mode 100644
index 0000000..4642184
--- /dev/null
+++ b/src/frame.rs
@@ -0,0 +1,630 @@
+//! CPU-side decoded video frame.
+//!
+//! Wraps `ffmpeg_next::frame::Video`. All accessors read from raw `AVFrame`
+//! fields (`format`, `linesize`, `data`, `width`, `height`, `pts`) directly
+//! and never go through ffmpeg-next's `Video::format()` / `plane_height()`
+//! / `plane_width()` / `data()` — those construct `AVPixelFormat` from the
+//! frame's raw `format` integer via `transmute`, which is undefined behavior
+//! when the value isn't in the build's bindgen-generated discriminant set
+//! (the exact failure mode this crate is designed to survive).
+//!
+//! Per-row sizes for [`Frame::row`] / [`Frame::rows`] are computed from
+//! hardcoded chroma-subsampling and bit-depth tables keyed on the safe
+//! `pix_fmt()` integer, covering only the formats `hwdecode` produces (the
+//! NV* and P0xx/P2xx/P4xx families after `av_hwframe_transfer_data`). For
+//! any other format, the row accessors return `None` rather than guessing
+//! at a slice length.
+//!
+//! Why per-row, not whole-plane: FFmpeg allocates each row at
+//! `linesize[plane]` ([`Frame::stride`]) bytes for SIMD alignment, but
+//! hardware transfer paths only initialize the first
+//! [`Frame::row_bytes`]`(plane)` of every row. Exposing a stride-inclusive
+//! `&[u8]` over an entire plane would let safe code observe those
+//! uninitialized padding bytes, violating `slice::from_raw_parts`'s contract.
+//! Per-row slices are tightly clipped to the visible byte width so the
+//! safe API never hands out an uninitialized byte. Callers that need a
+//! single base pointer (e.g. SIMD pixel converters keyed off stride) can
+//! reach for [`Frame::as_ptr`] and consume `stride * plane_h` bytes
+//! themselves under their own `unsafe` contract.
+//!
+//! Compare formats against integer constants in [`crate::pix_fmt`].
+
+use std::slice;
+
+use ffmpeg_next::frame;
+
+use crate::{
+  error::{Error, Result},
+  pix_fmt,
+};
+
+/// CPU-side decoded video frame produced by [`crate::VideoDecoder`].
+pub struct Frame {
+  inner: frame::Video, // wrapped ffmpeg-next frame; private to this module
+}
+
+impl Frame {
+  /// Construct an empty frame, suitable as the destination passed to
+  /// [`crate::VideoDecoder::receive_frame`].
+  ///
+  /// Returns `Err(Error::Ffmpeg(Other { errno: ENOMEM }))` when the
+  /// underlying `av_frame_alloc()` returns NULL — `ffmpeg_next` does not
+  /// surface that failure, so we check it here rather than letting a null
+  /// pointer flow into the safe accessors and become UB on first read.
+  pub fn empty() -> Result<Self> {
+    let inner = frame::Video::empty();
+    // SAFETY: as_ptr() only returns the pointer; we inspect it for null.
+    if unsafe { inner.as_ptr() }.is_null() {
+      return Err(Error::Ffmpeg(ffmpeg_next::Error::Other {
+        errno: libc::ENOMEM,
+      }));
+    }
+    Ok(Self { inner })
+  }
+
+  /// Width in pixels.
+  pub fn width(&self) -> u32 {
+    // SAFETY: AVFrame.width is c_int; safe to read regardless of value.
+    unsafe { (*self.inner.as_ptr()).width as u32 }
+  }
+
+  /// Height in pixels.
+  pub fn height(&self) -> u32 {
+    // SAFETY: AVFrame.height is c_int.
+    unsafe { (*self.inner.as_ptr()).height as u32 }
+  }
+
+  /// Pixel format, returned as the raw `i32` value FFmpeg wrote to
+  /// `AVFrame.format`. Sound regardless of the linked FFmpeg version —
+  /// no `AVPixelFormat` enum is constructed.
+  ///
+  /// Compare against constants in [`crate::pix_fmt`].
+  pub fn pix_fmt(&self) -> i32 {
+    // SAFETY: AVFrame.format is bound as c_int.
+    unsafe { (*self.inner.as_ptr()).format }
+  }
+
+  /// Presentation timestamp in stream time base, or `None` for
+  /// `AV_NOPTS_VALUE`.
+  pub fn pts(&self) -> Option<i64> {
+    // ffmpeg-next's Frame::pts performs no enum conversion; safe to use.
+    self.inner.pts()
+  }
+
+  /// Number of populated planes (1 for packed formats, 2 for NV12/P010,
+  /// 3 for planar YUV, etc.). Computed by scanning `linesize` for the
+  /// first zero entry — no enum reads. When no entry is zero, the full
+  /// array length is returned.
+  pub fn planes(&self) -> usize {
+    // SAFETY: AVFrame.linesize is `[c_int; 8]`; reads are sound. The
+    // array is copied out so the scan itself runs in safe code.
+    let linesize = unsafe { (*self.inner.as_ptr()).linesize };
+    // Index of the first zero entry; falls back to the array length
+    // when every slot is populated.
+    linesize
+      .iter()
+      .position(|&ls| ls == 0)
+      .unwrap_or(linesize.len())
+  }
+
+  /// Bytes per row for `plane`. Reads `AVFrame.linesize[plane]` directly.
+  /// Panics if `plane >= planes()` or the linesize is non-positive (FFmpeg
+  /// allows negative linesize for vertically-flipped formats; this crate
+  /// does not surface those — call [`Self::as_ptr`] first to test safely).
+  pub fn stride(&self, plane: usize) -> usize {
+    let n = self.planes();
+    assert!(
+      plane < n,
+      "stride: plane {plane} out of bounds (planes={n})"
+    );
+    // SAFETY: bounds-checked above; linesize is `[c_int; 8]`.
+    let linesize: i32 = unsafe { (*self.inner.as_ptr()).linesize[plane] };
+    assert!(
+      linesize > 0,
+      "stride: non-positive linesize {linesize} for plane {plane} \
+       (negative linesize means vertically-flipped — not supported)"
+    );
+    linesize as usize
+  }
+
+  /// Visible byte width of `plane` — the number of initialized bytes at
+  /// the start of every row in that plane.
+  ///
+  /// Distinct from [`Self::stride`], which returns the FFmpeg `linesize`.
+  /// `linesize` is `>= row_bytes` and may include trailing alignment
+  /// padding bytes that FFmpeg's hardware transfer paths do not
+  /// initialize. `row_bytes` is what `slice::from_raw_parts` can safely
+  /// see.
+  ///
+  /// Returns `None` when the format is not in the supported HW-output set
+  /// (see crate `pix_fmt`) or the plane is out of range.
+  pub fn row_bytes(&self, plane: usize) -> Option<usize> {
+    if plane >= self.planes() {
+      return None;
+    }
+    plane_row_bytes_for(self.pix_fmt(), plane, self.width() as usize)
+  }
+
+  /// Pixel data for one row of `plane`, tightly clipped to the visible
+  /// byte width ([`Self::row_bytes`]).
+  ///
+  /// Excludes the trailing alignment padding that [`Self::stride`]
+  /// includes — those bytes are not guaranteed to be initialized by
+  /// FFmpeg's hardware transfer paths and must not be exposed through a
+  /// safe `&[u8]`.
+  ///
+  /// Returns `None` for any of the following — never panics:
+  /// - The frame's pixel format is not one of the supported hardware-
+  ///   output formats listed in [`crate::pix_fmt`].
+  /// - The plane index is out of range.
+  /// - `y` is past the plane's row count.
+  /// - `AVFrame.linesize[plane]` is `<= 0` or `AVFrame.height` is `<= 0`.
+  /// - The plane's data pointer is null, or `row_bytes` exceeds the stride.
+  /// - The plane size would overflow `isize::MAX`.
+  pub fn row(&self, plane: usize, y: usize) -> Option<&[u8]> {
+    let info = self.plane_info(plane)?;
+    if y >= info.plane_h {
+      return None;
+    }
+    // y < plane_h and plane_h * stride ≤ isize::MAX (verified in plane_info),
+    // so y * stride is bounded by (plane_h - 1) * stride ≤ isize::MAX.
+    let offset = y * info.stride;
+    // SAFETY:
+    // - `info.plane_ptr` is non-null (verified in plane_info).
+    // - `offset + row_bytes ≤ plane_h * stride`, which is the size of the
+    //   FFmpeg allocation for this plane.
+    // - Bytes 0..row_bytes of every row are written by FFmpeg's HW
+    //   transfer; the slice is fully initialized.
+    // - `row_bytes ≤ stride ≤ isize::MAX` per plane_info.
+    unsafe {
+      let row_ptr = info.plane_ptr.add(offset);
+      Some(slice::from_raw_parts(row_ptr, info.row_bytes))
+    }
+  }
+
+  /// Iterator over every row of `plane`. Each yielded slice has length
+  /// [`Self::row_bytes`]`(plane)` — never includes the trailing alignment
+  /// padding that lives within [`Self::stride`].
+  ///
+  /// Returns `None` under the same conditions as [`Self::row`].
+  pub fn rows(&self, plane: usize) -> Option<impl Iterator<Item = &[u8]> + '_> {
+    let info = self.plane_info(plane)?;
+    Some((0..info.plane_h).map(move |y| {
+      // Same bounds argument as `row()`.
+      let offset = y * info.stride;
+      // SAFETY: see `row()` — the same invariants hold here, and the
+      // iterator's lifetime is tied to `&self` so the pointer remains
+      // valid for every yielded slice.
+      unsafe { slice::from_raw_parts(info.plane_ptr.add(offset), info.row_bytes) }
+    }))
+  }
+
+  /// Raw base pointer to `plane`'s allocation, or `None` if the plane
+  /// fails the same layout validation [`Self::row`] applies.
+  ///
+  /// Returns `None` whenever any of the following is true:
+  /// - The plane index is out of range (`plane >= planes()`).
+  /// - The frame's pixel format is not in the supported HW-output set.
+  /// - `linesize[plane] <= 0`. **In particular, FFmpeg permits negative
+  ///   linesizes for vertically-flipped frames with `data[n]` pointing
+  ///   at the *end* of the image. Returning that pointer with the
+  ///   advertised "valid for `stride * plane_h` bytes forward" contract
+  ///   would let a downstream converter walk past the buffer.** This
+  ///   accessor refuses the layout instead of handing back a pointer the
+  ///   caller cannot safely interpret as forward-addressable.
+  /// - `height <= 0`, the data pointer is null, `row_bytes > stride`, or
+  ///   the total plane size would overflow `isize::MAX`.
+  ///
+  /// On `Some(ptr)` the pointer is valid for
+  /// `stride(plane) * plane_height` *forward-addressable* bytes, and
+  /// only the first [`Self::row_bytes`]`(plane)` bytes of each row are
+  /// guaranteed to be initialized. The trailing per-row alignment padding
+  /// is uninitialized; callers performing wide SIMD loads that read past
+  /// `row_bytes` must mask the result and never surface those bytes
+  /// through a safe `&[u8]`.
+  ///
+  /// This accessor exists for downstream pixel-format converters
+  /// (`colconv`) that work in `(ptr, stride, width, height)` quadruples;
+  /// safe code should prefer [`Self::row`] / [`Self::rows`].
+  pub fn as_ptr(&self, plane: usize) -> Option<*const u8> {
+    // Share the full plane-layout validation so the unsafe escape hatch
+    // never escapes a layout that `row()` / `rows()` reject. Returning a
+    // pointer for a negative-stride frame (FFmpeg's vertical-flip
+    // convention, where `data[n]` points at the *end* of the image)
+    // would invite forward-walking out-of-bounds reads from a caller
+    // that trusts the documented "valid for stride × plane_h bytes"
+    // contract.
+    self.plane_info(plane).map(|info| info.plane_ptr)
+  }
+
+  /// Read every per-plane field needed by the row accessors with the
+  /// safety preconditions enforced once.
+  fn plane_info(&self, plane: usize) -> Option<PlaneInfo> {
+    if plane >= self.planes() {
+      return None;
+    }
+    // SAFETY: bounds-checked plane index; linesize/height/data are raw
+    // c_int / pointer reads that cannot themselves be UB.
+    let (stride_int, height_int, plane_ptr) = unsafe {
+      let raw = self.inner.as_ptr();
+      ((*raw).linesize[plane], (*raw).height, (*raw).data[plane])
+    };
+    if stride_int <= 0 || height_int <= 0 || plane_ptr.is_null() {
+      return None;
+    }
+    let stride = stride_int as usize;
+    let plane_h = plane_height_for(self.pix_fmt(), plane, height_int as usize)?;
+    let row_bytes = plane_row_bytes_for(self.pix_fmt(), plane, self.width() as usize)?;
+    if row_bytes > stride {
+      return None;
+    }
+    // Bound the entire plane allocation to isize::MAX so any byte offset
+    // computed as `y * stride` (y < plane_h) stays representable, satisfying
+    // the safety contract of `pointer::add` and `slice::from_raw_parts`.
+    let plane_size = stride.checked_mul(plane_h)?;
+    if plane_size > isize::MAX as usize {
+      return None;
+    }
+    Some(PlaneInfo {
+      plane_ptr,
+      stride,
+      plane_h,
+      row_bytes,
+    })
+  }
+
+  /// Crate-internal: hand the wrapped frame to FFmpeg / our decoder code.
+  pub(crate) fn as_inner_mut(&mut self) -> &mut frame::Video {
+    &mut self.inner
+  }
+}
+
+#[derive(Clone, Copy)]
+struct PlaneInfo {
+  plane_ptr: *const u8, // non-null base pointer of the plane
+  stride: usize,        // `linesize[plane]`, validated as > 0
+  plane_h: usize,       // row count of this plane
+  row_bytes: usize,     // visible bytes per row; <= `stride`
+}
+
+// `Default` intentionally omitted: constructing a frame can fail (OOM
+// in `av_frame_alloc`), and a panicking `default()` would defeat the
+// safety stance of [`Frame::empty`]. Use `Frame::empty()?` directly.
+
+/// Visible byte width of `plane`'s rows for a frame of `frame_width` and
+/// the given pixel format. `None` for formats not in the supported HW-
+/// output set, for planes other than 0 / 1, or on `usize` overflow.
+///
+/// Distinct from `linesize` (FFmpeg's per-row stride, which may include
+/// alignment padding). HW transfer paths only initialize bytes
+/// `0..plane_row_bytes_for(...)` of each row; everything from there to
+/// `stride` is uninitialized padding and must not be exposed via
+/// `slice::from_raw_parts`.
+fn plane_row_bytes_for(pix_fmt_int: i32, plane: usize, frame_width: usize) -> Option<usize> {
+  match pix_fmt_int {
+    // 8-bit semi-planar 4:2:0 / 4:2:2: Y at full width (1 byte/sample);
+    // UV interleaved at horizontally-subsampled chroma with `ceil(W/2)`
+    // U+V pairs at 2 bytes per pair. For even W the chroma row equals
+    // `W` bytes (the simple case); for odd W it must round *up* to the
+    // next even byte so the trailing chroma sample is not silently
+    // dropped on width = 2k+1 frames.
+    pix_fmt::NV12 | pix_fmt::NV21 | pix_fmt::NV16 => match plane {
+      0 => Some(frame_width),
+      1 => frame_width.div_ceil(2).checked_mul(2),
+      _ => None,
+    },
+    // 8-bit 4:4:4 semi-planar: chroma at full horizontal resolution,
+    // 2 bytes per pixel (1 byte U + 1 byte V) — no rounding required.
+    pix_fmt::NV24 => match plane {
+      0 => Some(frame_width),
+      1 => frame_width.checked_mul(2),
+      _ => None,
+    },
+    // 10/12/16-bit semi-planar 4:2:0 / 4:2:2: Y is 2 bytes/sample
+    // (high-bit-depth packed in 16-bit). UV interleaved at horizontally-
+    // subsampled chroma with `ceil(W/2)` U+V pairs at 4 bytes per pair
+    // (2 bytes U + 2 bytes V). Same odd-width rounding as the 8-bit
+    // chroma path, scaled by 2 bytes per sample.
+    pix_fmt::P010LE
+    | pix_fmt::P010BE
+    | pix_fmt::P012LE
+    | pix_fmt::P016LE
+    | pix_fmt::P210LE
+    | pix_fmt::P212LE
+    | pix_fmt::P216LE => match plane {
+      0 => frame_width.checked_mul(2),
+      1 => frame_width.div_ceil(2).checked_mul(4),
+      _ => None,
+    },
+    // 10/12/16-bit 4:4:4 semi-planar: Y is 2 bytes/sample; UV at full
+    // horizontal resolution with 4 bytes per pixel (2 bytes U + 2 bytes V).
+    pix_fmt::P410LE | pix_fmt::P412LE | pix_fmt::P416LE => match plane {
+      0 => frame_width.checked_mul(2),
+      1 => frame_width.checked_mul(4),
+      _ => None,
+    },
+    _ => None,
+  }
+}
+
+/// Row count of `plane` for a frame `frame_height` rows tall in the given
+/// pixel format, or `None` when the format is outside the supported
+/// HW-output set or the plane index is not 0 or 1.
+///
+/// Crate-internal so the decoder's probe-replay accountant can size frames
+/// without duplicating the chroma-subsampling table.
+pub(crate) fn plane_height_for(
+  pix_fmt_int: i32,
+  plane: usize,
+  frame_height: usize,
+) -> Option<usize> {
+  // Step 1: classify the format by whether chroma is vertically halved.
+  let chroma_halved = match pix_fmt_int {
+    // 4:2:0 semi-planar — chroma plane has half the rows (rounded up).
+    pix_fmt::NV12
+    | pix_fmt::NV21
+    | pix_fmt::P010LE
+    | pix_fmt::P010BE
+    | pix_fmt::P012LE
+    | pix_fmt::P016LE => true,
+    // 4:2:2 / 4:4:4 semi-planar — chroma plane keeps every row.
+    pix_fmt::NV16
+    | pix_fmt::NV24
+    | pix_fmt::P210LE
+    | pix_fmt::P212LE
+    | pix_fmt::P216LE
+    | pix_fmt::P410LE
+    | pix_fmt::P412LE
+    | pix_fmt::P416LE => false,
+    _ => return None,
+  };
+  // Step 2: plane 0 is luma (full height); plane 1 is interleaved chroma.
+  match (plane, chroma_halved) {
+    (0, _) | (1, false) => Some(frame_height),
+    (1, true) => Some(frame_height.div_ceil(2)),
+    _ => None,
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn empty_frame_has_zero_dimensions_and_no_pts() {
+    let f = Frame::empty().expect("alloc");
+    assert_eq!(f.width(), 0);
+    assert_eq!(f.height(), 0);
+    assert_eq!(f.pts(), None);
+    // AVFrame.format defaults to -1 (AV_PIX_FMT_NONE) for an empty frame.
+    assert_eq!(f.pix_fmt(), -1);
+    // No active planes for an empty frame (all linesize entries are 0).
+    assert_eq!(f.planes(), 0);
+  }
+
+  #[test]
+  fn row_returns_none_for_unknown_format() {
+    let f = Frame::empty().expect("alloc");
+    // pix_fmt is NONE (-1), not in the supported set.
+    assert!(f.row(0, 0).is_none());
+    assert!(f.rows(0).is_none());
+    assert!(f.row_bytes(0).is_none());
+  }
+
+  /// Synthesize a frame with a negative linesize (FFmpeg's vertical-flip
+  /// convention) and assert the row accessors refuse to construct a slice.
+  /// Without the linesize > 0 check, the negative `i32 as usize` would
+  /// produce a huge positive length and `from_raw_parts` would be UB.
+  ///
+  /// `as_ptr` shares the same validation — handing back the data pointer
+  /// for a negative-stride frame would let a downstream converter
+  /// following the "valid for stride × plane_h bytes forward" contract
+  /// walk past the buffer.
+  #[test]
+  fn row_returns_none_for_negative_linesize() {
+    let mut f = Frame::empty().expect("alloc");
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).format = pix_fmt::NV12;
+      (*raw).width = 1920;
+      (*raw).height = 1080;
+      (*raw).linesize[0] = -1920; // vertically-flipped
+      (*raw).linesize[1] = -1920;
+      // data pointers stay null; the accessors would also reject on null,
+      // but should bail earlier on the linesize sign.
+    }
+    assert!(f.row(0, 0).is_none());
+    assert!(f.row(1, 0).is_none());
+    assert!(f.rows(0).is_none());
+    assert!(
+      f.as_ptr(0).is_none(),
+      "as_ptr must share row()/rows() validation — a negative-stride \
+       frame must not leak a forward-readable plane pointer"
+    );
+    assert!(f.as_ptr(1).is_none());
+  }
+
+  #[test]
+  fn row_returns_none_for_non_positive_height() {
+    let mut f = Frame::empty().expect("alloc");
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).format = pix_fmt::NV12;
+      (*raw).width = 1920;
+      (*raw).height = 0;
+      (*raw).linesize[0] = 1920;
+      (*raw).linesize[1] = 1920;
+    }
+    assert!(f.row(0, 0).is_none());
+  }
+
+  /// Synthesize a frame backed by a manually-allocated buffer with stride
+  /// strictly larger than visible row bytes (the exact case where
+  /// FFmpeg's HW transfer leaves trailing padding uninitialized) and
+  /// confirm the safe row accessor returns slices clipped to the visible
+  /// width.
+  #[test]
+  fn row_clips_to_visible_width_not_stride() {
+    use std::alloc::{alloc, dealloc, Layout};
+    let width = 64usize;
+    let height = 4usize;
+    // Stride > width: 16 bytes of padding per row in the Y plane.
+    let stride = 80usize;
+    let plane_size = stride * height;
+    // Allocate ourselves so we can fully control initialization. Fill
+    // bytes 0..width with 0xAA per row (the "valid pixel" range) and
+    // bytes width..stride with 0xFF (the simulated alignment padding —
+    // FFmpeg would leave these uninitialized; we set them to a sentinel
+    // that the test can detect if the safe slice ever exposes them).
+    let layout = Layout::from_size_align(plane_size, 32).unwrap();
+    let buf = unsafe { alloc(layout) }; // SAFETY: layout size is 80 * 4, non-zero
+    assert!(!buf.is_null());
+    for y in 0..height {
+      let row = unsafe { buf.add(y * stride) }; // SAFETY: y < height, inside the plane
+      for x in 0..width {
+        unsafe { *row.add(x) = 0xAA }; // SAFETY: x < width < stride
+      }
+      for x in width..stride {
+        unsafe { *row.add(x) = 0xFF }; // SAFETY: x < stride
+      }
+    }
+
+    let mut f = Frame::empty().expect("alloc");
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).format = pix_fmt::NV12;
+      (*raw).width = width as i32;
+      (*raw).height = height as i32;
+      (*raw).linesize[0] = stride as i32;
+      // linesize[1] is left untouched (expected to be 0), which keeps
+      // planes() at 1 so the test never needs a second allocation.
+      (*raw).data[0] = buf;
+    }
+
+    assert_eq!(f.row_bytes(0), Some(width));
+    assert_eq!(f.stride(0), stride);
+    let row0 = f.row(0, 0).expect("row 0");
+    assert_eq!(
+      row0.len(),
+      width,
+      "safe row must be clipped to visible width"
+    );
+    assert!(
+      row0.iter().all(|&b| b == 0xAA),
+      "row must not include padding sentinel 0xFF"
+    );
+
+    let collected: Vec<&[u8]> = f.rows(0).expect("rows iterator").collect();
+    assert_eq!(collected.len(), height);
+    for r in &collected {
+      assert_eq!(r.len(), width);
+      assert!(r.iter().all(|&b| b == 0xAA));
+    }
+
+    // `as_ptr` accepts the valid layout and returns the same base pointer
+    // this test stored in `data[0]`, so SIMD callers can reach the plane
+    // through the documented unsafe contract.
+    assert_eq!(
+      f.as_ptr(0),
+      Some(buf as *const u8),
+      "as_ptr must surface the plane base for a valid forward-stride frame"
+    );
+
+    // Out-of-range row index returns None instead of panicking.
+    assert!(f.row(0, height).is_none());
+
+    // Detach before drop so AVFrame's free path never sees our allocation.
+    // NOTE(review): skipped if an assert above panics (buffer then leaks).
+    unsafe {
+      (*f.inner.as_mut_ptr()).data[0] = std::ptr::null_mut();
+      dealloc(buf, layout);
+    }
+  }
+
+  #[test]
+  #[should_panic(expected = "non-positive linesize")]
+  fn stride_panics_on_negative_linesize() {
+    let mut f = Frame::empty().expect("alloc");
+    unsafe {
+      let raw = f.inner.as_mut_ptr();
+      (*raw).linesize[0] = -1920; // negative stride on plane 0 only
+    }
+    let _ = f.stride(0); // must panic; the result is discarded on purpose
+  }
+
+  #[test]
+  fn frame_is_send() {
+    fn check<T: Send>() {} // instantiating this compiles only when T: Send
+    check::<Frame>(); // compile-time assertion; nothing to verify at runtime
+  }
+
+  #[test]
+  fn plane_height_table_covers_supported_formats() {
+    // Spot-check plane heights: 4:2:0 halves chroma rows, 4:2:2 / 4:4:4 do not.
+    assert_eq!(plane_height_for(pix_fmt::NV12, 0, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::NV12, 1, 1080), Some(540));
+    assert_eq!(plane_height_for(pix_fmt::NV12, 1, 1081), Some(541)); // odd height rounds up
+    assert_eq!(plane_height_for(pix_fmt::P010LE, 1, 1080), Some(540));
+    assert_eq!(plane_height_for(pix_fmt::NV16, 1, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::NV24, 1, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::P416LE, 1, 1080), Some(1080));
+    assert_eq!(plane_height_for(pix_fmt::NONE, 0, 1080), None); // unset format
+    assert_eq!(plane_height_for(pix_fmt::NV12, 2, 1080), None); // plane index out of range
+  }
+
+  /// 4:2:0 / 4:2:2 chroma planes carry `ceil(W/2)` U+V pairs per row.
+  /// For odd `W`, dropping the round-up silently truncates the last chroma
+  /// sample — and the safe row slice would expose a buffer one byte (8-bit)
+  /// or two bytes (high-bit-depth) shorter than the data FFmpeg actually
+  /// wrote. Y planes and 4:4:4 chroma planes are unaffected because their
+  /// row size in bytes is just a fixed multiple of `W`.
+  #[test]
+  fn plane_row_bytes_rounds_up_chroma_for_odd_widths() {
+    // 8-bit subsampled chroma — odd W gains one byte (the missing sample
+    // pair).
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 1, 1921), Some(1922));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV21, 1, 1921), Some(1922));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV16, 1, 1921), Some(1922));
+    // High-bit-depth subsampled chroma — odd W gains two bytes.
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010BE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P012LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P016LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P210LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P212LE, 1, 1921), Some(3844));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P216LE, 1, 1921), Some(3844));
+    // Y planes always at full width regardless of subsampling.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 0, 1921), Some(1921));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 0, 1921), Some(3842));
+    // 4:4:4 chroma is at full horizontal resolution — no rounding.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV24, 1, 1921), Some(3842));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P410LE, 1, 1921), Some(7684));
+    // Even widths must still match the original (pre-fix) values so the
+    // change is purely additive on the dominant code path.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 1, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 1, 1920), Some(3840));
+  }
+
+  #[test]
+  fn plane_row_bytes_table_covers_supported_formats() {
+    // 8-bit 4:2:0 / 4:2:2 — both planes at width (1920 is even).
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 0, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 1, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV21, 1, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV16, 1, 1920), Some(1920));
+    // 8-bit 4:4:4 — chroma plane is 2 * width.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV24, 0, 1920), Some(1920));
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV24, 1, 1920), Some(3840));
+    // 10/12/16-bit 4:2:0 / 4:2:2 — both planes at 2 * width (even width).
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 0, 1920), Some(3840));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P010LE, 1, 1920), Some(3840));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P210LE, 1, 1920), Some(3840));
+    // 10/12/16-bit 4:4:4 — Y is 2 * width, chroma is 4 * width.
+    assert_eq!(plane_row_bytes_for(pix_fmt::P410LE, 0, 1920), Some(3840));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P410LE, 1, 1920), Some(7680));
+    assert_eq!(plane_row_bytes_for(pix_fmt::P416LE, 1, 1920), Some(7680));
+    // Unset format, or a plane index the format does not have.
+    assert_eq!(plane_row_bytes_for(pix_fmt::NONE, 0, 1920), None);
+    assert_eq!(plane_row_bytes_for(pix_fmt::NV12, 2, 1920), None);
+  }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 0a58390..3654016 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,11 +1,35 @@
-//! A template for creating Rust open-source repo on GitHub
-#![cfg_attr(not(feature = "std"), no_std)]
+//! Cross-platform **hardware** video decoder built on top of `ffmpeg-next`.
+//!
+//! [`VideoDecoder`] mirrors the surface of `ffmpeg::decoder::Video`
+//! (`send_packet`/`receive_frame`/`send_eof`/`flush`) and auto-probes the
+//! host's hardware backends (VideoToolbox / VAAPI / NVDEC / D3D11VA).
+//! There is **no software fallback inside this crate** — if no hardware
+//! backend can decode the stream, [`Error::AllBackendsFailed`] surfaces
+//! either from [`VideoDecoder::open`] (when no backend even opens) or
+//! from [`VideoDecoder::receive_frame`] / [`VideoDecoder::send_packet`] /
+//! [`VideoDecoder::send_eof`] (when the initially-opened backend or any
+//! later candidate fails at decode time and the probe order is
+//! exhausted). On single-backend platforms (e.g. macOS, where the order
+//! is `[VideoToolbox]`), only the runtime path can return it. The
+//! caller picks how to fall back to a software decoder of their choice
+//! (e.g. by opening an `ffmpeg::decoder::Video` directly).
+//!
+//! Output frames returned by [`VideoDecoder::receive_frame`] are CPU-side
+//! and downloaded via `av_hwframe_transfer_data` (NV12 for 8-bit input,
+//! P010 for 10-bit). Pixel-format conversion is intentionally out of
+//! scope; downstream code handles that (e.g. via `colconv`).
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![cfg_attr(docsrs, allow(unused_attributes))]
 #![deny(missing_docs)]
 
-#[cfg(all(not(feature = "std"), feature = "alloc"))]
-extern crate alloc as std;
+mod backend;
+mod decoder;
+mod error;
+mod ffi;
+mod frame;
+pub mod pix_fmt;
 
-#[cfg(feature = "std")]
-extern crate std;
+pub use backend::Backend;
+pub use decoder::VideoDecoder;
+pub use error::{Error, Result};
+pub use frame::Frame;
diff --git a/src/pix_fmt.rs b/src/pix_fmt.rs
new file mode 100644
index 0000000..f3c594e
--- /dev/null
+++ b/src/pix_fmt.rs
@@ -0,0 +1,113 @@
+//! Stable `i32` constants for the pixel formats produced by `hwdecode`'s
+//! hardware decoders after `av_hwframe_transfer_data`.
+//!
+//! `Frame::pix_fmt()` returns the raw integer FFmpeg wrote to `AVFrame.format`
+//! (as a plain `i32` to avoid the enum-construction UB that an unvalidated
+//! cast would invoke). This module names the constants relevant to dispatch
+//! after a successful hardware decode.
+//!
+//! Because `hwdecode` is hardware-only, the formats listed here cover what
+//! the supported HW backends actually produce — the **NV** family (semi-
+//! planar 8-bit) and the **P0xx / P2xx / P4xx** family (semi-planar 10/12/16
+//! bit). VideoToolbox, VAAPI, NVDEC, and D3D11VA all download into one of
+//! these.
+//!
+//! Software-decoder output formats (`YUV420P`, `YUV422P`, `RGB24`, etc.) are
+//! intentionally **not** listed: callers handle software fallback outside
+//! this crate, and dispatch tables for those formats belong with the SW
+//! pipeline.
+//!
+//! For values not listed here, write `AVPixelFormat::AV_PIX_FMT_X as i32`
+//! directly — that's exactly the cast we use to define these constants.
+//!
+//! ```ignore
+//! use hwdecode::{pix_fmt, Frame};
+//! match frame.pix_fmt() {
+//!     pix_fmt::NV12   => /* 8-bit 4:2:0  → colconv::frame::Nv12Frame  */,
+//!     pix_fmt::P010LE => /* 10-bit 4:2:0 → colconv::frame::PnFrame<10> */,
+//!     other           => unimplemented!("pix_fmt {other}"),
+//! }
+//! ```
+
+use ffmpeg_next::ffi::AVPixelFormat;
+
+// --- semi-planar YUV (NV*) — 8-bit hardware download outputs ----------------
+
+/// 4:2:0, 8-bit, Y plane + interleaved Cb/Cr (`AV_PIX_FMT_NV12`). The
+/// dominant 8-bit HW download format on every supported backend.
+pub const NV12: i32 = AVPixelFormat::AV_PIX_FMT_NV12 as i32;
+/// 4:2:0, 8-bit, Y plane + interleaved Cr/Cb (`AV_PIX_FMT_NV21`).
+pub const NV21: i32 = AVPixelFormat::AV_PIX_FMT_NV21 as i32;
+/// 4:2:2, 8-bit, Y plane + interleaved Cb/Cr (`AV_PIX_FMT_NV16`).
+pub const NV16: i32 = AVPixelFormat::AV_PIX_FMT_NV16 as i32;
+/// 4:4:4, 8-bit, Y plane + interleaved Cb/Cr (`AV_PIX_FMT_NV24`).
+pub const NV24: i32 = AVPixelFormat::AV_PIX_FMT_NV24 as i32;
+
+// --- semi-planar YUV (P0xx) — 4:2:0 high-bit-depth HW downloads -------------
+
+/// 4:2:0, 10-bit, semi-planar little-endian (`AV_PIX_FMT_P010LE`). The
+/// dominant 10-bit HW download format.
+pub const P010LE: i32 = AVPixelFormat::AV_PIX_FMT_P010LE as i32;
+/// 4:2:0, 10-bit, semi-planar big-endian (`AV_PIX_FMT_P010BE`).
+pub const P010BE: i32 = AVPixelFormat::AV_PIX_FMT_P010BE as i32;
+/// 4:2:0, 12-bit, semi-planar little-endian (`AV_PIX_FMT_P012LE`).
+pub const P012LE: i32 = AVPixelFormat::AV_PIX_FMT_P012LE as i32;
+/// 4:2:0, 16-bit, semi-planar little-endian (`AV_PIX_FMT_P016LE`).
+pub const P016LE: i32 = AVPixelFormat::AV_PIX_FMT_P016LE as i32;
+
+// --- semi-planar YUV (P2xx) — 4:2:2 high-bit-depth HW downloads -------------
+
+/// 4:2:2, 10-bit, semi-planar little-endian (`AV_PIX_FMT_P210LE`).
+pub const P210LE: i32 = AVPixelFormat::AV_PIX_FMT_P210LE as i32;
+/// 4:2:2, 12-bit, semi-planar little-endian (`AV_PIX_FMT_P212LE`, FFmpeg 5.0+).
+pub const P212LE: i32 = AVPixelFormat::AV_PIX_FMT_P212LE as i32;
+/// 4:2:2, 16-bit, semi-planar little-endian (`AV_PIX_FMT_P216LE`).
+pub const P216LE: i32 = AVPixelFormat::AV_PIX_FMT_P216LE as i32;
+
+// --- semi-planar YUV (P4xx) — 4:4:4 high-bit-depth HW downloads -------------
+
+/// 4:4:4, 10-bit, semi-planar little-endian (`AV_PIX_FMT_P410LE`).
+pub const P410LE: i32 = AVPixelFormat::AV_PIX_FMT_P410LE as i32;
+/// 4:4:4, 12-bit, semi-planar little-endian (`AV_PIX_FMT_P412LE`, FFmpeg 5.0+).
+pub const P412LE: i32 = AVPixelFormat::AV_PIX_FMT_P412LE as i32;
+/// 4:4:4, 16-bit, semi-planar little-endian (`AV_PIX_FMT_P416LE`).
+pub const P416LE: i32 = AVPixelFormat::AV_PIX_FMT_P416LE as i32;
+
+// --- sentinel ---------------------------------------------------------------
+
+/// Sentinel value FFmpeg writes to `AVFrame.format` for an unset frame
+/// (`AV_PIX_FMT_NONE`). A frame from [`crate::Frame::empty`] reports this
+/// until it is filled by a decoder.
+pub const NONE: i32 = AVPixelFormat::AV_PIX_FMT_NONE as i32;
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  /// Sanity check. The first three asserts restate the constant definitions
+  /// (always true); only the `NONE == -1` assert pins a concrete ABI value.
+  #[test]
+  fn constants_match_bindings() {
+    assert_eq!(NV12, AVPixelFormat::AV_PIX_FMT_NV12 as i32);
+    assert_eq!(P010LE, AVPixelFormat::AV_PIX_FMT_P010LE as i32);
+    assert_eq!(P416LE, AVPixelFormat::AV_PIX_FMT_P416LE as i32);
+    assert_eq!(NONE, -1, "AV_PIX_FMT_NONE must be -1 (FFmpeg ABI sentinel)");
+  }
+
+  #[test]
+  fn match_dispatch_compiles() {
+    fn classify(v: i32) -> &'static str {
+      match v { // the constants must be usable directly as `match` patterns
+        NV12 => "nv12",
+        NV21 => "nv21",
+        P010LE => "p010le",
+        P210LE => "p210le",
+        P410LE => "p410le",
+        _ => "other",
+      }
+    }
+    assert_eq!(classify(NV12), "nv12");
+    assert_eq!(classify(P010LE), "p010le");
+    assert_eq!(classify(NONE), "other"); // NONE has no dedicated arm
+  }
+}
diff --git a/tests/decode.rs b/tests/decode.rs
new file mode 100644
index 0000000..2431ff1
--- /dev/null
+++ b/tests/decode.rs
@@ -0,0 +1,78 @@
+//! Integration test: open the auto-probed decoder against a real video file
+//! and decode up to 30 frames (at least one must decode). Skipped, with a
+//! message on stderr, when no sample is set or no hardware backend opens.
+//!
+//! Set `HWDECODE_SAMPLE_VIDEO` to an absolute path to enable.
+
+use ffmpeg::{format, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::{Frame, VideoDecoder};
+
+const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
+
+#[test]
+fn auto_open_decodes_at_least_one_frame() {
+  let Some(path) = std::env::var_os(SAMPLE_ENV) else {
+    eprintln!("skipping: set {SAMPLE_ENV} to a video file path to run this test");
+    return;
+  };
+
+  ffmpeg::init().expect("ffmpeg init");
+
+  let mut input = format::input(&path).expect("open input");
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .expect("video stream");
+  let stream_index = stream.index();
+  let expected_w = unsafe { (*stream.parameters().as_ptr()).width as u32 };
+  let expected_h = unsafe { (*stream.parameters().as_ptr()).height as u32 };
+
+  let mut decoder = match VideoDecoder::open(stream.parameters()) {
+    Ok(d) => d,
+    Err(hwdecode::Error::AllBackendsFailed { attempts }) => {
+      eprintln!(
+        "skipping: no hardware backend available ({} attempts)",
+        attempts.len()
+      );
+      return; // treated as a skip, not a failure
+    }
+    Err(e) => panic!("open decoder: {e}"),
+  };
+  eprintln!("optimistic backend = {:?}", decoder.backend());
+
+  assert_eq!(decoder.width(), expected_w);
+  assert_eq!(decoder.height(), expected_h);
+
+  let mut frame = Frame::empty().expect("alloc frame");
+  let mut count = 0_usize;
+  let target = 30_usize; // stop once this many frames have decoded
+
+  'outer: for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue; // packet belongs to another stream
+    }
+    decoder.send_packet(&packet).expect("send packet");
+    loop {
+      match decoder.receive_frame(&mut frame) {
+        Ok(()) => {
+          assert_eq!(frame.width(), expected_w);
+          assert_eq!(frame.height(), expected_h);
+          count += 1;
+          if count >= target {
+            break 'outer;
+          }
+        }
+        Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+          if errno == ffmpeg::error::EAGAIN =>
+        {
+          break; // decoder wants more input: feed the next packet
+        }
+        Err(e) => panic!("receive_frame: {e}"),
+      }
+    }
+  } // NOTE(review): no send_eof()/drain follows; buffered frames are not counted
+
+  assert!(count >= 1, "expected at least 1 decoded frame, got {count}");
+  eprintln!("decoded {count} frames via backend {:?}", decoder.backend());
+}
diff --git a/tests/foo.rs b/tests/foo.rs
deleted file mode 100644
index 8b13789..0000000
--- a/tests/foo.rs
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/tests/hw_smoke.rs b/tests/hw_smoke.rs
new file mode 100644
index 0000000..50d67ca
--- /dev/null
+++ b/tests/hw_smoke.rs
@@ -0,0 +1,80 @@
+//! `#[ignore]`-gated smoke test that exercises end-to-end hardware decode
+//! against a real video file: opens the auto-probed decoder, drives it
+//! until the first frame is delivered, and asserts the active backend is
+//! one of the supported HW variants. Run with:
+//!
+//! ```sh
+//! HWDECODE_SAMPLE_VIDEO=/path/to/clip.mp4 cargo test --test hw_smoke -- --ignored
+//! ```
+
+use ffmpeg::{format, media};
+use ffmpeg_next as ffmpeg;
+use hwdecode::{Backend, Frame, VideoDecoder};
+
+const SAMPLE_ENV: &str = "HWDECODE_SAMPLE_VIDEO";
+
+#[test]
+#[ignore = "requires HWDECODE_SAMPLE_VIDEO and a working hardware backend"]
+fn auto_probe_picks_hardware_backend() {
+  let path = std::env::var_os(SAMPLE_ENV).unwrap_or_else(|| panic!("{SAMPLE_ENV} not set"));
+
+  ffmpeg::init().expect("ffmpeg init");
+
+  let mut input = format::input(&path).expect("open input");
+  let stream = input
+    .streams()
+    .best(media::Type::Video)
+    .expect("video stream");
+  let stream_index = stream.index();
+
+  let mut decoder = VideoDecoder::open(stream.parameters()).expect("open decoder");
+  eprintln!("auto-probe optimistic backend = {:?}", decoder.backend());
+
+  // Decode at least one frame so the probe collapses, then check the
+  // backend that actually produced it. Checking `decoder.backend()` before
+  // any frame has been received would observe the optimistic pre-probe
+  // value and could false-pass when a HW backend silently degrades.
+  let mut frame = Frame::empty().expect("alloc frame");
+  let mut got_frame = false;
+  for (s, packet) in input.packets() {
+    if s.index() != stream_index {
+      continue; // packet belongs to another stream
+    }
+    decoder.send_packet(&packet).expect("send packet");
+    match decoder.receive_frame(&mut frame) {
+      Ok(()) => {
+        got_frame = true;
+        eprintln!(
+          "first frame: backend={:?} {}x{} pix_fmt={}",
+          decoder.backend(),
+          frame.width(),
+          frame.height(),
+          frame.pix_fmt()
+        );
+        break; // one frame is all this smoke test needs
+      }
+      Err(hwdecode::Error::Ffmpeg(ffmpeg::Error::Other { errno }))
+        if errno == ffmpeg::error::EAGAIN =>
+      {
+        continue; // no frame yet: feed the next packet
+      }
+      Err(e) => panic!("receive_frame: {e}"),
+    }
+  }
+  assert!(got_frame, "no frames decoded");
+  // After the probe collapses, `backend()` reports the backend that
+  // actually produced the first frame. Make the doc-comment claim
+  // explicit: it must be one of the HW variants. Today the enum is
+  // exhaustively HW-only, so `matches!` here is tautological — but it
+  // documents intent and would catch a future regression that
+  // reintroduces a non-HW variant or leaves the active state
+  // mis-classified.
+  let backend = decoder.backend();
+  assert!(
+    matches!(
+      backend,
+      Backend::VideoToolbox | Backend::Vaapi | Backend::Cuda | Backend::D3d11va
+    ),
+    "expected HW backend, got {backend:?}"
+  );
+}