diff --git a/frontend/src-tauri/Cargo.lock b/frontend/src-tauri/Cargo.lock index 17a0db26..6622bab8 100644 --- a/frontend/src-tauri/Cargo.lock +++ b/frontend/src-tauri/Cargo.lock @@ -8,15 +8,6 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" -[[package]] -name = "adobe-cmap-parser" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" -dependencies = [ - "pom", -] - [[package]] name = "aead" version = "0.5.2" @@ -27,6 +18,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures 0.2.17", +] + [[package]] name = "ahash" version = "0.7.8" @@ -75,7 +77,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb4e440d04be07da1f1bf44fb4495ebd58669372fe0cffa6e48595ac5bd88a3" dependencies = [ "android_log-sys", - "env_filter", + "env_filter 0.1.4", "log", ] @@ -168,7 +170,7 @@ dependencies = [ "asn1-rs-derive", "asn1-rs-impl", "displaydoc", - "nom", + "nom 7.1.3", "num-traits", "rusticata-macros", "thiserror 1.0.69", @@ -504,6 +506,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-padding" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" +dependencies = [ + "generic-array", +] + [[package]] name = "block2" version = "0.5.1" @@ -618,6 +629,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "bytecount" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" + [[package]] name = "bytemuck" version = "1.24.0" @@ -630,6 +647,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.10.1" @@ -706,6 +729,15 @@ dependencies = [ "toml 0.9.8", ] +[[package]] +name = "cbc" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" +dependencies = [ + "cipher", +] + [[package]] name = "cc" version = "1.2.44" @@ -763,7 +795,18 @@ checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818" dependencies = [ "cfg-if", "cipher", - "cpufeatures", + "cpufeatures 0.2.17", +] + +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.0", ] [[package]] @@ -773,7 +816,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35" dependencies = [ "aead", - "chacha20", + "chacha20 0.9.1", "cipher", "poly1305", "zeroize", @@ -964,6 +1007,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1068,7 +1120,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "curve25519-dalek-derive", "fiat-crypto", "rustc_version", @@ -1146,7 +1198,7 @@ checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" dependencies = [ "asn1-rs", "displaydoc", - "nom", + "nom 7.1.3", "num-bigint", "num-traits", "rusticata-macros", @@ -1339,6 +1391,15 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "ecb" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7" +dependencies = [ + "cipher", +] + [[package]] name = "either" version = "1.15.0" @@ -1411,6 +1472,29 @@ dependencies = [ "regex", ] +[[package]] +name = "env_filter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" +dependencies = [ + "anstream", + "anstyle", + "env_filter 1.0.0", + "jiff", + "log", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -1438,15 +1522,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "euclid" -version = "0.20.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" -dependencies = [ - "num-traits", -] - [[package]] name = "event-listener" version = "5.4.1" @@ -1475,7 +1550,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "74fef4569247a5f429d9156b9d0a2599914385dd189c539334c625d8099d90ab" dependencies = [ "futures-core", - "nom", + "nom 7.1.3", "pin-project-lite", ] @@ -1485,6 +1560,26 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fax" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05de7d48f37cd6730705cbca900770cab77a89f413d23e100ad7fad7795a0ab" +dependencies = [ + "fax_derive", +] + +[[package]] +name = "fax_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0aca10fb742cb43f9e7bb8467c91aa9bcb8e3ffbc6a6f7389bb93ffc920577d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.108", +] + [[package]] name = "fdeflate" version = "0.3.7" @@ -1553,6 +1648,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foreign-types" version = "0.3.2" @@ -1883,11 +1984,25 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.0", + "wasip2", + "wasip3", +] + [[package]] name = "gio" version = "0.18.4" @@ -2061,6 +2176,17 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403" +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -2076,6 +2202,15 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + [[package]] name = "hashbrown" version = "0.16.0" @@ -2293,7 +2428,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc50b891e4acf8fe0e71ef88ec43ad82ee07b3810ad09de10f1d01f072ed4b98" dependencies = [ "byteorder", - "png", + "png 0.17.16", ] [[package]] @@ -2377,6 +2512,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "ident_case" version = "1.0.1" @@ -2404,6 +2545,22 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "image" +version = "0.25.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6506c6c10786659413faa717ceebcb8f70731c0a60cbae39795fdf114519c1a" +dependencies = [ + "bytemuck", + "byteorder-lite", + "moxcms", + "num-traits", + "png 0.18.1", + "tiff", + "zune-core 0.5.1", + "zune-jpeg 0.5.12", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -2442,6 +2599,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" dependencies = [ + "block-padding", "generic-array", ] @@ -2515,6 +2673,47 @@ dependencies = [ "system-deps", ] +[[package]] +name = "jiff" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819b44bc7c87d9117eb522f14d46e918add69ff12713c475946b0a29363ed1c2" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470252db18ecc35fd766c0891b1e3ec6cbbcd62507e85276c01bf75d8e94d4a1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.108", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jni" version = "0.21.1" @@ -2598,6 +2797,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libappindicator" version = "0.9.0" @@ -2687,19 +2892,32 @@ dependencies = [ [[package]] name = "lopdf" -version = "0.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +version = "0.39.0" +source = 
"git+https://github.com/J-F-Liu/lopdf?rev=df670a5#df670a5878df541a7a78a909f1555df76d3a12a6" dependencies = [ + "aes", + "bitflags 2.10.0", + "cbc", + "chrono", + "ecb", "encoding_rs", "flate2", + "getrandom 0.4.2", "indexmap 2.12.0", "itoa", + "jiff", "log", "md-5", - "nom", + "nom 8.0.0", + "nom_locate", + "rand 0.10.0", "rangemap", + "rayon", + "sha2", + "stringprep", + "thiserror 2.0.17", "time", + "ttf-parser", "weezl", ] @@ -2723,15 +2941,18 @@ dependencies = [ "axum", "base64 0.22.1", "dirs 5.0.1", + "flate2", "futures-util", "hound", + "image", "log", + "lopdf", "maple-proxy", "ndarray", "once_cell", "openssl", "ort", - "pdf-extract", + "pdf-inspector", "plist", "rand 0.8.5", "rand_distr", @@ -2898,6 +3119,16 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "moxcms" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac9557c559cd6fc9867e122e20d2cbefc9ca29d80d027a8e39310920ed2f0a97" +dependencies = [ + "num-traits", + "pxfm", +] + [[package]] name = "muda" version = "0.17.1" @@ -2913,7 +3144,7 @@ dependencies = [ "objc2-core-foundation", "objc2-foundation 0.3.2", "once_cell", - "png", + "png 0.17.16", "serde", "thiserror 2.0.17", "windows-sys 0.60.2", @@ -3017,6 +3248,26 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom_locate" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d" +dependencies = [ + "bytecount", + "memchr", + "nom 8.0.0", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -3642,18 +3893,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" [[package]] -name = "pdf-extract" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575" +name = "pdf-inspector" +version = "0.1.0" +source = "git+https://github.com/firecrawl/pdf-inspector?branch=main#2b5fcd7bbb33dd9840281b97f1ca6244e8660cdf" dependencies = [ - "adobe-cmap-parser", - "encoding_rs", - "euclid", + "env_logger", + "log", "lopdf", - "postscript", - "type1-encoding-parser", - "unicode-normalization", + "once_cell", + "rayon", + "regex", + "thiserror 2.0.17", + "ttf-parser", ] [[package]] @@ -3880,6 +4131,19 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags 2.10.0", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + [[package]] name = "polling" version = "3.11.0" @@ -3900,17 +4164,11 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf" dependencies = [ - "cpufeatures", + "cpufeatures 0.2.17", "opaque-debug", "universal-hash", ] -[[package]] -name = "pom" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" - [[package]] name = "portable-atomic" version = "1.11.1" @@ -3926,12 +4184,6 @@ dependencies = [ "portable-atomic", ] -[[package]] -name = "postscript" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" - [[package]] name = "potential_utf" version = "0.1.4" @@ -3962,6 +4214,16 @@ version = "0.1.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.108", +] + [[package]] name = "proc-macro-crate" version = "1.3.1" @@ -4050,6 +4312,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "pxfm" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d" + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quick-xml" version = "0.38.3" @@ -4129,6 +4403,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "radium" version = "0.7.0" @@ -4170,6 +4450,17 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +dependencies = [ + "chacha20 0.10.0", + "getrandom 0.4.2", + "rand_core 0.10.0", +] + [[package]] name = "rand_chacha" version = "0.2.2" @@ -4227,6 +4518,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" + [[package]] name = "rand_distr" version = "0.4.3" @@ -4519,7 +4816,7 @@ version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" dependencies = [ - "nom", + "nom 7.1.3", ] [[package]] @@ -4742,7 +5039,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" dependencies = [ - "half", + "half 1.8.3", "serde", ] @@ -4912,7 +5209,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -5088,6 +5385,17 @@ dependencies = [ "quote", ] +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + [[package]] name = "strsim" version = "0.11.1" @@ -5354,7 +5662,7 @@ dependencies = [ "ico", "json-patch", "plist", - "png", + "png 0.17.16", "proc-macro2", "quote", "semver", @@ -5739,6 +6047,20 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tiff" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af9605de7fee8d9551863fd692cce7637f548dbd9db9180fcc07ccc6d26c336f" +dependencies = [ + "fax", + "flate2", + "half 2.7.1", + "quick-error", + "weezl", + "zune-jpeg 0.4.21", +] + [[package]] name = "time" version = "0.3.44" @@ -6092,7 +6414,7 @@ dependencies = [ "objc2-core-graphics", "objc2-foundation 0.3.2", "once_cell", - "png", + "png 0.17.16", "serde", "thiserror 2.0.17", "windows-sys 0.60.2", @@ -6105,13 +6427,10 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] -name = "type1-encoding-parser" -version = "0.1.0" +name = "ttf-parser" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b" -dependencies = [ - "pom", -] +checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" [[package]] name = "typeid" @@ -6177,6 +6496,12 @@ dependencies = [ "unic-common", ] +[[package]] +name = "unicode-bidi" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" + [[package]] name = "unicode-ident" version = "1.0.22" @@ -6192,12 +6517,24 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" + [[package]] name = "unicode-segmentation" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "universal-hash" version = "0.5.1" @@ -6391,7 +6728,16 @@ version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.46.0", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", ] [[package]] @@ -6452,6 +6798,28 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.12.0", + "wasm-encoder", + "wasmparser", +] + [[package]] name = "wasm-streams" version = "0.4.2" @@ -6465,6 +6833,18 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.10.0", + "hashbrown 0.15.5", + "indexmap 2.12.0", + "semver", +] + [[package]] name = "web-sys" version = "0.3.82" @@ -7134,6 +7514,94 @@ version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck 0.5.0", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck 0.5.0", + "indexmap 2.12.0", + "prettyplease", + "syn 2.0.108", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.108", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.10.0", + "indexmap 2.12.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.12.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "writeable" version = "0.6.2" @@ -7237,7 +7705,7 @@ dependencies = [ "data-encoding", "der-parser", "lazy_static", - "nom", + "nom 7.1.3", "oid-registry", "rusticata-macros", "thiserror 1.0.69", @@ -7450,6 +7918,36 @@ dependencies = [ "memchr", ] +[[package]] +name = "zune-core" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a" + +[[package]] +name = "zune-core" +version = "0.5.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-jpeg" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ce2c8a9384ad323cf564b67da86e21d3cfdff87908bc1223ed5c99bc792713" +dependencies = [ + "zune-core 0.4.12", +] + +[[package]] +name = "zune-jpeg" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe" +dependencies = [ + "zune-core 0.5.1", +] + [[package]] name = "zvariant" version = "5.8.0" diff --git a/frontend/src-tauri/Cargo.toml b/frontend/src-tauri/Cargo.toml index 5b166c97..6a1714c8 100644 --- a/frontend/src-tauri/Cargo.toml +++ b/frontend/src-tauri/Cargo.toml @@ -37,7 +37,10 @@ maple-proxy = "0.1.7" tauri-plugin-fs = "2.4.4" anyhow = "1.0" axum = "0.8" -pdf-extract = "0.7" +pdf-inspector = { git = "https://github.com/firecrawl/pdf-inspector", branch = "main" } +lopdf = { git = "https://github.com/J-F-Liu/lopdf", rev = "df670a5" } +image = { version = "0.25", default-features = false, features = ["tiff", "jpeg", "png"] } +flate2 = "1" base64 = "0.22" [target.'cfg(any(target_os = "macos", target_os = "linux", target_os = "windows"))'.dependencies] diff --git a/frontend/src-tauri/src/pdf_extractor.rs b/frontend/src-tauri/src/pdf_extractor.rs index 2b652a24..fa9ec0ab 100644 --- a/frontend/src-tauri/src/pdf_extractor.rs +++ b/frontend/src-tauri/src/pdf_extractor.rs @@ -1,11 +1,16 @@ use base64::{engine::general_purpose::STANDARD as BASE64, Engine}; -use pdf_extract::extract_text_from_mem; +use pdf_inspector::{process_pdf_mem, PdfType}; use serde::{Deserialize, Serialize}; #[derive(Debug, Serialize, Deserialize)] pub struct DocumentData { pub filename: String, pub text_content: String, + /// Base64-encoded page images (data URLs) for scanned/image-based PDFs. 
+ /// Present when the PDF needs OCR -- the frontend should send these to a + /// vision model (e.g. Qwen 3 VL) for text extraction. + #[serde(skip_serializing_if = "Option::is_none")] + pub page_images: Option>, } #[derive(Debug, Serialize, Deserialize)] @@ -25,33 +30,377 @@ pub async fn extract_document_content( .decode(&file_base64) .map_err(|e| format!("Failed to decode base64 file: {e}"))?; - let text_content = match file_type.as_str() { + match file_type.as_str() { "pdf" | "application/pdf" => { - // Extract text from PDF - extract_text_from_mem(&file_bytes) - .map_err(|e| format!("Failed to extract text from PDF: {e}"))? + // Use pdf-inspector for full PDF processing (detect + extract + markdown) + let result = process_pdf_mem(&file_bytes) + .map_err(|e| format!("Failed to extract text from PDF: {e}"))?; + + // Log PDF classification info for debugging + log::info!( + "PDF '{filename}': type={:?}, pages={}, confidence={:.2}, has_encoding_issues={}, time={}ms", + result.pdf_type, + result.page_count, + result.confidence, + result.has_encoding_issues, + result.processing_time_ms + ); + + if result.has_encoding_issues { + log::warn!( + "PDF '{filename}' has encoding issues - extracted text may be incomplete or garbled" + ); + } + + // pdf-inspector extracts what it can from all PDF types. + // For scanned/image-based PDFs it may return limited or no text. + let markdown = result.markdown.unwrap_or_default(); + + if markdown.trim().is_empty() { + // For scanned/image-based PDFs, extract page images for vision-model OCR + if matches!(result.pdf_type, PdfType::Scanned | PdfType::ImageBased) { + log::info!( + "PDF '{filename}' is scanned/image-based -- extracting page images for OCR" + ); + let page_images = extract_page_images(&file_bytes)?; + if page_images.is_empty() { + return Err( + "This PDF appears to be scanned or image-based but no page images could be extracted." 
+ .to_string(), + ); + } + return Ok(DocumentResponse { + document: DocumentData { + filename, + text_content: String::new(), + page_images: Some(page_images), + }, + status: "completed".to_string(), + }); + } + return Err("No text content could be extracted from this PDF.".to_string()); + } + + Ok(DocumentResponse { + document: DocumentData { + filename, + text_content: markdown, + page_images: None, + }, + status: "completed".to_string(), + }) } "txt" | "text/plain" | "md" | "text/markdown" => { // For text files, just convert bytes to string - String::from_utf8(file_bytes).map_err(|e| format!("Failed to decode text file: {e}"))? + let text_content = String::from_utf8(file_bytes) + .map_err(|e| format!("Failed to decode text file: {e}"))?; + Ok(DocumentResponse { + document: DocumentData { + filename, + text_content, + page_images: None, + }, + status: "completed".to_string(), + }) + } + _ => Err(format!("Unsupported file type: {file_type}")), + } +} + +// --------------------------------------------------------------------------- +// Scanned-PDF page image extraction +// --------------------------------------------------------------------------- + +/// Extract embedded images from each page of a scanned PDF. +/// +/// Returns a Vec of base64 data-URL strings (e.g. `data:image/jpeg;base64,...`). 
+/// Handles the most common image encodings found in scanned documents: +/// - DCTDecode (JPEG) -- raw bytes passed through +/// - CCITTFaxDecode (Group 3/4) -- wrapped in a minimal TIFF, decoded, re-encoded as JPEG +/// - FlateDecode (raw pixels) -- decompressed and encoded as JPEG +fn extract_page_images(pdf_bytes: &[u8]) -> Result, String> { + let doc = lopdf::Document::load_mem(pdf_bytes) + .map_err(|e| format!("Failed to parse PDF for image extraction: {e}"))?; + + let mut images: Vec = Vec::new(); + + for (_page_num, page_id) in doc.get_pages() { + let page = doc + .get_object(page_id) + .map_err(|e| format!("Failed to get page object: {e}"))?; + let dict = match page.as_dict() { + Ok(d) => d, + Err(_) => continue, + }; + + let resources = match dict.get(b"Resources") { + Ok(r) => match doc.dereference(r) { + Ok((_, obj)) => obj.clone(), + Err(_) => continue, + }, + Err(_) => continue, + }; + + let xobjects = match resources.as_dict().and_then(|d| d.get(b"XObject")) { + Ok(x) => match doc.dereference(x) { + Ok((_, obj)) => obj.clone(), + Err(_) => continue, + }, + Err(_) => continue, + }; + + let xobj_dict = match xobjects.as_dict() { + Ok(d) => d, + Err(_) => continue, + }; + + for (_name, obj_ref) in xobj_dict.iter() { + let obj = match doc.dereference(obj_ref) { + Ok((_, o)) => o.clone(), + Err(_) => continue, + }; + + let stream = match obj.as_stream() { + Ok(s) => s, + Err(_) => continue, + }; + + // Only process Image XObjects + let subtype = stream + .dict + .get(b"Subtype") + .and_then(|s| s.as_name()) + .unwrap_or(b""); + if subtype != b"Image" { + continue; + } + + // Determine the filter (compression) type. + // Per PDF spec §7.3.4.2, Filter can be a single Name or an Array of Names. + // In an array, the last filter describes the final data format (e.g. + // [/ASCII85Decode /DCTDecode] means the underlying data is JPEG). 
+ let filter_name: Vec = stream + .dict + .get(b"Filter") + .ok() + .and_then(|f| { + f.as_name().map(|n| n.to_vec()).ok().or_else(|| { + f.as_array() + .ok() + .and_then(|arr| arr.last()) + .and_then(|last| last.as_name().ok()) + .map(|n| n.to_vec()) + }) + }) + .unwrap_or_default(); + + let data_url = match filter_name.as_slice() { + b"DCTDecode" => { + // Raw JPEG -- pass through directly + let b64 = BASE64.encode(&stream.content); + format!("data:image/jpeg;base64,{b64}") + } + b"CCITTFaxDecode" => { + // CCITT Group 3/4 fax data -- wrap in TIFF, decode, re-encode as JPEG + decode_ccitt_to_data_url(stream)? + } + b"FlateDecode" => { + // Raw pixel data compressed with zlib + decode_flate_to_data_url(stream)? + } + other => { + let name = String::from_utf8_lossy(other); + log::warn!("Unsupported PDF image filter '{name}' -- skipping"); + continue; + } + }; + + images.push(data_url); + } + } + + Ok(images) +} + +/// Decode a CCITTFaxDecode image stream by wrapping it in a minimal TIFF file, +/// then decoding with the `image` crate and re-encoding as JPEG. +fn decode_ccitt_to_data_url(stream: &lopdf::Stream) -> Result { + let width = + get_stream_int(&stream.dict, b"Width").ok_or("CCITTFaxDecode image missing Width")?; + let height = + get_stream_int(&stream.dict, b"Height").ok_or("CCITTFaxDecode image missing Height")?; + + // K < 0 = Group 4, K = 0 = Group 3 1-D, K > 0 = Group 3 2-D + let k = get_decode_parm_int(&stream.dict, b"K").unwrap_or(-1); + + let tiff_bytes = wrap_ccitt_as_tiff(&stream.content, width as u32, height as u32, k as i32); + + let img = image::load_from_memory(&tiff_bytes) + .map_err(|e| format!("Failed to decode CCITT image via TIFF wrapper: {e}"))?; + + encode_image_to_jpeg_data_url(&img) +} + +/// Decode a FlateDecode (zlib-compressed raw pixels) image stream. +fn decode_flate_to_data_url(stream: &lopdf::Stream) -> Result { + let width = + get_stream_int(&stream.dict, b"Width").ok_or("FlateDecode image missing Width")? 
as u32; + let height = + get_stream_int(&stream.dict, b"Height").ok_or("FlateDecode image missing Height")? as u32; + let bpc = get_stream_int(&stream.dict, b"BitsPerComponent").unwrap_or(8) as u32; + + // Decompress zlib data + use std::io::Read; + let mut decoder = flate2::read::ZlibDecoder::new(&stream.content[..]); + let mut raw_pixels = Vec::new(); + decoder + .read_to_end(&mut raw_pixels) + .map_err(|e| format!("Failed to decompress FlateDecode image: {e}"))?; + + // Determine color space + let cs_name = stream + .dict + .get(b"ColorSpace") + .and_then(|c| c.as_name().map(|n| n.to_vec())) + .unwrap_or_else(|_| b"DeviceGray".to_vec()); + + let img: image::DynamicImage = match (cs_name.as_slice(), bpc) { + (b"DeviceGray", 8) => { + let gray = image::GrayImage::from_raw(width, height, raw_pixels) + .ok_or("Failed to construct grayscale image from raw pixels")?; + image::DynamicImage::ImageLuma8(gray) + } + (b"DeviceRGB", 8) => { + let rgb = image::RgbImage::from_raw(width, height, raw_pixels) + .ok_or("Failed to construct RGB image from raw pixels")?; + image::DynamicImage::ImageRgb8(rgb) + } + (b"DeviceGray", 1) => { + // 1-bit grayscale -- expand to 8-bit + let expanded = expand_1bit_to_8bit(&raw_pixels, width, height); + let gray = image::GrayImage::from_raw(width, height, expanded) + .ok_or("Failed to construct 1-bit grayscale image")?; + image::DynamicImage::ImageLuma8(gray) } _ => { - return Err(format!("Unsupported file type: {file_type}")); + let name = String::from_utf8_lossy(&cs_name); + return Err(format!( + "Unsupported FlateDecode color space / bpc: {name}/{bpc}" + )); } }; - Ok(DocumentResponse { - document: DocumentData { - filename, - text_content, - }, - status: "completed".to_string(), - }) + encode_image_to_jpeg_data_url(&img) +} + +/// Encode a DynamicImage as a JPEG data URL. 
+fn encode_image_to_jpeg_data_url(img: &image::DynamicImage) -> Result { + let mut jpeg_buf = Vec::new(); + let mut cursor = std::io::Cursor::new(&mut jpeg_buf); + img.write_to(&mut cursor, image::ImageFormat::Jpeg) + .map_err(|e| format!("Failed to encode image as JPEG: {e}"))?; + let b64 = BASE64.encode(&jpeg_buf); + Ok(format!("data:image/jpeg;base64,{b64}")) +} + +/// Expand 1-bit-per-pixel data to 8-bit grayscale. +fn expand_1bit_to_8bit(data: &[u8], width: u32, height: u32) -> Vec { + let row_bytes = (width as usize).div_ceil(8); + let mut out = Vec::with_capacity((width * height) as usize); + for y in 0..height as usize { + for x in 0..width as usize { + let byte_idx = y * row_bytes + x / 8; + let bit_idx = 7 - (x % 8); + let bit = if byte_idx < data.len() { + (data[byte_idx] >> bit_idx) & 1 + } else { + 0 + }; + // In PDF, 0 is typically black for 1-bit images + out.push(if bit == 0 { 0 } else { 255 }); + } + } + out +} + +// --------------------------------------------------------------------------- +// TIFF wrapper for CCITT Group 3/4 data +// --------------------------------------------------------------------------- + +/// Wrap raw CCITT Group 3/4 data in a minimal TIFF container so the `image` +/// crate (via its `tiff` backend) can decode it. 
///
/// The output is a single-strip, little-endian, bilevel TIFF: an 8-byte
/// header, one IFD placed directly after it, then `ccitt_data` copied verbatim.
///
/// `k` follows the PDF `CCITTFaxDecode` `K` parameter: `k < 0` selects TIFF
/// compression 4 (Group 4); `k >= 0` selects compression 3 (Group 3), and
/// `k > 0` additionally writes `T4Options` with the 2-D coding bit set, since
/// a Group 3 strip without that tag is declared as 1-D coded.
fn wrap_ccitt_as_tiff(ccitt_data: &[u8], width: u32, height: u32, k: i32) -> Vec<u8> {
    // TIFF compression tag: 3 = CCITT Group 3, 4 = CCITT Group 4
    let compression: u16 = if k < 0 { 4 } else { 3 };
    let two_dimensional = k > 0;

    let num_entries: u16 = if two_dimensional { 9 } else { 8 };
    // entry count (2) + 12 bytes per entry + next-IFD offset (4)
    let ifd_size = 2 + u32::from(num_entries) * 12 + 4;
    let data_offset: u32 = 8 + ifd_size;
    // Classic TIFF stores byte counts as u32; saturate rather than wrap.
    let strip_len = ccitt_data.len().min(u32::MAX as usize) as u32;

    let mut buf = Vec::with_capacity(data_offset as usize + ccitt_data.len());

    // TIFF header (8 bytes)
    buf.extend_from_slice(b"II"); // little-endian
    buf.extend_from_slice(&42u16.to_le_bytes()); // magic
    buf.extend_from_slice(&8u32.to_le_bytes()); // offset to first IFD

    // IFD entry count
    buf.extend_from_slice(&num_entries.to_le_bytes());

    // IFD entries (must be sorted by tag)
    write_ifd_long(&mut buf, 256, width); // ImageWidth
    write_ifd_long(&mut buf, 257, height); // ImageLength
    write_ifd_short(&mut buf, 258, 1); // BitsPerSample
    write_ifd_short(&mut buf, 259, compression); // Compression
    write_ifd_short(&mut buf, 262, 0); // PhotometricInterpretation (WhiteIsZero)
    write_ifd_long(&mut buf, 273, data_offset); // StripOffsets
    write_ifd_long(&mut buf, 278, height); // RowsPerStrip
    write_ifd_long(&mut buf, 279, strip_len); // StripByteCounts
    if two_dimensional {
        write_ifd_long(&mut buf, 292, 1); // T4Options: bit 0 = 2-D coding
    }

    // Next IFD offset (0 = no more IFDs)
    buf.extend_from_slice(&0u32.to_le_bytes());

    // Image data
    buf.extend_from_slice(ccitt_data);

    buf
}

/// Append one 12-byte IFD entry of type SHORT (3) with a single value.
///
/// The 16-bit value sits in the low bytes of the 4-byte value field, which is
/// where a little-endian reader expects it.
fn write_ifd_short(buf: &mut Vec<u8>, tag: u16, value: u16) {
    buf.extend_from_slice(&tag.to_le_bytes());
    buf.extend_from_slice(&3u16.to_le_bytes()); // type = SHORT
    buf.extend_from_slice(&1u32.to_le_bytes()); // count
    buf.extend_from_slice(&u32::from(value).to_le_bytes());
}

/// Append one 12-byte IFD entry of type LONG (4) with a single value.
fn write_ifd_long(buf: &mut Vec<u8>, tag: u16, value: u32) {
    buf.extend_from_slice(&tag.to_le_bytes());
    buf.extend_from_slice(&4u16.to_le_bytes()); // type = LONG
    buf.extend_from_slice(&1u32.to_le_bytes()); // count
    buf.extend_from_slice(&value.to_le_bytes());
}

//
--------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Read an integer value from a PDF stream dictionary. +fn get_stream_int(dict: &lopdf::Dictionary, key: &[u8]) -> Option { + dict.get(key).ok().and_then(|v| v.as_i64().ok()) +} + +/// Read an integer from the DecodeParms sub-dictionary of a stream. +fn get_decode_parm_int(dict: &lopdf::Dictionary, key: &[u8]) -> Option { + dict.get(b"DecodeParms") + .ok() + .and_then(|v| v.as_dict().ok()) + .and_then(|d| d.get(key).ok()) + .and_then(|v| v.as_i64().ok()) } #[cfg(test)] mod tests { - use super::extract_document_content; + use super::*; use base64::{engine::general_purpose::STANDARD as BASE64, Engine}; #[tokio::test] @@ -118,4 +467,121 @@ mod tests { "unexpected error: {err}" ); } + + #[tokio::test] + async fn extract_bitcoin_whitepaper_pdf() { + // Read the Bitcoin whitepaper PDF from the test fixtures directory (not embedded in binary) + let pdf_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("test_fixtures/bitcoin_whitepaper.pdf"); + let pdf_bytes = std::fs::read(&pdf_path).unwrap_or_else(|e| { + panic!("failed to read test fixture at {}: {e}", pdf_path.display()) + }); + + let file_base64 = BASE64.encode(&pdf_bytes); + + let resp = + extract_document_content(file_base64, "bitcoin.pdf".to_string(), "pdf".to_string()) + .await + .expect("expected Bitcoin PDF extraction to succeed"); + + assert_eq!(resp.status, "completed"); + assert_eq!(resp.document.filename, "bitcoin.pdf"); + + let content = &resp.document.text_content; + + // Verify meaningful content was extracted (whitepaper is ~9 pages) + assert!( + content.len() > 1000, + "expected substantial content from Bitcoin whitepaper, got {} chars", + content.len() + ); + + // Verify key content from the Bitcoin whitepaper is present + assert!( + content.contains("Bitcoin") || content.contains("bitcoin"), + "expected 'Bitcoin' in 
extracted text" + ); + assert!( + content.contains("peer-to-peer") || content.contains("peer to peer"), + "expected 'peer-to-peer' in extracted text" + ); + assert!( + content.contains("Satoshi") || content.contains("Nakamoto"), + "expected author name in extracted text" + ); + } + + #[tokio::test] + async fn extract_scanned_pdf_returns_page_images() { + // Read a real scanned PDF (a scanned letter) from the test fixtures directory. + // Since it's a scanned document, pdf-inspector won't extract text -- instead + // we should get page images back for vision-model OCR. + let pdf_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("test_fixtures/scanned_letter.pdf"); + let pdf_bytes = std::fs::read(&pdf_path).unwrap_or_else(|e| { + panic!("failed to read test fixture at {}: {e}", pdf_path.display()) + }); + + let file_base64 = BASE64.encode(&pdf_bytes); + + let resp = extract_document_content( + file_base64, + "scanned_letter.pdf".to_string(), + "pdf".to_string(), + ) + .await + .expect("expected scanned PDF extraction to succeed (with page images)"); + + assert_eq!(resp.status, "completed"); + assert_eq!(resp.document.filename, "scanned_letter.pdf"); + + // Text content should be empty for a scanned PDF + assert!( + resp.document.text_content.is_empty(), + "scanned PDF should have empty text_content" + ); + + // Should have page images for OCR + let page_images = resp + .document + .page_images + .expect("scanned PDF should have page_images"); + + assert_eq!( + page_images.len(), + 1, + "scanned letter has 1 page, expected 1 image" + ); + + // Each image should be a valid JPEG data URL + assert!( + page_images[0].starts_with("data:image/jpeg;base64,"), + "page image should be a JPEG data URL" + ); + + // Verify the image has reasonable size (not empty/corrupt) + let b64_data = page_images[0] + .strip_prefix("data:image/jpeg;base64,") + .unwrap(); + let decoded_size = BASE64.decode(b64_data).unwrap().len(); + assert!( + decoded_size > 1000, + "JPEG image 
should be >1KB, got {} bytes", + decoded_size + ); + } + + #[test] + fn test_wrap_ccitt_as_tiff_structure() { + let fake_data = vec![0u8; 100]; + let tiff = wrap_ccitt_as_tiff(&fake_data, 100, 200, -1); + + // Check TIFF header + assert_eq!(&tiff[0..2], b"II"); // little-endian + assert_eq!(u16::from_le_bytes([tiff[2], tiff[3]]), 42); // magic + assert_eq!(u32::from_le_bytes([tiff[4], tiff[5], tiff[6], tiff[7]]), 8); // IFD offset + + // Should end with our image data + assert_eq!(&tiff[tiff.len() - 100..], &fake_data[..]); + } } diff --git a/frontend/src-tauri/test_fixtures/bitcoin_whitepaper.pdf b/frontend/src-tauri/test_fixtures/bitcoin_whitepaper.pdf new file mode 100644 index 00000000..1e19b739 Binary files /dev/null and b/frontend/src-tauri/test_fixtures/bitcoin_whitepaper.pdf differ diff --git a/frontend/src-tauri/test_fixtures/scanned_letter.pdf b/frontend/src-tauri/test_fixtures/scanned_letter.pdf new file mode 100644 index 00000000..8d3068e3 Binary files /dev/null and b/frontend/src-tauri/test_fixtures/scanned_letter.pdf differ diff --git a/frontend/src/components/UnifiedChat.tsx b/frontend/src/components/UnifiedChat.tsx index 830958ee..757db7cf 100644 --- a/frontend/src/components/UnifiedChat.tsx +++ b/frontend/src/components/UnifiedChat.tsx @@ -1733,6 +1733,7 @@ export function UnifiedChat() { document: { filename: string; text_content: string; + page_images?: string[]; }; status: string; } @@ -1743,7 +1744,23 @@ export function UnifiedChat() { fileType: "pdf" }); - if (result.document?.text_content) { + if (result.document?.page_images?.length) { + // Scanned/image-based PDF — store placeholder only (no base64 image data). + // page_images are intentionally excluded from documentText to avoid + // sending multi-MB base64 blobs as plain text to the AI model. + // Vision-model OCR integration will use a separate code path. 
+ const scannedData = { + document: { + filename: result.document.filename, + text_content: + "[Scanned PDF: " + + result.document.page_images.length + + " page image(s) extracted. OCR via vision model is not yet supported.]" + } + }; + setDocumentText(JSON.stringify(scannedData)); + setDocumentName(file.name); + } else if (result.document?.text_content) { // Create a cleaned version with image references removed const cleanedParsed = { document: {