From 2a9b039b01660e021310ee9f15a05da755d0d99c Mon Sep 17 00:00:00 2001 From: CocoRoF Date: Tue, 17 Mar 2026 22:14:12 +0900 Subject: [PATCH 1/2] Add debugging scripts and comprehensive tests for Googer library - Introduced `debug_html.py` to fetch and save HTML responses from Google searches using different user agents. - Created `debug_parse.py` to parse the saved GSA HTML using BeautifulSoup and extract relevant data. - Added `debug_selectors.py` and `debug_selectors2.py` to check for specific selectors in the HTML and analyze their occurrences. - Developed `debug_site.py` to test the site operator functionality within the Googer library. - Implemented `test_comprehensive.py` for a thorough test of Googer library features, including context manager usage and query building. - Added `test_googer.py` for a simple test case demonstrating basic search functionality. --- Cargo.lock | 980 ++++---------------- Cargo.toml | 4 +- googer_python_library/debug_chrome.html | 18 + googer_python_library/debug_google.html | 18 + googer_python_library/debug_gsa.html | 47 + googer_python_library/debug_html.py | 54 ++ googer_python_library/debug_parse.py | 42 + googer_python_library/debug_selectors.py | 21 + googer_python_library/debug_selectors2.py | 71 ++ googer_python_library/debug_site.py | 28 + googer_python_library/test_comprehensive.py | 51 + googer_python_library/test_googer.py | 6 + pyproject.toml | 2 +- src/config.rs | 2 +- src/engines/base.rs | 4 +- src/http_client.rs | 110 ++- src/utils.rs | 9 +- 17 files changed, 596 insertions(+), 871 deletions(-) create mode 100644 googer_python_library/debug_chrome.html create mode 100644 googer_python_library/debug_google.html create mode 100644 googer_python_library/debug_gsa.html create mode 100644 googer_python_library/debug_html.py create mode 100644 googer_python_library/debug_parse.py create mode 100644 googer_python_library/debug_selectors.py create mode 100644 googer_python_library/debug_selectors2.py create mode 100644 googer_python_library/debug_site.py create mode 100644 googer_python_library/test_comprehensive.py create mode 100644 googer_python_library/test_googer.py diff --git a/Cargo.lock b/Cargo.lock index 9d78662..e68ca30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "ahash" version = "0.8.12" @@ -24,12 +30,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "anyhow" -version = "1.0.102" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" - [[package]] name = "arc-swap" version = "1.8.2" @@ -39,12 +39,6 @@ dependencies = [ "rustversion", ] -[[package]] -name = "atomic-waker" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" - [[package]] name = "autocfg" version = "1.5.0" @@ -63,24 +57,12 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" -[[package]] -name = "bumpalo" -version = "3.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" - [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" -[[package]] -name = "bytes" -version = "1.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" - [[package]] name = "cc" version = "1.2.57" @@ -110,15 +92,15 @@ dependencies = [ [[package]] name = "cookie_store" -version = "0.22.1" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206" +checksum = "2eac901828f88a5241ee0600950ab981148a18f2f756900ffba1b125ca6a3ef9" dependencies = [ "cookie", "document-features", "idna", + "indexmap", "log", - "publicsuffix", "serde", "serde_derive", "serde_json", @@ -127,21 +109,14 @@ dependencies = [ ] [[package]] -name = "core-foundation" -version = "0.10.1" +name = "crc32fast" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ - "core-foundation-sys", - "libc", + "cfg-if", ] -[[package]] -name = "core-foundation-sys" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" - [[package]] name = "cssparser" version = "0.34.0" @@ -232,22 +207,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" -[[package]] -name = "errno" -version = "0.3.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" -dependencies = [ - "libc", - "windows-sys", -] - -[[package]] -name = "fastrand" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" - [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -255,26 +214,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] -name = "foldhash" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" - -[[package]] -name = "foreign-types" -version = "0.3.2" +name = "flate2" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ - "foreign-types-shared", + "crc32fast", + "miniz_oxide", ] -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.2.2" @@ -294,55 +242,6 @@ dependencies = [ "new_debug_unreachable", ] -[[package]] -name = "futures-channel" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" - -[[package]] -name = "futures-io" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" - -[[package]] -name = "futures-sink" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" - -[[package]] -name = "futures-task" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" - -[[package]] -name = "futures-util" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" -dependencies = [ - "futures-core", - "futures-io", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "slab", -] - [[package]] name = "fxhash" version = "0.2.1" @@ -380,26 +279,13 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi 5.3.0", - "wasip2", -] - -[[package]] -name = "getrandom" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" -dependencies = [ - "cfg-if", - "libc", - "r-efi 6.0.0", + "r-efi", "wasip2", - "wasip3", ] [[package]] name = "googer" -version = "0.2.4" +version = "0.2.5" dependencies = [ "html-escape", "log", @@ -407,24 +293,15 @@ dependencies = [ "pyo3-log", "rand", "regex", - "reqwest", "scraper", "serde", "serde_json", "thiserror", "unicode-normalization", + "ureq", "urlencoding", ] -[[package]] -name = "hashbrown" -version = "0.15.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" -dependencies = [ - "foldhash", -] - [[package]] name = "hashbrown" version = "0.16.1" @@ -458,105 +335,6 @@ dependencies = [ "match_token", ] -[[package]] -name = "http" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" -dependencies = [ - "bytes", - "itoa", -] - -[[package]] -name = "http-body" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" -dependencies = [ - "bytes", - "http", -] - -[[package]] -name = "http-body-util" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" -dependencies = [ - "bytes", - "futures-core", - "http", - "http-body", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" - -[[package]] -name = "hyper" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" -dependencies = [ - "atomic-waker", - "bytes", - "futures-channel", - "futures-core", - "http", - "http-body", - "httparse", - "itoa", - "pin-project-lite", - "pin-utils", - "smallvec", - "tokio", - "want", -] - -[[package]] -name = "hyper-tls" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" -dependencies = [ - "bytes", - "http-body-util", - "hyper", - "hyper-util", - "native-tls", - "tokio", - "tokio-native-tls", - "tower-service", -] - -[[package]] -name = "hyper-util" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" -dependencies = [ - "base64", - "bytes", - "futures-channel", - "futures-util", - "http", - "http-body", - "hyper", - "ipnet", - "libc", - "percent-encoding", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", -] - [[package]] name = "icu_collections" version = "2.1.1" @@ -638,12 +416,6 @@ dependencies = [ "zerovec", ] -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "idna" version = "1.1.0" @@ -672,9 +444,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", - "hashbrown 0.16.1", - "serde", - "serde_core", + "hashbrown", ] [[package]] @@ -686,56 +456,18 @@ dependencies = [ "rustversion", ] -[[package]] -name = "ipnet" -version = "2.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" - -[[package]] -name = "iri-string" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" -dependencies = [ - "memchr", - "serde", -] - [[package]] name = "itoa" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" -[[package]] -name = "js-sys" -version = "0.3.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" -dependencies = [ - "once_cell", - "wasm-bindgen", -] - -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "libc" version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" -[[package]] -name = "linux-raw-sys" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" - [[package]] name = "litemap" version = "0.8.1" @@ -810,31 +542,13 @@ dependencies = [ ] [[package]] -name = "mio" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" -dependencies = [ - "libc", - "wasi", - "windows-sys", -] - -[[package]] -name = "native-tls" -version = "0.2.18" +name = "miniz_oxide" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", + "adler2", + "simd-adler32", ] [[package]] @@ -855,60 +569,6 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" -[[package]] -name = "openssl" -version = "0.10.76" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" -dependencies = [ - "bitflags", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "openssl-probe" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" - -[[package]] -name = "openssl-src" -version = "300.5.5+3.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f1787d533e03597a7934fd0a765f0d28e94ecc5fb7789f8053b1e699a56f709" -dependencies = [ - "cc", -] - -[[package]] -name = "openssl-sys" -version = "0.9.112" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" -dependencies = [ - "cc", - "libc", - "openssl-src", - "pkg-config", - "vcpkg", -] - [[package]] name = "parking_lot" version = "0.12.5" @@ -990,24 +650,6 @@ dependencies = [ "siphasher", ] -[[package]] -name = "pin-project-lite" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" - [[package]] name = "portable-atomic" version = "1.13.1" @@ -1044,16 +686,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" -[[package]] -name = "prettyplease" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" -dependencies = [ - "proc-macro2", - "syn", -] - [[package]] name = "proc-macro2" version = "1.0.106" @@ -1063,22 +695,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "psl-types" -version = "2.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" - -[[package]] -name = "publicsuffix" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f42ea446cab60335f76979ec15e12619a2165b5ae2c12166bef27d283a9fadf" -dependencies = [ - "idna", - "psl-types", -] - [[package]] name = "pyo3" version = "0.23.5" @@ -1168,12 +784,6 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" -[[package]] -name = "r-efi" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" - [[package]] name = "rand" version = "0.8.5" @@ -1243,56 +853,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] -name = "reqwest" -version = "0.12.28" +name = "ring" +version = "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ - "base64", - "bytes", - "cookie", - "cookie_store", - "futures-channel", - "futures-core", - "futures-util", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-tls", - "hyper-util", - "js-sys", - "log", - "native-tls", - "percent-encoding", - "pin-project-lite", - "rustls-pki-types", - "serde", - "serde_json", - "serde_urlencoded", - "sync_wrapper", - "tokio", - "tokio-native-tls", - "tower", - "tower-http", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys", ] [[package]] -name = "rustix" -version = "1.1.4" +name = "rustls" +version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys", + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", ] [[package]] @@ -1305,25 +891,21 @@ dependencies = [ ] [[package]] -name = "rustversion" -version = "1.0.22" +name = "rustls-webpki" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" - -[[package]] -name = "ryu" -version = "1.0.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] [[package]] -name = "schannel" -version = "0.1.29" +name = "rustversion" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" -dependencies = [ - "windows-sys", -] +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "scopeguard" @@ -1347,29 +929,6 @@ dependencies = [ "tendril", ] -[[package]] -name = "security-framework" -version = "3.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" -dependencies = [ - "bitflags", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "selectors" version = "0.26.0" @@ -1389,12 +948,6 @@ dependencies = [ "smallvec", ] -[[package]] -name = "semver" -version = "1.0.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" - [[package]] name = "serde" version = "1.0.228" @@ -1438,18 +991,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - [[package]] name = "servo_arc" version = "0.4.3" @@ -1466,16 +1007,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] -name = "siphasher" -version = "1.0.2" +name = "simd-adler32" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" [[package]] -name = "slab" -version = "0.4.12" +name = "siphasher" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" [[package]] name = "smallvec" @@ -1484,13 +1025,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] -name = "socket2" -version = "0.6.3" +name = "socks" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" dependencies = [ + "byteorder", "libc", - "windows-sys", + "winapi", ] [[package]] @@ -1524,6 +1066,12 @@ dependencies = [ "quote", ] +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.117" @@ -1535,15 +1083,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync_wrapper" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" -dependencies = [ - "futures-core", -] - [[package]] name = "synstructure" version = "0.13.2" @@ -1561,19 +1100,6 @@ version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" -[[package]] -name = "tempfile" -version = "3.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" -dependencies = [ - "fastrand", - "getrandom 0.4.2", - "once_cell", - "rustix", - "windows-sys", -] - [[package]] name = "tendril" version = "0.4.3" @@ -1661,100 +1187,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tokio" -version = "1.50.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" -dependencies = [ - "bytes", - "libc", - "mio", - "pin-project-lite", - "socket2", - "windows-sys", -] - -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - -[[package]] -name = "tower" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" -dependencies = [ - "futures-core", - "futures-util", - "pin-project-lite", - "sync_wrapper", - "tokio", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-http" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" -dependencies = [ - "bitflags", - "bytes", - "futures-util", - "http", - "http-body", - "iri-string", - "pin-project-lite", - "tower", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - -[[package]] -name = "tower-service" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" - -[[package]] -name = "tracing" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" -dependencies = [ - "once_cell", -] - -[[package]] -name = "try-lock" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" - [[package]] name = "unicode-ident" version = "1.0.24" @@ -1776,18 +1208,37 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unindent" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64", + "cookie", + "cookie_store", + "flate2", + "log", + "once_cell", + "rustls", + "rustls-pki-types", + "socks", + "url", + "webpki-roots 0.26.11", +] + [[package]] name = "url" version = "2.5.8" @@ -1824,27 +1275,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" -[[package]] -name = "want" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" -dependencies = [ - "try-lock", -] - [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -1861,219 +1297,129 @@ dependencies = [ ] [[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +name = "webpki-roots" +version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "wit-bindgen", + "webpki-roots 1.0.6", ] [[package]] -name = "wasm-bindgen" -version = "0.2.114" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" -dependencies = [ - "cfg-if", - "once_cell", - "rustversion", - "wasm-bindgen-macro", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.64" +name = "webpki-roots" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" dependencies = [ - "cfg-if", - "futures-util", - "js-sys", - "once_cell", - "wasm-bindgen", - "web-sys", + "rustls-pki-types", ] [[package]] -name = "wasm-bindgen-macro" -version = "0.2.114" +name = "winapi" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ - "quote", - "wasm-bindgen-macro-support", + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", ] [[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.114" +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" -dependencies = [ - "bumpalo", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] -name = "wasm-bindgen-shared" -version = "0.2.114" +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" -dependencies = [ - "unicode-ident", -] +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "wasm-encoder" -version = "0.244.0" +name = "windows-link" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] -name = "wasm-metadata" -version = "0.244.0" +name = "windows-sys" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "anyhow", - "indexmap", - "wasm-encoder", - "wasmparser", + "windows-targets", ] [[package]] -name = "wasmparser" -version = "0.244.0" +name = "windows-targets" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap", - "semver", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] -name = "web-sys" -version = "0.3.91" +name = "windows_aarch64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" -dependencies = [ - "js-sys", - "wasm-bindgen", -] +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] -name = "windows-link" -version = "0.2.1" +name = "windows_aarch64_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] -name = "windows-sys" -version = "0.61.2" +name = "windows_i686_gnu" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" -dependencies = [ - "windows-link", -] +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] -name = "wit-bindgen" -version = "0.51.0" +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] -name = "wit-bindgen-core" -version = "0.51.0" +name = "windows_i686_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] -name = "wit-bindgen-rust" -version = "0.51.0" +name = "windows_x86_64_gnu" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap", - "prettyplease", - "syn", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" +name = "windows_x86_64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn", - "wit-bindgen-core", - "wit-bindgen-rust", -] +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] -name = "wit-component" -version = "0.244.0" +name = "windows_x86_64_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "wit-parser" -version = "0.244.0" +name = "wit-bindgen" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "writeable" diff --git a/Cargo.toml b/Cargo.toml index 8dc4273..95fe40d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "googer" -version = "0.2.5" +version = "0.2.6" edition = "2021" description = "A powerful, type-safe Google Search library for Python — powered by Rust." license = "Apache-2.0" @@ -11,7 +11,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.23", features = ["extension-module"] } -reqwest = { version = "0.12", default-features = false, features = ["blocking", "socks", "cookies", "native-tls-vendored"] } +ureq = { version = "2", features = ["cookies", "socks-proxy"] } scraper = "0.21" rand = "0.8" regex = "1" diff --git a/googer_python_library/debug_chrome.html b/googer_python_library/debug_chrome.html new file mode 100644 index 0000000..4a187e3 --- /dev/null +++ b/googer_python_library/debug_chrome.html @@ -0,0 +1,18 @@ +Google Search \ No newline at end of file diff --git a/googer_python_library/debug_google.html b/googer_python_library/debug_google.html new file mode 100644 index 0000000..cd6d383 --- /dev/null +++ b/googer_python_library/debug_google.html @@ -0,0 +1,18 @@ +Google Search \ No newline at end of file diff --git a/googer_python_library/debug_gsa.html b/googer_python_library/debug_gsa.html new file mode 100644 index 0000000..9435c07 --- /dev/null +++ b/googer_python_library/debug_gsa.html @@ -0,0 +1,47 @@ +python programming - Google Search

Search Results

AI overview

Thinking
Python is a popular programming language. It was created by Guido van Rossum, and released in 1991. It is used for:
People also ask
Feedback
Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
Write and run your Python code using our online compiler. Enjoy additional features like code sharing, dark mode, and support for multiple programming ...
Images
Beginning Python Programming — Part 1 | by Bob Roebling | Better ...
Better Programming
Python Tutorial for Absolute Beginners #1 - What Are Variables?
YouTube
Python is a high-level programming language, which can be easily interpreted by human. Python interpreter converts the Python code (high-level) into ...
\ No newline at end of file diff --git a/googer_python_library/debug_html.py b/googer_python_library/debug_html.py new file mode 100644 index 0000000..c672c54 --- /dev/null +++ b/googer_python_library/debug_html.py @@ -0,0 +1,54 @@ +import urllib.request +import re + +# Test with GSA user agent (what the library uses) +gsa_ua = "Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/411.0.879111500 Mobile/15E148 Safari/604.1" + +req = urllib.request.Request( + 'https://www.google.com/search?q=python+programming&start=0&hl=en&lr=lang_en', + headers={'User-Agent': gsa_ua} +) +r = urllib.request.urlopen(req) +html_gsa = r.read().decode('utf-8') +print(f"=== GSA UA HTML length: {len(html_gsa)} ===") +with open('debug_gsa.html', 'w', encoding='utf-8') as f: + f.write(html_gsa) + +# Test with Chrome desktop UA +chrome_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + +req2 = urllib.request.Request( + 'https://www.google.com/search?q=python+programming&start=0&hl=en&lr=lang_en', + headers={'User-Agent': chrome_ua} +) +r2 = urllib.request.urlopen(req2) +html_chrome = r2.read().decode('utf-8') +print(f"=== Chrome UA HTML length: {len(html_chrome)} ===") +with open('debug_chrome.html', 'w', encoding='utf-8') as f: + f.write(html_chrome) + +# Look for patterns in GSA HTML +print("\n--- GSA HTML patterns ---") +for pattern in ['data-snc', 'data-sncf', 'data-sokoban', 'class="g"', 'class="Gx5Zad"', + 'role="link"', '/url?q=', 'class="BNeawe"', 'class="kCrYT"', + 'class="ZINbbc"', 'class="tF2Cxc"', 'class="yuRUbf"', 'class="VwiC3b"', + 'class="egMi0 kCrYT"', 'class="BVG0Nb"', 'data-hveid']: + count = html_gsa.count(pattern) + if count > 0: + print(f" {pattern}: {count} occurrences") + +print("\n--- Chrome HTML patterns ---") +for pattern in ['data-snc', 'data-sncf', 'data-sokoban', 'class="g"', 'class="Gx5Zad"', + 'role="link"', '/url?q=', 'class="BNeawe"', 'class="kCrYT"', + 'class="ZINbbc"', 'class="tF2Cxc"', 'class="yuRUbf"', 'class="VwiC3b"', + 'class="egMi0 kCrYT"', 'class="BVG0Nb"', 'data-hveid']: + count = html_chrome.count(pattern) + if count > 0: + print(f" {pattern}: {count} occurrences") + +# Check for redirect/consent pages +print("\n--- Consent/redirect check ---") +print(f"GSA consent: {'consent' in html_gsa.lower()}") +print(f"Chrome consent: {'consent' in html_chrome.lower()}") +print(f"GSA captcha: {'captcha' in html_gsa.lower()}") +print(f"Chrome captcha: {'captcha' in html_chrome.lower()}") diff --git a/googer_python_library/debug_parse.py b/googer_python_library/debug_parse.py new file mode 100644 index 0000000..829e8c2 --- /dev/null +++ b/googer_python_library/debug_parse.py @@ -0,0 +1,42 @@ +"""Parse the saved GSA HTML with the same selectors the Rust code uses.""" +try: + from bs4 import BeautifulSoup +except ImportError: + import subprocess, sys + subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'beautifulsoup4', '-q']) + from bs4 import BeautifulSoup + +# Use lxml-like CSS selectors via BeautifulSoup to emulate what scraper crate does +with open('debug_gsa.html', 'r', encoding='utf-8') as f: + html = f.read() + +from bs4 import BeautifulSoup +soup = BeautifulSoup(html, 'html.parser') + +# Current selectors from config.rs +TEXT_ITEMS_SELECTOR = "div[data-snc]" +TEXT_TITLE_SELECTOR = "div[role='link']" +TEXT_HREF_SELECTOR = "a[href]" +TEXT_BODY_SELECTOR = "div[data-sncf]" + +items = soup.select(TEXT_ITEMS_SELECTOR) +print(f"Found {len(items)} data-snc items\n") + +for i, item in enumerate(items): + title_el = item.select_one(TEXT_TITLE_SELECTOR) + href_el = item.select_one(TEXT_HREF_SELECTOR) + body_el = item.select_one(TEXT_BODY_SELECTOR) + + title = title_el.get_text(strip=True) if title_el else "(no title)" + href = href_el.get('href', '(no href)') if href_el else "(no href)" + body = body_el.get_text(strip=True) if body_el else "(no body)" + + # Clean URL + if href.startswith('/url?q='): + href = href.split('/url?q=')[1].split('&')[0] + + print(f"--- Result {i+1} ---") + print(f"Title: {title[:100]}") + print(f"Href: {href[:100]}") + print(f"Body: {body[:150]}") + print() diff --git a/googer_python_library/debug_selectors.py b/googer_python_library/debug_selectors.py new file mode 100644 index 0000000..c96dc13 --- /dev/null +++ b/googer_python_library/debug_selectors.py @@ -0,0 +1,21 @@ +import urllib.request + +req = urllib.request.Request( + 'https://www.google.com/search?q=python+programming&start=0&hl=en&lr=lang_en&safe=active', + headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' + } +) +r = urllib.request.urlopen(req) +html = r.read().decode('utf-8') +print(f"HTML length: {len(html)}") +print(f"data-snc: {'data-snc' in html}") +print(f"data-sncf: {'data-sncf' in html}") +class_g = 'class="g"' +print(f"class g: {class_g in html}") +print(f"div.MjjYud: {'MjjYud' in html}") +print(f"div.g.Ww4FFb: {'Ww4FFb' in html}") + +with open('debug_google.html', 'w', encoding='utf-8') as f: + f.write(html) +print("Saved to debug_google.html") diff --git a/googer_python_library/debug_selectors2.py b/googer_python_library/debug_selectors2.py new file mode 100644 index 0000000..2864bb0 --- /dev/null +++ b/googer_python_library/debug_selectors2.py @@ -0,0 +1,71 @@ +from html.parser import HTMLParser +import re + +class SelectorChecker(HTMLParser): + def __init__(self): + super().__init__() + self.data_snc_count = 0 + self.data_snc_tags = [] + self.data_sncf_count = 0 + self.data_sncf_tags = [] + self.role_link_count = 0 + self.role_link_tags = [] + self.url_q_count = 0 + self.current_snc_item = None + self.in_snc = False + self.snc_items = [] + + def handle_starttag(self, tag, attrs): + attr_dict = dict(attrs) + if 'data-snc' in attr_dict: + self.data_snc_count += 1 + self.data_snc_tags.append((tag, attr_dict.get('data-snc', ''))) + if 'data-sncf' in attr_dict: + self.data_sncf_count += 1 + self.data_sncf_tags.append((tag, attr_dict.get('data-sncf', ''))) + if attr_dict.get('role') == 'link': + self.role_link_count += 1 + self.role_link_tags.append(tag) + href = attr_dict.get('href', '') + if '/url?q=' in href: + self.url_q_count += 1 + +with open('debug_gsa.html', 'r', encoding='utf-8') as f: + html = f.read() + +checker = SelectorChecker() +checker.feed(html) + +print(f"data-snc elements: {checker.data_snc_count}") +for tag, val in checker.data_snc_tags[:5]: + print(f" <{tag} data-snc=\"{val}\">") + +print(f"\ndata-sncf elements: {checker.data_sncf_count}") +for tag, val in checker.data_sncf_tags[:5]: + print(f" <{tag} data-sncf=\"{val}\">") + +print(f"\nrole='link' elements: {checker.role_link_count}") +for tag in checker.role_link_tags[:5]: + print(f" <{tag}>") + +print(f"\n/url?q= links: {checker.url_q_count}") + +# Let me try to extract a sample data-snc block with context +import re + +# Find first data-snc block with surrounding content +snc_pattern = re.compile(r']*data-snc[^>]*>.*?', re.DOTALL) +matches = snc_pattern.findall(html) +print(f"\n=== First data-snc block (first 2000 chars) ===") +if matches: + block = matches[0][:2000] + print(block) + +# Find context around data-snc more broadly +idx = html.find('data-snc') +if idx >= 0: + start = max(0, idx - 200) + end = min(len(html), idx + 2000) + snippet = html[start:end] + print(f"\n=== Context around first data-snc (chars {start}-{end}) ===") + print(snippet[:3000]) diff --git a/googer_python_library/debug_site.py b/googer_python_library/debug_site.py new file mode 100644 index 0000000..c8d9bfc --- /dev/null +++ b/googer_python_library/debug_site.py @@ -0,0 +1,28 @@ +"""Debug the site: operator issue.""" +from googer import Googer, Query, NoResultsException + +g = Googer() + +# Test 1: plain query (should work) +try: + results = g.search("python", max_results=3) + print(f"Plain 'python': got {len(results)} results") +except NoResultsException as e: + print(f"Plain 'python': NoResults - {e}") + +# Test 2: site operator in query string +try: + results = g.search("python site:github.com", max_results=5) + print(f"'python site:github.com': got {len(results)} results") + for r in results: + print(f" {r.title} -> {r.href}") +except NoResultsException as e: + print(f"'python site:github.com': NoResults - {e}") + +# Test 3: Query builder +try: + q = Query("python").site("github.com") + results = g.search(str(q), max_results=5) + print(f"Query builder: got {len(results)} results") +except NoResultsException as e: + print(f"Query builder: NoResults - {e}") diff --git a/googer_python_library/test_comprehensive.py b/googer_python_library/test_comprehensive.py new file mode 100644 index 0000000..ce64da4 --- /dev/null +++ b/googer_python_library/test_comprehensive.py @@ -0,0 +1,51 @@ +"""Comprehensive test of Googer library functionality.""" +from googer import Googer, Query + +print("=" * 60) +print("TEST 1: Basic search with context manager") +print("=" * 60) +with Googer() as g: + results = g.search("2025한국시리즈 우승팀") + print(f"Got {len(results)} results") + for r in results: + print(f" Title: {r.title}") + print(f" URL: {r.href}") + print(f" Body: {r.body[:80]}...") + print() + +print("=" * 60) +print("TEST 2: Search without context manager") +print("=" * 60) +g = Googer() +results = g.search("machine learning", max_results=5) +print(f"Got {len(results)} results") +for r in results: + print(f" {r.title} -> {r.href}") + +print() +print("=" * 60) +print("TEST 3: Query builder") +print("=" * 60) +q = Query("python").site("github.com") +print(f"Query: {q}") +results = g.search(str(q), max_results=5) +print(f"Got {len(results)} results") +for r in results: + print(f" {r.title}") + +print() +print("=" * 60) +print("TEST 4: Dict-like access") +print("=" * 60) +results = g.search("rust programming", max_results=3) +for r in results: + d = r.to_dict() + print(f" keys: {list(d.keys())}") + print(f" title via dict: {d['title']}") + print(f" title via attr: {r.title}") + print(f" 'title' in r: {'title' in r}") + print() + +print("=" * 60) +print("ALL TESTS PASSED!") +print("=" * 60) diff --git a/googer_python_library/test_googer.py b/googer_python_library/test_googer.py new file mode 100644 index 0000000..7b668c6 --- /dev/null +++ b/googer_python_library/test_googer.py @@ -0,0 +1,6 @@ +from googer import Googer + +with Googer() as g: + results = g.search("2025한국시리즈 우승팀") + for r in results: + print(r.title) diff --git a/pyproject.toml b/pyproject.toml index 6389b56..fdc2ed8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "googer" -version = "0.2.5" +version = "0.2.6" description = "A powerful, type-safe Google Search library for Python — powered by Rust." readme = "README.md" requires-python = ">=3.10" diff --git a/src/config.rs b/src/config.rs index 80b6b95..02a90e6 100644 --- a/src/config.rs +++ b/src/config.rs @@ -18,7 +18,7 @@ use std::collections::HashMap; use std::sync::LazyLock; pub static SAFESEARCH_MAP: LazyLock> = - LazyLock::new(|| HashMap::from([("on", "2"), ("moderate", "1"), ("off", "0")])); + LazyLock::new(|| HashMap::from([("on", "active"), ("moderate", "moderate"), ("off", "off")])); pub static TIMELIMIT_MAP: LazyLock> = LazyLock::new(|| HashMap::from([("h", "h"), ("d", "d"), ("w", "w"), ("m", "m"), ("y", "y")])); diff --git a/src/engines/base.rs b/src/engines/base.rs index bb42bf2..a7c638e 100644 --- a/src/engines/base.rs +++ b/src/engines/base.rs @@ -19,8 +19,8 @@ pub fn build_base_params( let safe_val = SAFESEARCH_MAP .get(safesearch.to_lowercase().as_str()) .copied() - .unwrap_or("1"); - params.push(("filter".to_string(), safe_val.to_string())); + .unwrap_or("moderate"); + params.push(("safe".to_string(), safe_val.to_string())); let start = (page.saturating_sub(1)) * RESULTS_PER_PAGE; params.push(("start".to_string(), start.to_string())); diff --git a/src/http_client.rs b/src/http_client.rs index 3aedd83..f0afeab 100644 --- a/src/http_client.rs +++ b/src/http_client.rs @@ -1,19 +1,17 @@ -// HTTP client — mirrors googer/http_client.py +// HTTP client — powered by ureq (synchronous, no tokio dependency) // -// Wraps reqwest with retries, User-Agent rotation, and rate-limit detection. +// Wraps ureq with retries, User-Agent rotation, and rate-limit detection. use std::time::Duration; use log::{debug, warn}; -use reqwest::blocking::{Client, ClientBuilder}; -use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; -use reqwest::Proxy; +use ureq::Agent; use crate::config::{RATE_LIMIT_INDICATORS, RETRY_BACKOFF_FACTOR}; use crate::exceptions::GoogerError; use crate::user_agents::get_gsa_user_agent; -/// A thin wrapper around a blocking reqwest response. +/// A thin wrapper around an HTTP response. pub struct Response { pub status_code: u16, pub text: String, @@ -27,7 +25,7 @@ impl Response { /// HTTP client with retry logic and rate-limit detection. pub struct HttpClient { - client: Client, + agent: Agent, max_retries: u32, } @@ -36,33 +34,25 @@ impl HttpClient { pub fn new( proxy: Option<&str>, timeout: u64, - verify: bool, + _verify: bool, max_retries: u32, ) -> Result { - let mut builder = ClientBuilder::new() - .timeout(Duration::from_secs(timeout)) - .cookie_store(true) - .danger_accept_invalid_certs(!verify); + let mut builder = ureq::AgentBuilder::new() + .timeout_read(Duration::from_secs(timeout)) + .timeout_write(Duration::from_secs(timeout)) + .timeout_connect(Duration::from_secs(timeout.min(15))) + .max_idle_connections(5) + .redirects(10); if let Some(proxy_url) = proxy { - let p = - Proxy::all(proxy_url).map_err(|e| GoogerError::Http(format!("Bad proxy: {e}")))?; - builder = builder.proxy(p); + let proxy = ureq::Proxy::new(proxy_url) + .map_err(|e| GoogerError::Http(format!("Bad proxy: {e}")))?; + builder = builder.proxy(proxy); } - let mut headers = HeaderMap::new(); - headers.insert( - USER_AGENT, - HeaderValue::from_str(&get_gsa_user_agent()).unwrap(), - ); - builder = builder.default_headers(headers); - - let client = builder - .build() - .map_err(|e| GoogerError::Http(format!("Failed to build HTTP client: {e}")))?; - + let agent = builder.build(); Ok(Self { - client, + agent, max_retries, }) } @@ -71,19 +61,34 @@ impl HttpClient { pub fn get(&self, url: &str, params: &[(String, String)]) -> Result { let mut last_err: Option = None; + // Build the full URL with query parameters + let full_url = if params.is_empty() { + url.to_string() + } else { + let query_string: String = params + .iter() + .map(|(k, v)| format!("{}={}", urlencoding::encode(k), urlencoding::encode(v))) + .collect::>() + .join("&"); + format!("{}?{}", url, query_string) + }; + for attempt in 1..=self.max_retries { - debug!("GET {} (attempt {}/{})", url, attempt, self.max_retries); - - match self - .client - .get(url) - .query(params) - .header(USER_AGENT, get_gsa_user_agent()) - .send() - { + debug!("GET {} (attempt {}/{})", full_url, attempt, self.max_retries); + + let ua = get_gsa_user_agent(); + let result = self + .agent + .get(&full_url) + .set("User-Agent", &ua) + .set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + .set("Accept-Language", "en-US,en;q=0.5") + .call(); + + match result { Ok(resp) => { - let status = resp.status().as_u16(); - let text = resp.text().unwrap_or_default(); + let status = resp.status(); + let text = resp.into_string().unwrap_or_default(); let response = Response { status_code: status, text, @@ -102,11 +107,32 @@ impl HttpClient { return Ok(response); } - Err(e) => { - if e.is_timeout() { - last_err = Some(GoogerError::Timeout(e.to_string())); + Err(ureq::Error::Status(code, resp)) => { + let text = resp.into_string().unwrap_or_default(); + let response = Response { + status_code: code, + text, + }; + + if is_rate_limited(&response) { + if attempt < self.max_retries { + warn!("Rate limit detected, retrying..."); + backoff(attempt); + continue; + } + return Err(GoogerError::RateLimit( + "Google rate limit detected.".to_string(), + )); + } + + return Ok(response); + } + Err(ureq::Error::Transport(e)) => { + let err_str = e.to_string(); + if err_str.contains("timeout") || err_str.contains("Timeout") { + last_err = Some(GoogerError::Timeout(err_str)); } else { - last_err = Some(GoogerError::Http(e.to_string())); + last_err = Some(GoogerError::Http(err_str)); } if attempt < self.max_retries { backoff(attempt); diff --git a/src/utils.rs b/src/utils.rs index db3ad08..c0ffe8f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -123,14 +123,11 @@ pub fn build_region_params(region: &str) -> Vec<(String, String)> { }; vec![ - ( - "hl".to_string(), - format!("{}-{}", lang, country.to_uppercase()), - ), + ("hl".to_string(), lang.clone()), ("lr".to_string(), format!("lang_{lang}")), ( - "cr".to_string(), - format!("country{}", country.to_uppercase()), + "gl".to_string(), + country.to_uppercase(), ), ] } From 589c24190d6bd52c15d425df312bf73ffd998c3d Mon Sep 17 00:00:00 2001 From: CocoRoF Date: Tue, 17 Mar 2026 22:16:17 +0900 Subject: [PATCH 2/2] Remove debug HTML and parsing scripts, and clean up related code - Deleted `debug_html.py`, `debug_parse.py`, `debug_selectors.py`, `debug_selectors2.py`, `debug_site.py`, `test_comprehensive.py`, and `test_googer.py` as they are no longer needed. - Simplified the `HttpClient` struct initialization in `http_client.rs`. - Improved formatting in the GET request logging and headers setting in `http_client.rs`. - Cleaned up the `build_region_params` function in `utils.rs` for better readability. --- googer_python_library/debug_chrome.html | 18 ------ googer_python_library/debug_google.html | 18 ------ googer_python_library/debug_gsa.html | 47 -------------- googer_python_library/debug_html.py | 54 ---------------- googer_python_library/debug_parse.py | 42 ------------ googer_python_library/debug_selectors.py | 21 ------ googer_python_library/debug_selectors2.py | 71 --------------------- googer_python_library/debug_site.py | 28 -------- googer_python_library/test_comprehensive.py | 51 --------------- googer_python_library/test_googer.py | 6 -- src/http_client.rs | 15 +++-- src/utils.rs | 5 +- 12 files changed, 10 insertions(+), 366 deletions(-) delete mode 100644 googer_python_library/debug_chrome.html delete mode 100644 googer_python_library/debug_google.html delete mode 100644 googer_python_library/debug_gsa.html delete mode 100644 googer_python_library/debug_html.py delete mode 100644 googer_python_library/debug_parse.py delete mode 100644 googer_python_library/debug_selectors.py delete mode 100644 googer_python_library/debug_selectors2.py delete mode 100644 googer_python_library/debug_site.py delete mode 100644 googer_python_library/test_comprehensive.py delete mode 100644 googer_python_library/test_googer.py diff --git a/googer_python_library/debug_chrome.html b/googer_python_library/debug_chrome.html deleted file mode 100644 index 4a187e3..0000000 --- a/googer_python_library/debug_chrome.html +++ /dev/null @@ -1,18 +0,0 @@ -Google Search \ No newline at end of file diff --git a/googer_python_library/debug_google.html b/googer_python_library/debug_google.html deleted file mode 100644 index cd6d383..0000000 --- a/googer_python_library/debug_google.html +++ /dev/null @@ -1,18 +0,0 @@ -Google Search \ No newline at end of file diff --git a/googer_python_library/debug_gsa.html b/googer_python_library/debug_gsa.html deleted file mode 100644 index 9435c07..0000000 --- a/googer_python_library/debug_gsa.html +++ /dev/null @@ -1,47 +0,0 @@ -python programming - Google Search

Search Results

AI overview

Thinking
Python is a popular programming language. It was created by Guido van Rossum, and released in 1991. It is used for:
People also ask
Feedback
Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
Write and run your Python code using our online compiler. Enjoy additional features like code sharing, dark mode, and support for multiple programming ...
Images
Beginning Python Programming — Part 1 | by Bob Roebling | Better ...
Better Programming
Python Tutorial for Absolute Beginners #1 - What Are Variables?
YouTube
Python is a high-level programming language, which can be easily interpreted by human. Python interpreter converts the Python code (high-level) into ...
\ No newline at end of file diff --git a/googer_python_library/debug_html.py b/googer_python_library/debug_html.py deleted file mode 100644 index c672c54..0000000 --- a/googer_python_library/debug_html.py +++ /dev/null @@ -1,54 +0,0 @@ -import urllib.request -import re - -# Test with GSA user agent (what the library uses) -gsa_ua = "Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/411.0.879111500 Mobile/15E148 Safari/604.1" - -req = urllib.request.Request( - 'https://www.google.com/search?q=python+programming&start=0&hl=en&lr=lang_en', - headers={'User-Agent': gsa_ua} -) -r = urllib.request.urlopen(req) -html_gsa = r.read().decode('utf-8') -print(f"=== GSA UA HTML length: {len(html_gsa)} ===") -with open('debug_gsa.html', 'w', encoding='utf-8') as f: - f.write(html_gsa) - -# Test with Chrome desktop UA -chrome_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" - -req2 = urllib.request.Request( - 'https://www.google.com/search?q=python+programming&start=0&hl=en&lr=lang_en', - headers={'User-Agent': chrome_ua} -) -r2 = urllib.request.urlopen(req2) -html_chrome = r2.read().decode('utf-8') -print(f"=== Chrome UA HTML length: {len(html_chrome)} ===") -with open('debug_chrome.html', 'w', encoding='utf-8') as f: - f.write(html_chrome) - -# Look for patterns in GSA HTML -print("\n--- GSA HTML patterns ---") -for pattern in ['data-snc', 'data-sncf', 'data-sokoban', 'class="g"', 'class="Gx5Zad"', - 'role="link"', '/url?q=', 'class="BNeawe"', 'class="kCrYT"', - 'class="ZINbbc"', 'class="tF2Cxc"', 'class="yuRUbf"', 'class="VwiC3b"', - 'class="egMi0 kCrYT"', 'class="BVG0Nb"', 'data-hveid']: - count = html_gsa.count(pattern) - if count > 0: - print(f" {pattern}: {count} occurrences") - -print("\n--- Chrome HTML patterns ---") -for pattern in ['data-snc', 'data-sncf', 'data-sokoban', 'class="g"', 'class="Gx5Zad"', - 'role="link"', '/url?q=', 'class="BNeawe"', 'class="kCrYT"', - 'class="ZINbbc"', 'class="tF2Cxc"', 'class="yuRUbf"', 'class="VwiC3b"', - 'class="egMi0 kCrYT"', 'class="BVG0Nb"', 'data-hveid']: - count = html_chrome.count(pattern) - if count > 0: - print(f" {pattern}: {count} occurrences") - -# Check for redirect/consent pages -print("\n--- Consent/redirect check ---") -print(f"GSA consent: {'consent' in html_gsa.lower()}") -print(f"Chrome consent: {'consent' in html_chrome.lower()}") -print(f"GSA captcha: {'captcha' in html_gsa.lower()}") -print(f"Chrome captcha: {'captcha' in html_chrome.lower()}") diff --git a/googer_python_library/debug_parse.py b/googer_python_library/debug_parse.py deleted file mode 100644 index 829e8c2..0000000 --- a/googer_python_library/debug_parse.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Parse the saved GSA HTML with the same selectors the Rust code uses.""" -try: - from bs4 import BeautifulSoup -except ImportError: - import subprocess, sys - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'beautifulsoup4', '-q']) - from bs4 import BeautifulSoup - -# Use lxml-like CSS selectors via BeautifulSoup to emulate what scraper crate does -with open('debug_gsa.html', 'r', encoding='utf-8') as f: - html = f.read() - -from bs4 import BeautifulSoup -soup = BeautifulSoup(html, 'html.parser') - -# Current selectors from config.rs -TEXT_ITEMS_SELECTOR = "div[data-snc]" -TEXT_TITLE_SELECTOR = "div[role='link']" -TEXT_HREF_SELECTOR = "a[href]" -TEXT_BODY_SELECTOR = "div[data-sncf]" - -items = soup.select(TEXT_ITEMS_SELECTOR) -print(f"Found {len(items)} data-snc items\n") - -for i, item in enumerate(items): - title_el = item.select_one(TEXT_TITLE_SELECTOR) - href_el = item.select_one(TEXT_HREF_SELECTOR) - body_el = item.select_one(TEXT_BODY_SELECTOR) - - title = title_el.get_text(strip=True) if title_el else "(no title)" - href = href_el.get('href', '(no href)') if href_el else "(no href)" - body = body_el.get_text(strip=True) if body_el else "(no body)" - - # Clean URL - if href.startswith('/url?q='): - href = href.split('/url?q=')[1].split('&')[0] - - print(f"--- Result {i+1} ---") - print(f"Title: {title[:100]}") - print(f"Href: {href[:100]}") - print(f"Body: {body[:150]}") - print() diff --git a/googer_python_library/debug_selectors.py b/googer_python_library/debug_selectors.py deleted file mode 100644 index c96dc13..0000000 --- a/googer_python_library/debug_selectors.py +++ /dev/null @@ -1,21 +0,0 @@ -import urllib.request - -req = urllib.request.Request( - 'https://www.google.com/search?q=python+programming&start=0&hl=en&lr=lang_en&safe=active', - headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' - } -) -r = urllib.request.urlopen(req) -html = r.read().decode('utf-8') -print(f"HTML length: {len(html)}") -print(f"data-snc: {'data-snc' in html}") -print(f"data-sncf: {'data-sncf' in html}") -class_g = 'class="g"' -print(f"class g: {class_g in html}") -print(f"div.MjjYud: {'MjjYud' in html}") -print(f"div.g.Ww4FFb: {'Ww4FFb' in html}") - -with open('debug_google.html', 'w', encoding='utf-8') as f: - f.write(html) -print("Saved to debug_google.html") diff --git a/googer_python_library/debug_selectors2.py b/googer_python_library/debug_selectors2.py deleted file mode 100644 index 2864bb0..0000000 --- a/googer_python_library/debug_selectors2.py +++ /dev/null @@ -1,71 +0,0 @@ -from html.parser import HTMLParser -import re - -class SelectorChecker(HTMLParser): - def __init__(self): - super().__init__() - self.data_snc_count = 0 - self.data_snc_tags = [] - self.data_sncf_count = 0 - self.data_sncf_tags = [] - self.role_link_count = 0 - self.role_link_tags = [] - self.url_q_count = 0 - self.current_snc_item = None - self.in_snc = False - self.snc_items = [] - - def handle_starttag(self, tag, attrs): - attr_dict = dict(attrs) - if 'data-snc' in attr_dict: - self.data_snc_count += 1 - self.data_snc_tags.append((tag, attr_dict.get('data-snc', ''))) - if 'data-sncf' in attr_dict: - self.data_sncf_count += 1 - self.data_sncf_tags.append((tag, attr_dict.get('data-sncf', ''))) - if attr_dict.get('role') == 'link': - self.role_link_count += 1 - self.role_link_tags.append(tag) - href = attr_dict.get('href', '') - if '/url?q=' in href: - self.url_q_count += 1 - -with open('debug_gsa.html', 'r', encoding='utf-8') as f: - html = f.read() - -checker = SelectorChecker() -checker.feed(html) - -print(f"data-snc elements: {checker.data_snc_count}") -for tag, val in checker.data_snc_tags[:5]: - print(f" <{tag} data-snc=\"{val}\">") - -print(f"\ndata-sncf elements: {checker.data_sncf_count}") -for tag, val in checker.data_sncf_tags[:5]: - print(f" <{tag} data-sncf=\"{val}\">") - -print(f"\nrole='link' elements: {checker.role_link_count}") -for tag in checker.role_link_tags[:5]: - print(f" <{tag}>") - -print(f"\n/url?q= links: {checker.url_q_count}") - -# Let me try to extract a sample data-snc block with context -import re - -# Find first data-snc block with surrounding content -snc_pattern = re.compile(r']*data-snc[^>]*>.*?', re.DOTALL) -matches = snc_pattern.findall(html) -print(f"\n=== First data-snc block (first 2000 chars) ===") -if matches: - block = matches[0][:2000] - print(block) - -# Find context around data-snc more broadly -idx = html.find('data-snc') -if idx >= 0: - start = max(0, idx - 200) - end = min(len(html), idx + 2000) - snippet = html[start:end] - print(f"\n=== Context around first data-snc (chars {start}-{end}) ===") - print(snippet[:3000]) diff --git a/googer_python_library/debug_site.py b/googer_python_library/debug_site.py deleted file mode 100644 index c8d9bfc..0000000 --- a/googer_python_library/debug_site.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Debug the site: operator issue.""" -from googer import Googer, Query, NoResultsException - -g = Googer() - -# Test 1: plain query (should work) -try: - results = g.search("python", max_results=3) - print(f"Plain 'python': got {len(results)} results") -except NoResultsException as e: - print(f"Plain 'python': NoResults - {e}") - -# Test 2: site operator in query string -try: - results = g.search("python site:github.com", max_results=5) - print(f"'python site:github.com': got {len(results)} results") - for r in results: - print(f" {r.title} -> {r.href}") -except NoResultsException as e: - print(f"'python site:github.com': NoResults - {e}") - -# Test 3: Query builder -try: - q = Query("python").site("github.com") - results = g.search(str(q), max_results=5) - print(f"Query builder: got {len(results)} results") -except NoResultsException as e: - print(f"Query builder: NoResults - {e}") diff --git a/googer_python_library/test_comprehensive.py b/googer_python_library/test_comprehensive.py deleted file mode 100644 index ce64da4..0000000 --- a/googer_python_library/test_comprehensive.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Comprehensive test of Googer library functionality.""" -from googer import Googer, Query - -print("=" * 60) -print("TEST 1: Basic search with context manager") -print("=" * 60) -with Googer() as g: - results = g.search("2025한국시리즈 우승팀") - print(f"Got {len(results)} results") - for r in results: - print(f" Title: {r.title}") - print(f" URL: {r.href}") - print(f" Body: {r.body[:80]}...") - print() - -print("=" * 60) -print("TEST 2: Search without context manager") -print("=" * 60) -g = Googer() -results = g.search("machine learning", max_results=5) -print(f"Got {len(results)} results") -for r in results: - print(f" {r.title} -> {r.href}") - -print() -print("=" * 60) -print("TEST 3: Query builder") -print("=" * 60) -q = Query("python").site("github.com") -print(f"Query: {q}") -results = g.search(str(q), max_results=5) -print(f"Got {len(results)} results") -for r in results: - print(f" {r.title}") - -print() -print("=" * 60) -print("TEST 4: Dict-like access") -print("=" * 60) -results = g.search("rust programming", max_results=3) -for r in results: - d = r.to_dict() - print(f" keys: {list(d.keys())}") - print(f" title via dict: {d['title']}") - print(f" title via attr: {r.title}") - print(f" 'title' in r: {'title' in r}") - print() - -print("=" * 60) -print("ALL TESTS PASSED!") -print("=" * 60) diff --git a/googer_python_library/test_googer.py b/googer_python_library/test_googer.py deleted file mode 100644 index 7b668c6..0000000 --- a/googer_python_library/test_googer.py +++ /dev/null @@ -1,6 +0,0 @@ -from googer import Googer - -with Googer() as g: - results = g.search("2025한국시리즈 우승팀") - for r in results: - print(r.title) diff --git a/src/http_client.rs b/src/http_client.rs index f0afeab..3e9e2b5 100644 --- a/src/http_client.rs +++ b/src/http_client.rs @@ -51,10 +51,7 @@ impl HttpClient { } let agent = builder.build(); - Ok(Self { - agent, - max_retries, - }) + Ok(Self { agent, max_retries }) } /// Perform a GET request with retries. @@ -74,14 +71,20 @@ impl HttpClient { }; for attempt in 1..=self.max_retries { - debug!("GET {} (attempt {}/{})", full_url, attempt, self.max_retries); + debug!( + "GET {} (attempt {}/{})", + full_url, attempt, self.max_retries + ); let ua = get_gsa_user_agent(); let result = self .agent .get(&full_url) .set("User-Agent", &ua) - .set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + .set( + "Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + ) .set("Accept-Language", "en-US,en;q=0.5") .call(); diff --git a/src/utils.rs b/src/utils.rs index c0ffe8f..ab39bb8 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -125,10 +125,7 @@ pub fn build_region_params(region: &str) -> Vec<(String, String)> { vec![ ("hl".to_string(), lang.clone()), ("lr".to_string(), format!("lang_{lang}")), - ( - "gl".to_string(), - country.to_uppercase(), - ), + ("gl".to_string(), country.to_uppercase()), ] }