From f976dfd21fbd52f662925feb78d081fd19b938b7 Mon Sep 17 00:00:00 2001 From: Toyosatomimi no Miko <110693261+mikomikotaishi@users.noreply.github.com> Date: Sat, 6 Dec 2025 18:07:50 -0500 Subject: [PATCH 1/3] Various code cleanup on pdftotext --- flow/api/parse/pdf/pdftotext.cc | 58 ++++++++++++++++++--------------- flow/api/parse/pdf/pdftotext.go | 8 ++--- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/flow/api/parse/pdf/pdftotext.cc b/flow/api/parse/pdf/pdftotext.cc index 03ed1430e..49482d6e3 100644 --- a/flow/api/parse/pdf/pdftotext.cc +++ b/flow/api/parse/pdf/pdftotext.cc @@ -1,40 +1,44 @@ #include #include +#include #include "poppler/cpp/poppler-document.h" #include "poppler/cpp/poppler-page.h" -void donothing(const std::string &, void *) {} +using String = std::string; +template +using Vector = std::vector; +using Document = poppler::document; extern "C" { - const char *pdftotext(const char *data, int data_size) - { - static bool has_reset_error_function = false; - if (!has_reset_error_function) { - // Do not log errors from poppler to stderr - poppler::set_debug_error_function(donothing, nullptr); - has_reset_error_function = true; - } + [[nodiscard]] + const char* pdfToText(const char* data, int dataSize) noexcept { + static bool hasResetErrorFunction = false; + if (!hasResetErrorFunction) { + // Do not log errors from poppler to stderr + poppler::set_debug_error_function( + []([[maybe_unused]] const String& s, [[maybe_unused]] void* p) -> void {}, + nullptr + ); + hasResetErrorFunction = true; + } - const auto *doc = poppler::document::load_from_raw_data(data, data_size); - if (doc == nullptr) { - return nullptr; - } - const int N = doc->pages(); + const Document* doc = Document::load_from_raw_data(data, dataSize); + if (!doc) { + return nullptr; + } + const int pageCount = doc->pages(); - std::vector contents[N]; - int text_length = 0; - for (int i = 0; i < N; ++i) { - contents[i] = doc->create_page(i)->text().to_utf8(); - text_length += contents[i].size(); - } + String result; + for (int i = 0; i < pageCount; ++i) { + Vector pageText = doc->create_page(i)->text().to_utf8(); + result.append(pageText.begin(), pageText.end()); + } - char *buffer = (char *)std::malloc(text_length + 1); - for (int i = 0, offset = 0; i < N; offset += contents[i].size(), ++i) { - std::memcpy(buffer + offset, contents[i].data(), contents[i].size()); - } - buffer[text_length] = '\0'; + char* buffer = (char*)std::malloc(result.length() + 1); + std::memcpy(buffer, result.data(), result.length()); + buffer[result.length()] = '\0'; - return buffer; - } + return buffer; + } } diff --git a/flow/api/parse/pdf/pdftotext.go b/flow/api/parse/pdf/pdftotext.go index 944935d2a..d5cc701e2 100644 --- a/flow/api/parse/pdf/pdftotext.go +++ b/flow/api/parse/pdf/pdftotext.go @@ -3,7 +3,7 @@ package pdf // #cgo CFLAGS: -O2 -Wall -I/usr/include/poppler/cpp // #cgo LDFLAGS: -lpoppler-cpp // #include -// const char *pdftotext(const char *data, int data_size); +// const char *pdfToText(const char* data, int data_size); import "C" import ( "errors" @@ -12,12 +12,12 @@ import ( func ToText(data []byte) (string, error) { // Is this safe? Kind of: `data`, a []byte is a continguous array in Go, - // so we can safely point a C-land (const char *) to it, + // so we can safely point a C-land (const char*) to it, // *provided* that C code does not attempt to find the end of the string, // as []byte need not be zero-terminated. - // This is true for us, as C.pdftotext treats its first argument as bytes. + // This is true for us, as C.pdfToText treats its first argument as bytes. cData := (*C.char)(unsafe.Pointer(&data[0])) - result := C.pdftotext(cData, C.int(len(data))) + result := C.pdfToText(cData, C.int(len(data))) if result != nil { converted := C.GoString(result) C.free(unsafe.Pointer(result)) From c1a35651fc02ab7bff5f7a27471878cbd4df0d95 Mon Sep 17 00:00:00 2001 From: Toyosatomimi no Miko <110693261+mikomikotaishi@users.noreply.github.com> Date: Sat, 6 Dec 2025 18:16:45 -0500 Subject: [PATCH 2/3] Replace vector header with poppler type --- flow/api/parse/pdf/pdftotext.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/api/parse/pdf/pdftotext.cc b/flow/api/parse/pdf/pdftotext.cc index 49482d6e3..725eab79f 100644 --- a/flow/api/parse/pdf/pdftotext.cc +++ b/flow/api/parse/pdf/pdftotext.cc @@ -1,13 +1,11 @@ #include #include -#include #include "poppler/cpp/poppler-document.h" #include "poppler/cpp/poppler-page.h" using String = std::string; -template -using Vector = std::vector; +using ByteArray = poppler::byte_array; using Document = poppler::document; extern "C" { @@ -31,7 +29,7 @@ extern "C" { String result; for (int i = 0; i < pageCount; ++i) { - Vector pageText = doc->create_page(i)->text().to_utf8(); + ByteArray pageText = doc->create_page(i)->text().to_utf8(); result.append(pageText.begin(), pageText.end()); } From 7071f2c944f1e3b703384ef65d4acb536ae039b1 Mon Sep 17 00:00:00 2001 From: Toyosatomimi no Miko <110693261+mikomikotaishi@users.noreply.github.com> Date: Sun, 28 Dec 2025 17:19:51 -0500 Subject: [PATCH 3/3] Address unfreed Document and race condition --- flow/api/parse/pdf/pdftotext.cc | 52 +++++++++++++++++++++++---------- flow/api/parse/pdf/pdftotext.go | 10 +++++-- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/flow/api/parse/pdf/pdftotext.cc b/flow/api/parse/pdf/pdftotext.cc index 725eab79f..fa52a7dee 100644 --- a/flow/api/parse/pdf/pdftotext.cc +++ b/flow/api/parse/pdf/pdftotext.cc @@ -1,39 +1,59 @@ #include #include +#include +#include -#include "poppler/cpp/poppler-document.h" -#include "poppler/cpp/poppler-page.h" +#include +#include +using std::size_t; +using OnceFlag = std::once_flag; using String = std::string; +template > +using UniquePtr = std::unique_ptr; + using ByteArray = poppler::byte_array; using Document = poppler::document; +using Page = poppler::page; + +namespace { + OnceFlag errorFnFlag; + + void initErrorFunction() { + // Do not log errors from poppler to stderr + poppler::set_debug_error_function( + []([[maybe_unused]] const String& s, [[maybe_unused]] void* p) -> void {}, + nullptr + ); + } +} extern "C" { [[nodiscard]] - const char* pdfToText(const char* data, int dataSize) noexcept { - static bool hasResetErrorFunction = false; - if (!hasResetErrorFunction) { - // Do not log errors from poppler to stderr - poppler::set_debug_error_function( - []([[maybe_unused]] const String& s, [[maybe_unused]] void* p) -> void {}, - nullptr - ); - hasResetErrorFunction = true; - } + const char* pdfToText(const char* data, size_t dataSize) noexcept { + std::call_once(errorFnFlag, initErrorFunction); - const Document* doc = Document::load_from_raw_data(data, dataSize); + UniquePtr doc(Document::load_from_raw_data(data, dataSize)); if (!doc) { return nullptr; } - const int pageCount = doc->pages(); + const int pageCount = doc->pages(); String result; + for (int i = 0; i < pageCount; ++i) { - ByteArray pageText = doc->create_page(i)->text().to_utf8(); + UniquePtr page(doc->create_page(i)); + if (!page) { + continue; // skip invalid pages + } + ByteArray pageText = page->text().to_utf8(); result.append(pageText.begin(), pageText.end()); } - char* buffer = (char*)std::malloc(result.length() + 1); + char* buffer = static_cast(std::malloc(result.length() + 1)); + if (!buffer) { + return nullptr; + } std::memcpy(buffer, result.data(), result.length()); buffer[result.length()] = '\0'; diff --git a/flow/api/parse/pdf/pdftotext.go b/flow/api/parse/pdf/pdftotext.go index d5cc701e2..9e7ad8db7 100644 --- a/flow/api/parse/pdf/pdftotext.go +++ b/flow/api/parse/pdf/pdftotext.go @@ -3,10 +3,11 @@ package pdf // #cgo CFLAGS: -O2 -Wall -I/usr/include/poppler/cpp // #cgo LDFLAGS: -lpoppler-cpp // #include -// const char *pdfToText(const char* data, int data_size); +// const char* pdfToText(const char* data, size_t data_size); import "C" import ( "errors" + "runtime" "unsafe" ) @@ -16,8 +17,13 @@ func ToText(data []byte) (string, error) { // *provided* that C code does not attempt to find the end of the string, // as []byte need not be zero-terminated. // This is true for us, as C.pdfToText treats its first argument as bytes. + if len(data) == 0 { + return "", errors.New("empty PDF data") + } + cData := (*C.char)(unsafe.Pointer(&data[0])) - result := C.pdfToText(cData, C.int(len(data))) + result := C.pdfToText(cData, C.size_t(len(data))) + runtime.KeepAlive(data) if result != nil { converted := C.GoString(result) C.free(unsafe.Pointer(result))