# Patch summary (review notes; free text ahead of the first "diff --git" header).
#
# Files touched: flow/api/parse/pdf/pdftotext.cc and flow/api/parse/pdf/pdftotext.go.
#
# pdftotext.cc
#   * The exported C symbol changes from pdftotext(const char*, int) to
#     pdfToText(const char*, size_t) and is now marked [[nodiscard]] and noexcept.
#   * The "static bool has_reset_error_function" guard is replaced by
#     std::call_once on a file-local OnceFlag; the donothing() callback is replaced
#     by an empty lambda handed to poppler::set_debug_error_function.
#   * The document and each page are now held in UniquePtr; the removed code
#     never deleted the pointers returned by load_from_raw_data / create_page.
#   * A page for which create_page() yields a null pointer is skipped.
#   * Page text is appended to a single String, then copied into a malloc'd,
#     NUL-terminated buffer. nullptr is returned when the document fails to load
#     or when malloc fails (the removed code did not check malloc's result).
#   * The returned buffer is released by the caller: the Go side calls C.free on it.
#
# pdftotext.go
#   * The cgo declaration and the call site follow the rename, and the length is
#     passed as C.size_t(len(data)) instead of C.int(len(data)).
#   * Empty input is rejected with errors.New("empty PDF data") before &data[0]
#     is taken.
#   * runtime.KeepAlive(data) is added right after the C call.
#
# NOTE(review): pdfToText is noexcept, but result.append(...) and to_utf8()
#   allocate. If one of them throws, the process terminates rather than returning
#   nullptr. Consider a try/catch inside the function that returns nullptr.
# NOTE(review): dataSize is a size_t handed straight to load_from_raw_data; the
#   removed code passed an int there. Presumably the poppler parameter is an int,
#   so very large inputs would narrow silently. Confirm against the installed
#   poppler-document.h and add a range check if so.
# NOTE(review): the context comment in pdftotext.go spells "continguous"; fix it
#   in a follow-up change, since context lines must match the target file.
# NOTE(review): in this copy of the patch, text inside angle brackets (include
#   targets, template arguments, the static_cast target type) appears to have been
#   stripped and line breaks collapsed. The patch text below is reproduced unchanged.
#
diff --git a/flow/api/parse/pdf/pdftotext.cc b/flow/api/parse/pdf/pdftotext.cc index 03ed1430e..fa52a7dee 100644 --- a/flow/api/parse/pdf/pdftotext.cc +++ b/flow/api/parse/pdf/pdftotext.cc @@ -1,40 +1,62 @@ #include #include +#include +#include -#include "poppler/cpp/poppler-document.h" -#include "poppler/cpp/poppler-page.h" +#include +#include -void donothing(const std::string &, void *) {} +using std::size_t; +using OnceFlag = std::once_flag; +using String = std::string; +template > +using UniquePtr = std::unique_ptr; -extern "C" { - const char *pdftotext(const char *data, int data_size) - { - static bool has_reset_error_function = false; - if (!has_reset_error_function) { +using ByteArray = poppler::byte_array; +using Document = poppler::document; +using Page = poppler::page; + +namespace { + OnceFlag errorFnFlag; + + void initErrorFunction() { // Do not log errors from poppler to stderr - poppler::set_debug_error_function(donothing, nullptr); - has_reset_error_function = true; - } - - const auto *doc = poppler::document::load_from_raw_data(data, data_size); - if (doc == nullptr) { - return nullptr; - } - const int N = doc->pages(); - - std::vector contents[N]; - int text_length = 0; - for (int i = 0; i < N; ++i) { - contents[i] = doc->create_page(i)->text().to_utf8(); - text_length += contents[i].size(); - } - - char *buffer = (char *)std::malloc(text_length + 1); - for (int i = 0, offset = 0; i < N; offset += contents[i].size(), ++i) { - std::memcpy(buffer + offset, contents[i].data(), contents[i].size()); - } - buffer[text_length] = '\0'; - - return buffer; - } + poppler::set_debug_error_function( + []([[maybe_unused]] const String& s, [[maybe_unused]] void* p) -> void {}, + nullptr + ); + } +} + +extern "C" { + [[nodiscard]] + const char* pdfToText(const char* data, size_t dataSize) noexcept { + std::call_once(errorFnFlag, initErrorFunction); + + UniquePtr doc(Document::load_from_raw_data(data, dataSize)); + if (!doc) { + return nullptr; + } + + const int 
pageCount = doc->pages(); + String result; + + for (int i = 0; i < pageCount; ++i) { + UniquePtr page(doc->create_page(i)); + if (!page) { + continue; // skip invalid pages + } + ByteArray pageText = page->text().to_utf8(); + result.append(pageText.begin(), pageText.end()); + } + + char* buffer = static_cast(std::malloc(result.length() + 1)); + if (!buffer) { + return nullptr; + } + std::memcpy(buffer, result.data(), result.length()); + buffer[result.length()] = '\0'; + + return buffer; + } } diff --git a/flow/api/parse/pdf/pdftotext.go b/flow/api/parse/pdf/pdftotext.go index 944935d2a..9e7ad8db7 100644 --- a/flow/api/parse/pdf/pdftotext.go +++ b/flow/api/parse/pdf/pdftotext.go @@ -3,21 +3,27 @@ package pdf // #cgo CFLAGS: -O2 -Wall -I/usr/include/poppler/cpp // #cgo LDFLAGS: -lpoppler-cpp // #include -// const char *pdftotext(const char *data, int data_size); +// const char* pdfToText(const char* data, size_t data_size); import "C" import ( "errors" + "runtime" "unsafe" ) func ToText(data []byte) (string, error) { // Is this safe? Kind of: `data`, a []byte is a continguous array in Go, - // so we can safely point a C-land (const char *) to it, + // so we can safely point a C-land (const char*) to it, // *provided* that C code does not attempt to find the end of the string, // as []byte need not be zero-terminated. - // This is true for us, as C.pdftotext treats its first argument as bytes. + // This is true for us, as C.pdfToText treats its first argument as bytes. + if len(data) == 0 { + return "", errors.New("empty PDF data") + } + cData := (*C.char)(unsafe.Pointer(&data[0])) - result := C.pdftotext(cData, C.int(len(data))) + result := C.pdfToText(cData, C.size_t(len(data))) + runtime.KeepAlive(data) if result != nil { converted := C.GoString(result) C.free(unsafe.Pointer(result))