Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 55 additions & 33 deletions flow/api/parse/pdf/pdftotext.cc
Original file line number Diff line number Diff line change
@@ -1,40 +1,62 @@
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <mutex>
#include <string>

#include "poppler/cpp/poppler-document.h"
#include "poppler/cpp/poppler-page.h"
#include <poppler/cpp/poppler-document.h>
#include <poppler/cpp/poppler-page.h>

// No-op callback handed to poppler::set_debug_error_function so that
// poppler's error messages are discarded instead of being logged.
void donothing(const std::string & /*unused*/, void * /*unused*/)
{
}
// File-local shorthand for the standard-library names used below.
using std::size_t;
using OnceFlag = std::once_flag;
using String = std::string;
// Owning pointer alias; Del defaults to std::default_delete<T>.
template <typename T, typename Del = std::default_delete<T>>
using UniquePtr = std::unique_ptr<T, Del>;

extern "C" {
const char *pdftotext(const char *data, int data_size)
{
static bool has_reset_error_function = false;
if (!has_reset_error_function) {
using ByteArray = poppler::byte_array;
using Document = poppler::document;
using Page = poppler::page;

namespace {
OnceFlag errorFnFlag;

void initErrorFunction() {
// Do not log errors from poppler to stderr
poppler::set_debug_error_function(donothing, nullptr);
has_reset_error_function = true;
}

const auto *doc = poppler::document::load_from_raw_data(data, data_size);
if (doc == nullptr) {
return nullptr;
}
const int N = doc->pages();

std::vector<char> contents[N];
int text_length = 0;
for (int i = 0; i < N; ++i) {
contents[i] = doc->create_page(i)->text().to_utf8();
text_length += contents[i].size();
}

char *buffer = (char *)std::malloc(text_length + 1);
for (int i = 0, offset = 0; i < N; offset += contents[i].size(), ++i) {
std::memcpy(buffer + offset, contents[i].data(), contents[i].size());
}
buffer[text_length] = '\0';

return buffer;
}
poppler::set_debug_error_function(
[]([[maybe_unused]] const String& s, [[maybe_unused]] void* p) -> void {},
nullptr
);
}
}

extern "C" {
[[nodiscard]]
const char* pdfToText(const char* data, size_t dataSize) noexcept {
std::call_once(errorFnFlag, initErrorFunction);

UniquePtr<Document> doc(Document::load_from_raw_data(data, dataSize));
if (!doc) {
return nullptr;
}

const int pageCount = doc->pages();
String result;

for (int i = 0; i < pageCount; ++i) {
UniquePtr<Page> page(doc->create_page(i));
if (!page) {
continue; // skip invalid pages
}
ByteArray pageText = page->text().to_utf8();
result.append(pageText.begin(), pageText.end());
}

char* buffer = static_cast<char*>(std::malloc(result.length() + 1));
if (!buffer) {
return nullptr;
}
std::memcpy(buffer, result.data(), result.length());
buffer[result.length()] = '\0';

return buffer;
}
}
14 changes: 10 additions & 4 deletions flow/api/parse/pdf/pdftotext.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,27 @@ package pdf
// #cgo CFLAGS: -O2 -Wall -I/usr/include/poppler/cpp
// #cgo LDFLAGS: -lpoppler-cpp
// #include <stdlib.h>
// const char *pdftotext(const char *data, int data_size);
// const char* pdfToText(const char* data, size_t data_size);
import "C"
import (
"errors"
"runtime"
"unsafe"
)

func ToText(data []byte) (string, error) {
// Is this safe? Kind of: `data`, a []byte, is a contiguous array in Go,
// so we can safely point a C-land (const char *) to it,
// so we can safely point a C-land (const char*) to it,
// *provided* that C code does not attempt to find the end of the string,
// as []byte need not be zero-terminated.
// This is true for us, as C.pdftotext treats its first argument as bytes.
// This is true for us, as C.pdfToText treats its first argument as bytes.
if len(data) == 0 {
return "", errors.New("empty PDF data")
}

cData := (*C.char)(unsafe.Pointer(&data[0]))
result := C.pdftotext(cData, C.int(len(data)))
result := C.pdfToText(cData, C.size_t(len(data)))
runtime.KeepAlive(data)
if result != nil {
converted := C.GoString(result)
C.free(unsafe.Pointer(result))
Expand Down