Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/pybind/docling_parser_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <pybind/docling_resources.h>

#include <v2.h>
#include <mutex>

namespace docling
{
Expand Down Expand Up @@ -81,6 +82,8 @@ namespace docling
std::map<std::string, decoder_ptr_type> key2doc;
};



docling_parser_v2::docling_parser_v2():
docling_resources(),
pdf_resources_dir(resource_utils::get_resources_v2_dir(true).string()),
Expand All @@ -94,6 +97,7 @@ namespace docling
data[RESOURCE_DIR_KEY] = pdf_resources_dir;

std::map<std::string, double> timings = {};
// Eagerly initialize font resources at parser construction to enable parallel document loading
pdflib::pdf_resource<pdflib::PAGE_FONT>::initialise(data, timings);
}

Expand All @@ -112,6 +116,7 @@ namespace docling
data[RESOURCE_DIR_KEY] = pdf_resources_dir;

std::map<std::string, double> timings = {};
// Eagerly initialize font resources at parser construction to enable parallel document loading
pdflib::pdf_resource<pdflib::PAGE_FONT>::initialise(data, timings);
}

Expand Down Expand Up @@ -334,6 +339,11 @@ namespace docling
}

auto& decoder = itr->second;

// Lock this specific document to prevent concurrent access to same document
// while allowing different documents to be processed in parallel
auto lock = decoder->get_lock();

decoder->decode_document(page_boundary, do_sanitization);

LOG_S(INFO) << "decoding done for key: " << key;
Expand Down Expand Up @@ -362,6 +372,10 @@ namespace docling

auto& decoder = itr->second;

// Lock this specific document to prevent concurrent access to same document
// while allowing different documents to be processed in parallel
auto lock = decoder->get_lock();

std::vector<int> pages = {page};
decoder->decode_document(pages, page_boundary, do_sanitization);

Expand Down
8 changes: 8 additions & 0 deletions src/v2/pdf_decoders/document.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#define PDF_DOCUMENT_DECODER_H

#include <qpdf/QPDF.hh>
#include <mutex>
//#include <qpdf/QPDFPageObjectHelper.hh>

namespace pdflib
Expand Down Expand Up @@ -33,6 +34,9 @@ namespace pdflib
void decode_document(std::string page_boundary, bool do_sanitization);

void decode_document(std::vector<int>& page_numbers, std::string page_boundary, bool do_sanitization);

// Thread-safe document access.
// Locks this document's mutex and returns the RAII guard that holds it; the
// mutex is released when the returned guard is destroyed. Bind the result to
// a named local (e.g. `auto lock = decoder->get_lock();`) so it lives for the
// whole critical section: a discarded temporary would unlock again at once,
// which is why the result is marked [[nodiscard]].
// std::lock_guard is neither copyable nor movable, so returning it by value
// relies on C++17 guaranteed copy elision.
[[nodiscard]] std::lock_guard<std::mutex> get_lock() const { return std::lock_guard<std::mutex>(document_mutex); }

private:

Expand All @@ -57,6 +61,10 @@ namespace pdflib
//nlohmann::json json_toc; // table-of-contents
nlohmann::json json_annots;
nlohmann::json json_document;

// Per-document mutex to prevent concurrent access to same document
// while allowing different documents to be processed in parallel.
// Declared mutable so that the const member get_lock() can lock it.
mutable std::mutex document_mutex;
};

pdf_decoder<DOCUMENT>::pdf_decoder():
Expand Down
129 changes: 76 additions & 53 deletions src/v2/pdf_resources/page_font.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
#ifndef PDF_PAGE_FONT_RESOURCE_H
#define PDF_PAGE_FONT_RESOURCE_H

#include <mutex>
#include <atomic>
#include <thread>

namespace pdflib
{

Expand Down Expand Up @@ -92,6 +96,9 @@ namespace pdflib
static font_cids cids;
static font_encodings encodings;
static base_fonts bfonts;

// Thread-safety for font cache initialization: initialise() reads this flag
// to skip the loading work when it has already been done, and stores true
// once glyphs, cids, encodings and bfonts have all been initialised.
static std::atomic<bool> initialized;

private:

Expand Down Expand Up @@ -148,6 +155,9 @@ namespace pdflib
font_cids pdf_resource<PAGE_FONT>::cids = font_cids();
font_encodings pdf_resource<PAGE_FONT>::encodings = font_encodings();
base_fonts pdf_resource<PAGE_FONT>::bfonts = base_fonts();

// Thread-safety initialization: the flag starts out false, i.e. the shared
// font resources have not been loaded yet; initialise() stores true on success.
std::atomic<bool> pdf_resource<PAGE_FONT>::initialized(false);

// Default constructor: performs no explicit work. The glyph, cid, encoding
// and base-font tables are static members and are filled by initialise().
pdf_resource<PAGE_FONT>::pdf_resource() = default;
Expand All @@ -167,70 +177,83 @@ namespace pdflib
void pdf_resource<PAGE_FONT>::initialise(nlohmann::json data,
std::map<std::string, double>& timings)
{
LOG_S(INFO) << __FUNCTION__ << ": " << data.dump(2);

std::string PDFS_RESOURCES_DIR = "../docling_parse/pdf_resources_v2/";
LOG_S(INFO) << "default pdf-resource-dir: " << PDFS_RESOURCES_DIR;

//if(data.count(RESOURCE_DIR_KEY)==0)
//{
//LOG_S(WARNING) << "resource-dir-key is missing '" << RESOURCE_DIR_KEY << "' in data: \n" << data.dump(2);
//}

//std::string pdf_resources_dir = data.value("pdf-resource-directory", PDFS_RESOURCES_DIR);
std::string pdf_resources_dir = data.value(RESOURCE_DIR_KEY, PDFS_RESOURCES_DIR);
pdf_resources_dir += (pdf_resources_dir.back()=='/'? "" : "/");

std::string glyphs_dir, cids_dir, encodings_dir, bfonts_dir;

if(utils::filesystem::is_dir(pdf_resources_dir))
{
LOG_S(INFO) << "pdf_resources_dir: " << pdf_resources_dir;
// Eager initialization - always initialize if not already done
// This ensures resources are available for parallel document loading
if (!initialized.load()) {
// Use a static mutex to ensure thread-safe initialization
static std::mutex init_mutex;
std::lock_guard<std::mutex> lock(init_mutex);

// Double-check pattern to avoid unnecessary initialization
if (!initialized.load()) {
LOG_S(INFO) << __FUNCTION__ << ": " << data.dump(2);

std::string PDFS_RESOURCES_DIR = "../docling_parse/pdf_resources_v2/";
LOG_S(INFO) << "default pdf-resource-dir: " << PDFS_RESOURCES_DIR;

//if(data.count(RESOURCE_DIR_KEY)==0)
//{
//LOG_S(WARNING) << "resource-dir-key is missing '" << RESOURCE_DIR_KEY << "' in data: \n" << data.dump(2);
//}

//std::string pdf_resources_dir = data.value("pdf-resource-directory", PDFS_RESOURCES_DIR);
std::string pdf_resources_dir = data.value(RESOURCE_DIR_KEY, PDFS_RESOURCES_DIR);
pdf_resources_dir += (pdf_resources_dir.back()=='/'? "" : "/");

std::string glyphs_dir, cids_dir, encodings_dir, bfonts_dir;

if(utils::filesystem::is_dir(pdf_resources_dir))
{
LOG_S(INFO) << "pdf_resources_dir: " << pdf_resources_dir;

glyphs_dir = pdf_resources_dir+"glyphs/";
cids_dir = pdf_resources_dir+"cmap-resources/";
encodings_dir = pdf_resources_dir+"encodings/";
bfonts_dir = pdf_resources_dir+"fonts/";
}
else
glyphs_dir = pdf_resources_dir+"glyphs/";
cids_dir = pdf_resources_dir+"cmap-resources/";
encodings_dir = pdf_resources_dir+"encodings/";
bfonts_dir = pdf_resources_dir+"fonts/";
}
else
{
std::string message = "no existing pdf_resources_dir: " + pdf_resources_dir;
LOG_S(ERROR) << message;
throw std::logic_error(message);
}

utils::timer timer;

{
std::string message = "no existing pdf_resources_dir: " + pdf_resources_dir;
LOG_S(ERROR) << message;
throw std::logic_error(message);
}

utils::timer timer;

{
timer.reset();
timer.reset();

glyphs.initialise(glyphs_dir);
glyphs.initialise(glyphs_dir);

timings["init-glyphs"] = timer.get_time();
}
timings["init-glyphs"] = timer.get_time();
}

{
timer.reset();

cids.initialise(cids_dir);

timings["init-cids"] = timer.get_time();
}
{
timer.reset();
cids.initialise(cids_dir);
timings["init-cids"] = timer.get_time();
}

{
timer.reset();
{
timer.reset();

encodings.initialise(encodings_dir, glyphs);
encodings.initialise(encodings_dir, glyphs);

timings["init-encodings"] = timer.get_time();
}
timings["init-encodings"] = timer.get_time();
}

{
timer.reset();
{
timer.reset();

bfonts.initialise(bfonts_dir, glyphs);
bfonts.initialise(bfonts_dir, glyphs);

timings["init-bfonts"] = timer.get_time();
timings["init-bfonts"] = timer.get_time();
}

initialized.store(true, std::memory_order_release);
}
}
}

Expand Down
26 changes: 21 additions & 5 deletions src/v2/pdf_resources/page_font/base_fonts.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include <set>
#include <map>
#include <mutex>
#include <atomic>

//#include <filesystem>

Expand Down Expand Up @@ -43,15 +45,19 @@ namespace pdflib

private:

bool initialized;
static std::atomic<bool> initialized;
static std::mutex init_mutex;

std::set<std::string> core_14_fonts;

std::map<std::string, base_font_type> name_to_basefont;
};

base_fonts::base_fonts():
initialized(false)
// Static member definitions
std::atomic<bool> base_fonts::initialized(false);
std::mutex base_fonts::init_mutex;

base_fonts::base_fonts()
{}

base_fonts::~base_fonts()
Expand Down Expand Up @@ -160,7 +166,17 @@ namespace pdflib
template<typename glyphs_type>
void base_fonts::initialise(std::string dirname, glyphs_type& glyphs)
{
if(initialized)
// Use double-checked locking pattern for thread-safe initialization
if(initialized.load(std::memory_order_acquire))
{
LOG_S(WARNING) << "skipping base_fonts::initialise, already initialized ...";
return;
}

std::lock_guard<std::mutex> lock(init_mutex);

// Check again after acquiring lock
if(initialized.load(std::memory_order_acquire))
{
LOG_S(WARNING) << "skipping base_fonts::initialise, already initialized ...";
return;
Expand Down Expand Up @@ -226,7 +242,7 @@ namespace pdflib
}
}

initialized = true;
initialized.store(true, std::memory_order_release);
}

std::string base_fonts::read_fontname(std::string filename)
Expand Down
27 changes: 22 additions & 5 deletions src/v2/pdf_resources/page_font/encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
#ifndef PDF_PAGE_FONT_ENCODINGS_H
#define PDF_PAGE_FONT_ENCODINGS_H

#include <mutex>
#include <atomic>

namespace pdflib
{

Expand All @@ -21,13 +24,17 @@ namespace pdflib

private:

bool initialized;
static std::atomic<bool> initialized;
static std::mutex init_mutex;

std::map<font_encoding_name, font_encoding> name_to_encoding;
};

font_encodings::font_encodings():
initialized(false)
// Static member definitions
std::atomic<bool> font_encodings::initialized(false);
std::mutex font_encodings::init_mutex;

font_encodings::font_encodings()
{}

font_encodings::~font_encodings()
Expand All @@ -41,7 +48,17 @@ namespace pdflib
template<typename glyphs_type>
void font_encodings::initialise(std::string dirname, glyphs_type& glyphs)
{
if(initialized)
// Use double-checked locking pattern for thread-safe initialization
if(initialized.load(std::memory_order_acquire))
{
LOG_S(WARNING) << "skipping font_encodings::initialise, already initialized ...";
return;
}

std::lock_guard<std::mutex> lock(init_mutex);

// Check again after acquiring lock
if(initialized.load(std::memory_order_acquire))
{
LOG_S(WARNING) << "skipping font_encodings::initialise, already initialized ...";
return;
Expand All @@ -60,7 +77,7 @@ namespace pdflib
encoding.initialise(item.first, dirname+"/"+item.second, glyphs);
}

initialized = true;
initialized.store(true, std::memory_order_release);
}

}
Expand Down
Loading
Loading