From cd1f310958a15063f2ea9c2ada2d0cd93137be8e Mon Sep 17 00:00:00 2001 From: Craig McNamara Date: Wed, 25 Mar 2026 15:22:30 -0700 Subject: [PATCH 1/4] Add tagged PDF infrastructure for Section 508 accessibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds marked content operators (BMC/BDC/EMC) and structure tree support to pdf-core, enabling accessible/tagged PDF generation. This is the foundation layer that Prawn and Prawn::Table will build on. New modules: - PDF::Core::MarkedContent — emits BMC/BDC/EMC operators in content streams - PDF::Core::StructureTree — manages StructTreeRoot, structure elements, ParentTree, and MCID allocation Modified: - ObjectStore: accepts marked: true, sets /MarkInfo on Catalog - DocumentState: threads marked option through to ObjectStore - Renderer: includes MarkedContent, creates StructureTree when marked, registers before_render callback for structure tree finalization Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/pdf/core.rb | 2 + lib/pdf/core/document_state.rb | 14 +- lib/pdf/core/marked_content.rb | 60 ++++++ lib/pdf/core/object_store.rb | 12 ++ lib/pdf/core/renderer.rb | 20 ++ lib/pdf/core/structure_tree.rb | 272 +++++++++++++++++++++++++++ spec/pdf/core/marked_content_spec.rb | 68 +++++++ spec/pdf/core/structure_tree_spec.rb | 235 +++++++++++++++++++++++ 8 files changed, 674 insertions(+), 9 deletions(-) create mode 100644 lib/pdf/core/marked_content.rb create mode 100644 lib/pdf/core/structure_tree.rb create mode 100644 spec/pdf/core/marked_content_spec.rb create mode 100644 spec/pdf/core/structure_tree_spec.rb diff --git a/lib/pdf/core.rb b/lib/pdf/core.rb index 87eaf7c..93ec683 100644 --- a/lib/pdf/core.rb +++ b/lib/pdf/core.rb @@ -43,5 +43,7 @@ class InvalidPageLayout < StandardError require_relative 'core/page_geometry' require_relative 'core/outline_root' require_relative 'core/outline_item' +require_relative 'core/marked_content' +require_relative 
'core/structure_tree' require_relative 'core/renderer' require_relative 'core/text' diff --git a/lib/pdf/core/document_state.rb b/lib/pdf/core/document_state.rb index 03d140d..3984dd5 100644 --- a/lib/pdf/core/document_state.rb +++ b/lib/pdf/core/document_state.rb @@ -20,15 +20,11 @@ class DocumentState def initialize(options) normalize_metadata(options) - @store = - if options[:print_scaling] - PDF::Core::ObjectStore.new( - info: options[:info], - print_scaling: options[:print_scaling], - ) - else - PDF::Core::ObjectStore.new(info: options[:info]) - end + store_opts = { info: options[:info] } + store_opts[:print_scaling] = options[:print_scaling] if options[:print_scaling] + store_opts[:marked] = options[:marked] if options[:marked] + + @store = PDF::Core::ObjectStore.new(store_opts) @version = 1.3 @pages = [] diff --git a/lib/pdf/core/marked_content.rb b/lib/pdf/core/marked_content.rb new file mode 100644 index 0000000..8b48773 --- /dev/null +++ b/lib/pdf/core/marked_content.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +module PDF + module Core + # Provides methods for emitting marked content operators (BMC/BDC/EMC) + # in PDF content streams. These operators associate content with structure + # elements for accessibility (tagged PDF). + # + # @api private + module MarkedContent + # Begin a marked content sequence with no properties. + # + # @param tag [Symbol] structure type tag (e.g., :P, :Span, :Artifact) + # @return [void] + def begin_marked_content(tag) + add_content("/#{tag} BMC") + end + + # Begin a marked content sequence with properties (BDC operator). + # + # @param tag [Symbol] structure type tag + # @param properties [Hash] properties dict (typically includes :MCID) + # @return [void] + def begin_marked_content_with_properties(tag, properties = {}) + props = PDF::Core.pdf_object(properties, true) + add_content("/#{tag} #{props} BDC") + end + + # End a marked content sequence. 
+ # + # @return [void] + def end_marked_content + add_content('EMC') + end + + # Wrap a block in a marked content sequence (BMC/EMC). + # + # @param tag [Symbol] structure type tag + # @yield content to wrap + # @return [void] + def marked_content_sequence(tag) + begin_marked_content(tag) + yield if block_given? + end_marked_content + end + + # Wrap a block in a marked content sequence with properties (BDC/EMC). + # + # @param tag [Symbol] structure type tag + # @param properties [Hash] properties dict + # @yield content to wrap + # @return [void] + def marked_content_sequence_with_properties(tag, properties = {}) + begin_marked_content_with_properties(tag, properties) + yield if block_given? + end_marked_content + end + end + end +end diff --git a/lib/pdf/core/object_store.rb b/lib/pdf/core/object_store.rb index 430ae40..40dc9c1 100644 --- a/lib/pdf/core/object_store.rb +++ b/lib/pdf/core/object_store.rb @@ -16,6 +16,8 @@ class ObjectStore # @option opts :info [Hash] Document info dict # @option opts :print_scaling [:none, nil] (nil) Print scaling viewer # option + # @option opts :marked [Boolean] (false) Whether this is a tagged + # (accessible) PDF def initialize(opts = {}) @objects = {} @identifiers = [] @@ -25,11 +27,21 @@ def initialize(opts = {}) if opts[:print_scaling] == :none root.data[:ViewerPreferences] = { PrintScaling: :None } end + if opts[:marked] + root.data[:MarkInfo] = { Marked: true } + end if pages.nil? root.data[:Pages] = ref(Type: :Pages, Count: 0, Kids: []) end end + # Whether this document is marked (tagged for accessibility). + # + # @return [Boolean] + def marked? + root.data.key?(:MarkInfo) && root.data[:MarkInfo][:Marked] == true + end + # Wrap an object into a reference. 
# # @param data [Hash, Array, Numeric, String, Symbol, Date, Time, nil] diff --git a/lib/pdf/core/renderer.rb b/lib/pdf/core/renderer.rb index 7a39a3b..62683d3 100644 --- a/lib/pdf/core/renderer.rb +++ b/lib/pdf/core/renderer.rb @@ -6,6 +6,8 @@ module PDF module Core # Document renderer serializes document into its binary representation. class Renderer + include PDF::Core::MarkedContent + # @param state [PDF::Core::DocumentState] def initialize(state) @state = state @@ -14,6 +16,24 @@ def initialize(state) min_version(state.store.min_version) if state.store.min_version @page_number = 0 + + if state.store.marked? + @structure_tree = PDF::Core::StructureTree.new(self) + before_render { |_doc_state| @structure_tree.finalize! } + min_version(1.7) + end + end + + # The structure tree for this document, if tagged. + # + # @return [PDF::Core::StructureTree, nil] + attr_reader :structure_tree + + # Whether this document is marked (tagged for accessibility). + # + # @return [Boolean] + def marked? + state.store.marked? end # Document state diff --git a/lib/pdf/core/structure_tree.rb b/lib/pdf/core/structure_tree.rb new file mode 100644 index 0000000..ceac80c --- /dev/null +++ b/lib/pdf/core/structure_tree.rb @@ -0,0 +1,272 @@ +# frozen_string_literal: true + +module PDF + module Core + # Manages the PDF structure tree for tagged/accessible PDFs. + # + # The structure tree provides the logical structure of a document, + # mapping marked content sequences in page content streams to + # structure elements (headings, paragraphs, tables, etc.). 
+ # + # PDF spec references: Section 14.7 (Logical Structure) + # + # @api private + class StructureTree + # @return [PDF::Core::Renderer] owning renderer + attr_reader :renderer + + # @return [PDF::Core::Reference] StructTreeRoot indirect object + attr_reader :root_ref + + # @return [PDF::Core::Reference] Document-level structure element + attr_reader :document_elem_ref + + # @return [Array<PDF::Core::Reference>] all structure elements created + attr_reader :elements + + # @return [Hash{Integer => Array}] page StructParents index => array of + # structure element refs for marked content on that page + attr_reader :parent_tree_map + + # @return [Integer] unused (always 0); MCIDs come from #allocate_mcid + attr_reader :next_mcid + + # @return [Array<PDF::Core::Reference>] stack of open structure elements + attr_reader :element_stack + + # @param renderer [PDF::Core::Renderer] + def initialize(renderer) + @renderer = renderer + @elements = [] + @parent_tree_map = {} + @next_mcid = 0 + @element_stack = [] + @page_mcid_map = {} # page_ref_id => next mcid for that page + @root_ref = nil + @document_elem_ref = nil + end + + # Allocate the next MCID for the current page and track it. + # + # @return [Integer] the allocated MCID + def allocate_mcid + page = renderer.state.page + page_id = page.dictionary.identifier + + @page_mcid_map[page_id] ||= 0 + mcid = @page_mcid_map[page_id] + @page_mcid_map[page_id] += 1 + + mcid + end + + # Add a structure element as a child of the current open element + # (or the document element if none is open). 
+ # + # @param tag [Symbol] structure type (e.g., :P, :H1, :Table, :TD) + # @param attributes [Hash] additional attributes for the structure element + # @option attributes [String] :Alt alternative text (for Figure, Formula) + # @option attributes [String] :Lang language tag + # @option attributes [Symbol] :Scope TH scope (:Column, :Row, :Both) + # @return [PDF::Core::Reference] the structure element reference + def add_element(tag, attributes = {}) + parent_ref = current_element || document_element + + elem_data = { + Type: :StructElem, + S: tag, + P: parent_ref, + K: [], + } + + elem_data[:Alt] = attributes[:Alt] if attributes[:Alt] + elem_data[:Lang] = attributes[:Lang] if attributes[:Lang] + + if attributes[:Scope] + elem_data[:A] = { + O: :Table, + Scope: attributes[:Scope], + } + end + + elem_ref = renderer.ref!(elem_data) + @elements << elem_ref + + # Add as child of parent + parent_data = renderer.deref(parent_ref) + parent_data[:K] << elem_ref + + elem_ref + end + + # Begin a structure element scope. Content rendered inside will be + # children of this element. + # + # @param tag [Symbol] structure type + # @param attributes [Hash] additional attributes + # @return [PDF::Core::Reference] the opened structure element + def begin_element(tag, attributes = {}) + elem_ref = add_element(tag, attributes) + @element_stack.push(elem_ref) + elem_ref + end + + # End the current structure element scope. + # + # @return [PDF::Core::Reference, nil] the closed element, nil if none open + def end_element + @element_stack.pop + end + + # Add marked content to the current structure element. + # This allocates an MCID, records the mapping, and emits BDC/EMC + # operators around the yielded block. + # + # @param tag [Symbol] marked content tag (e.g., :P, :Span) + # @param struct_elem_ref [PDF::Core::Reference, nil] the structure element + # this content belongs to. Defaults to current_element, then Document. 
+ # @yield content to render inside the marked content sequence + # @return [void] + def mark_content(tag, struct_elem_ref: nil) + elem_ref = struct_elem_ref || current_element || document_element + mcid = allocate_mcid + page = renderer.state.page + page_ref = page.dictionary + + # Record in parent tree map + page_struct_parents = page_struct_parents_index(page_ref) + @parent_tree_map[page_struct_parents] ||= [] + + # Add marked content reference to the structure element's K array + mcr = { Type: :MCR, MCID: mcid, Pg: page_ref } + elem_data = renderer.deref(elem_ref) + elem_data[:K] << mcr + + # Track which struct element owns this MCID on this page + @parent_tree_map[page_struct_parents][mcid] = elem_ref + + # Emit BDC/EMC in content stream + renderer.begin_marked_content_with_properties(tag, { MCID: mcid }) + yield if block_given? + renderer.end_marked_content + end + + # Mark content as an artifact (decorative, not read by screen readers). + # + # @param artifact_type [Symbol, nil] optional artifact type + # (:Pagination, :Layout, :Page, :Background) + # @yield content to render as artifact + # @return [void] + def mark_artifact(artifact_type: nil) + if artifact_type + renderer.begin_marked_content_with_properties( + :Artifact, { Type: artifact_type } + ) + else + renderer.begin_marked_content(:Artifact) + end + yield if block_given? + renderer.end_marked_content + end + + # Finalize the structure tree before rendering. Called via + # before_render callback. + # + # Builds the StructTreeRoot, ParentTree, and wires everything + # into the Catalog. + # + # @return [void] + def finalize! + return if @elements.empty? && @parent_tree_map.empty? + + build_root + build_parent_tree + assign_struct_parents_to_pages + attach_to_catalog + end + + private + + # The current open structure element, or nil if none. + # + # @return [PDF::Core::Reference, nil] + def current_element + @element_stack.last + end + + # Get or create the Document-level structure element. 
+ # + # @return [PDF::Core::Reference] + def document_element + return @document_elem_ref if @document_elem_ref + + @document_elem_ref = renderer.ref!( + Type: :StructElem, + S: :Document, + P: nil, # will be set to StructTreeRoot in finalize! + K: [], + ) + @elements << @document_elem_ref + @document_elem_ref + end + + # Get or assign a StructParents index for a page. + # + # @param page_ref [PDF::Core::Reference] page dictionary reference + # @return [Integer] the StructParents index + def page_struct_parents_index(page_ref) + page_ref.data[:StructParents] ||= @parent_tree_map.size + page_ref.data[:StructParents] + end + + # Build the StructTreeRoot object. + # + # @return [void] + def build_root + @root_ref = renderer.ref!( + Type: :StructTreeRoot, + K: document_element, + ParentTree: nil, # set in build_parent_tree + ) + + # Point Document element's parent to the root + doc_data = renderer.deref(@document_elem_ref) + doc_data[:P] = @root_ref + end + + # Build the ParentTree (a number tree mapping StructParents indices + # to arrays of structure element references). + # + # @return [void] + def build_parent_tree + # ParentTree is a number tree. For simplicity, use a flat Nums array + # since most documents won't have enough pages to need a balanced tree. + nums = [] + @parent_tree_map.sort_by { |k, _| k }.each do |index, elem_array| + nums << index + nums << renderer.ref!(elem_array) + end + + parent_tree_ref = renderer.ref!(Type: :ParentTree, Nums: nums) + root_data = renderer.deref(@root_ref) + root_data[:ParentTree] = parent_tree_ref + end + + # Placeholder hook: performs no work. Pages with marked content get their + # StructParents entry lazily in page_struct_parents_index. + # + # @return [void] + def assign_struct_parents_to_pages + # Already assigned lazily when mark_content is called. + # This method exists as a hook for any additional finalization. + end + + # Wire the StructTreeRoot into the document Catalog. 
+ # + # @return [void] + def attach_to_catalog + renderer.state.store.root.data[:StructTreeRoot] = @root_ref + end + end + end +end diff --git a/spec/pdf/core/marked_content_spec.rb b/spec/pdf/core/marked_content_spec.rb new file mode 100644 index 0000000..ff80fba --- /dev/null +++ b/spec/pdf/core/marked_content_spec.rb @@ -0,0 +1,68 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe PDF::Core::MarkedContent do + subject(:renderer) do + PDF::Core::Renderer.new(PDF::Core::DocumentState.new({})) + end + + before do + renderer.start_new_page + end + + describe '#begin_marked_content' do + it 'emits BMC operator with tag' do + renderer.begin_marked_content(:P) + content = renderer.state.page.content.stream.filtered_stream + + expect(content).to include('/P BMC') + end + end + + describe '#end_marked_content' do + it 'emits EMC operator' do + renderer.end_marked_content + content = renderer.state.page.content.stream.filtered_stream + + expect(content).to include('EMC') + end + end + + describe '#begin_marked_content_with_properties' do + it 'emits BDC operator with tag and properties' do + renderer.begin_marked_content_with_properties(:P, { MCID: 0 }) + content = renderer.state.page.content.stream.filtered_stream + + expect(content).to include('/P << /MCID 0') + expect(content).to include('BDC') + end + end + + describe '#marked_content_sequence' do + it 'wraps content in BMC/EMC' do + renderer.marked_content_sequence(:Artifact) do + renderer.add_content('some content') + end + content = renderer.state.page.content.stream.filtered_stream + + expect(content).to include('/Artifact BMC') + expect(content).to include('some content') + expect(content).to include('EMC') + end + end + + describe '#marked_content_sequence_with_properties' do + it 'wraps content in BDC/EMC with properties' do + renderer.marked_content_sequence_with_properties(:P, { MCID: 0 }) do + renderer.add_content('tagged text') + end + content = 
renderer.state.page.content.stream.filtered_stream + + expect(content).to include('/P << /MCID 0') + expect(content).to include('BDC') + expect(content).to include('tagged text') + expect(content).to include('EMC') + end + end +end diff --git a/spec/pdf/core/structure_tree_spec.rb b/spec/pdf/core/structure_tree_spec.rb new file mode 100644 index 0000000..d5a9d7b --- /dev/null +++ b/spec/pdf/core/structure_tree_spec.rb @@ -0,0 +1,235 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe PDF::Core::StructureTree do + let(:state) { PDF::Core::DocumentState.new(marked: true) } + let(:renderer) { PDF::Core::Renderer.new(state) } + let(:structure_tree) { renderer.structure_tree } + + before do + renderer.start_new_page + end + + describe 'initialization' do + it 'creates a structure tree when marked: true' do + expect(structure_tree).to be_a(PDF::Core::StructureTree) + end + + it 'does not create a structure tree when marked is not set' do + plain_renderer = PDF::Core::Renderer.new( + PDF::Core::DocumentState.new({}), + ) + expect(plain_renderer.structure_tree).to be_nil + end + end + + describe '#marked?' 
do + it 'returns true for marked documents' do + expect(renderer.marked?).to be true + end + + it 'returns false for unmarked documents' do + plain_renderer = PDF::Core::Renderer.new( + PDF::Core::DocumentState.new({}), + ) + expect(plain_renderer.marked?).to be false + end + end + + describe '#allocate_mcid' do + it 'returns sequential MCIDs starting from 0' do + expect(structure_tree.allocate_mcid).to eq(0) + expect(structure_tree.allocate_mcid).to eq(1) + expect(structure_tree.allocate_mcid).to eq(2) + end + + it 'resets MCIDs for new pages' do + structure_tree.allocate_mcid # 0 on page 1 + structure_tree.allocate_mcid # 1 on page 1 + + renderer.start_new_page + + expect(structure_tree.allocate_mcid).to eq(0) # 0 on page 2 + end + end + + describe '#add_element' do + it 'creates a structure element reference' do + elem = structure_tree.add_element(:P) + + expect(elem).to be_a(PDF::Core::Reference) + expect(elem.data[:Type]).to eq(:StructElem) + expect(elem.data[:S]).to eq(:P) + end + + it 'adds element as child of Document element' do + elem = structure_tree.add_element(:P) + doc_elem = structure_tree.document_elem_ref + + expect(doc_elem.data[:K]).to include(elem) + end + + it 'supports Alt text attribute' do + elem = structure_tree.add_element(:Figure, Alt: 'A photo') + + expect(elem.data[:Alt]).to eq('A photo') + end + + it 'supports Scope attribute for table headers' do + elem = structure_tree.add_element(:TH, Scope: :Column) + + expect(elem.data[:A]).to eq({ O: :Table, Scope: :Column }) + end + end + + describe '#begin_element / #end_element' do + it 'manages an element stack' do + table_elem = structure_tree.begin_element(:Table) + row_elem = structure_tree.add_element(:TR) + + # TR should be child of Table, not Document + expect(table_elem.data[:K]).to include(row_elem) + + structure_tree.end_element + # After ending Table, next element goes to Document + p_elem = structure_tree.add_element(:P) + doc_elem = structure_tree.document_elem_ref + 
expect(doc_elem.data[:K]).to include(p_elem) + end + end + + describe '#mark_content' do + it 'emits BDC/EMC operators with MCID' do + structure_tree.begin_element(:P) + structure_tree.mark_content(:Span) do + renderer.add_content('Hello') + end + structure_tree.end_element + + content = renderer.state.page.content.stream.filtered_stream + expect(content).to include('/Span << /MCID 0') + expect(content).to include('BDC') + expect(content).to include('Hello') + expect(content).to include('EMC') + end + + it 'records MCR in the structure element K array' do + elem = structure_tree.begin_element(:P) + structure_tree.mark_content(:Span) do + renderer.add_content('text') + end + structure_tree.end_element + + mcr = elem.data[:K].find { |k| k.is_a?(Hash) && k[:Type] == :MCR } + expect(mcr).not_to be_nil + expect(mcr[:MCID]).to eq(0) + end + end + + describe '#mark_artifact' do + it 'emits BMC /Artifact' do + structure_tree.mark_artifact do + renderer.add_content('decorative') + end + + content = renderer.state.page.content.stream.filtered_stream + expect(content).to include('/Artifact BMC') + expect(content).to include('decorative') + expect(content).to include('EMC') + end + + it 'emits BDC with artifact type when specified' do + structure_tree.mark_artifact(artifact_type: :Pagination) do + renderer.add_content('page 1') + end + + content = renderer.state.page.content.stream.filtered_stream + expect(content).to include('/Artifact') + expect(content).to include('/Type /Pagination') + expect(content).to include('BDC') + end + end + + describe '#finalize!' do + it 'sets MarkInfo on catalog' do + root_data = renderer.state.store.root.data + expect(root_data[:MarkInfo]).to eq({ Marked: true }) + end + + it 'builds StructTreeRoot and attaches to catalog after render' do + structure_tree.begin_element(:P) + structure_tree.mark_content(:Span) do + renderer.add_content('text') + end + structure_tree.end_element + + # Render triggers before_render callback which calls finalize! 
+ output = renderer.render + + root_data = renderer.state.store.root.data + expect(root_data[:StructTreeRoot]).to be_a(PDF::Core::Reference) + expect(root_data[:StructTreeRoot].data[:Type]).to eq(:StructTreeRoot) + end + + it 'creates a Document structure element as root K' do + structure_tree.add_element(:P) + renderer.render + + struct_root = renderer.state.store.root.data[:StructTreeRoot] + doc_elem = struct_root.data[:K] + expect(doc_elem).to be_a(PDF::Core::Reference) + expect(doc_elem.data[:S]).to eq(:Document) + end + + it 'builds a ParentTree' do + structure_tree.begin_element(:P) + structure_tree.mark_content(:Span) do + renderer.add_content('text') + end + structure_tree.end_element + + renderer.render + + struct_root = renderer.state.store.root.data[:StructTreeRoot] + parent_tree = struct_root.data[:ParentTree] + expect(parent_tree).to be_a(PDF::Core::Reference) + expect(parent_tree.data[:Nums]).not_to be_empty + end + + it 'assigns StructParents to pages with marked content' do + structure_tree.begin_element(:P) + structure_tree.mark_content(:Span) do + renderer.add_content('text') + end + structure_tree.end_element + + renderer.render + + page_dict = renderer.state.pages.first.dictionary.data + expect(page_dict[:StructParents]).to be_a(Integer) + end + + it 'produces a valid PDF' do + structure_tree.begin_element(:H1) + structure_tree.mark_content(:Span) do + renderer.add_content('BT /F1 12 Tf (Heading) Tj ET') + end + structure_tree.end_element + + structure_tree.begin_element(:P) + structure_tree.mark_content(:Span) do + renderer.add_content('BT /F1 12 Tf (Body text) Tj ET') + end + structure_tree.end_element + + output = renderer.render + + expect(output).to start_with('%PDF-1.7') + expect(output).to include('/MarkInfo') + expect(output).to include('/Marked true') + expect(output).to include('/StructTreeRoot') + expect(output).to include('/StructElem') + end + end +end From 90b72b96bcd4e4c7b713664c5458b11c4e911fdc Mon Sep 17 00:00:00 2001 From: 
Craig McNamara Date: Wed, 25 Mar 2026 20:46:05 -0700 Subject: [PATCH 2/4] Add ActualText attribute support for structure elements Allows setting /ActualText on structure elements so screen readers announce replacement text instead of reading visual characters literally (e.g., "required" instead of "asterisk" for *, "selected" for X). Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/pdf/core/structure_tree.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/pdf/core/structure_tree.rb b/lib/pdf/core/structure_tree.rb index ceac80c..de97e9e 100644 --- a/lib/pdf/core/structure_tree.rb +++ b/lib/pdf/core/structure_tree.rb @@ -66,6 +66,8 @@ def allocate_mcid # @param tag [Symbol] structure type (e.g., :P, :H1, :Table, :TD) # @param attributes [Hash] additional attributes for the structure element # @option attributes [String] :Alt alternative text (for Figure, Formula) + # @option attributes [String] :ActualText replacement text for screen + # readers (e.g., "required" instead of reading "*") # @option attributes [String] :Lang language tag # @option attributes [Symbol] :Scope TH scope (:Column, :Row, :Both) # @return [PDF::Core::Reference] the structure element reference @@ -80,6 +82,7 @@ def add_element(tag, attributes = {}) } elem_data[:Alt] = attributes[:Alt] if attributes[:Alt] + elem_data[:ActualText] = attributes[:ActualText] if attributes[:ActualText] elem_data[:Lang] = attributes[:Lang] if attributes[:Lang] if attributes[:Scope] From 48fe6bafde6edc91b42865b47703da18a8a8ff0e Mon Sep 17 00:00:00 2001 From: Craig McNamara Date: Wed, 25 Mar 2026 20:48:23 -0700 Subject: [PATCH 3/4] Add tests for ActualText attribute on structure elements Co-Authored-By: Claude Opus 4.6 (1M context) --- spec/pdf/core/structure_tree_spec.rb | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/spec/pdf/core/structure_tree_spec.rb b/spec/pdf/core/structure_tree_spec.rb index d5a9d7b..9b1d47e 100644 --- a/spec/pdf/core/structure_tree_spec.rb +++ 
b/spec/pdf/core/structure_tree_spec.rb @@ -81,6 +81,26 @@ expect(elem.data[:A]).to eq({ O: :Table, Scope: :Column }) end + + it 'supports ActualText attribute' do + elem = structure_tree.add_element(:Span, ActualText: 'required') + + expect(elem.data[:ActualText]).to eq('required') + end + + it 'includes ActualText in rendered PDF output' do + structure_tree.begin_element(:P) + span = structure_tree.begin_element(:Span, ActualText: 'selected') + structure_tree.mark_content(:Span) do + renderer.add_content('BT /F1 12 Tf (X) Tj ET') + end + structure_tree.end_element # Span + structure_tree.end_element # P + + output = renderer.render + + expect(output).to include('/ActualText') + end end describe '#begin_element / #end_element' do From d6e686e261f981b30b685de7a8aa516a7e75001c Mon Sep 17 00:00:00 2001 From: Craig McNamara Date: Thu, 26 Mar 2026 09:59:44 -0700 Subject: [PATCH 4/4] Fix RuboCop style violations in accessibility specs Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/pdf/core/structure_tree.rb | 2 +- spec/pdf/core/structure_tree_spec.rb | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/pdf/core/structure_tree.rb b/lib/pdf/core/structure_tree.rb index de97e9e..6288277 100644 --- a/lib/pdf/core/structure_tree.rb +++ b/lib/pdf/core/structure_tree.rb @@ -163,7 +163,7 @@ def mark_content(tag, struct_elem_ref: nil) def mark_artifact(artifact_type: nil) if artifact_type renderer.begin_marked_content_with_properties( - :Artifact, { Type: artifact_type } + :Artifact, { Type: artifact_type }, ) else renderer.begin_marked_content(:Artifact) diff --git a/spec/pdf/core/structure_tree_spec.rb b/spec/pdf/core/structure_tree_spec.rb index 9b1d47e..a4e0594 100644 --- a/spec/pdf/core/structure_tree_spec.rb +++ b/spec/pdf/core/structure_tree_spec.rb @@ -13,7 +13,7 @@ describe 'initialization' do it 'creates a structure tree when marked: true' do - expect(structure_tree).to be_a(PDF::Core::StructureTree) + expect(structure_tree).to 
be_a(described_class) end it 'does not create a structure tree when marked is not set' do @@ -90,7 +90,7 @@ it 'includes ActualText in rendered PDF output' do structure_tree.begin_element(:P) - span = structure_tree.begin_element(:Span, ActualText: 'selected') + structure_tree.begin_element(:Span, ActualText: 'selected') structure_tree.mark_content(:Span) do renderer.add_content('BT /F1 12 Tf (X) Tj ET') end @@ -142,7 +142,7 @@ structure_tree.end_element mcr = elem.data[:K].find { |k| k.is_a?(Hash) && k[:Type] == :MCR } - expect(mcr).not_to be_nil + expect(mcr).to_not(be_nil) expect(mcr[:MCID]).to eq(0) end end @@ -185,7 +185,7 @@ structure_tree.end_element # Render triggers before_render callback which calls finalize! - output = renderer.render + renderer.render root_data = renderer.state.store.root.data expect(root_data[:StructTreeRoot]).to be_a(PDF::Core::Reference) @@ -214,7 +214,7 @@ struct_root = renderer.state.store.root.data[:StructTreeRoot] parent_tree = struct_root.data[:ParentTree] expect(parent_tree).to be_a(PDF::Core::Reference) - expect(parent_tree.data[:Nums]).not_to be_empty + expect(parent_tree.data[:Nums]).to_not(be_empty) end it 'assigns StructParents to pages with marked content' do