vectorless-engine/pkg/parser/parser.go at main · hallelx2/vectorless-engine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// Package parser converts raw document bytes into a ParsedDoc — a
// hierarchical outline of sections that the ingest pipeline then turns
// into a tree.Tree.
//
// Each Parser is responsible for one input format (Markdown, HTML, PDF,
// DOCX, TXT, …). A Registry routes incoming bytes to the right parser
// based on content-type or filename extension.
//
// Parsers MUST NOT do any LLM work — they extract structure only.
// Summaries and embeddings are downstream concerns.
package parser

import (
	"context"
	"errors"
	"fmt"
	"io"
	"path/filepath"
	"strings"
)

// ErrUnsupported is returned when no parser accepts the given input.
// Registry.Parse wraps it with %w and appends the filename and content
// type, so callers should test for it with errors.Is rather than ==.
var ErrUnsupported = errors.New("parser: no parser for content type")

// ParsedDoc is the language-agnostic output of a parser: an optional
// title, the section outline, and any document-level metadata.
type ParsedDoc struct {
	// Title is the document's top-level title, if known.
	Title string

	// Sections is the hierarchical outline of the document. Nesting is
	// expressed via Section.Children; Flatten yields the same sections
	// as a depth-first, pre-order list.
	Sections []Section

	// Metadata holds whatever extra structural hints the parser recovered
	// (author, created date, page count, etc.).
	Metadata map[string]string
}

// Section is one node in the parsed outline. A Section owns its own
// text (Content) and any nested sub-sections (Children).
type Section struct {
	// Level is 1 for a top-level heading, 2 for a sub-heading, etc.
	// Parsers that can't recover a level may use 1 for every section.
	Level int

	// Title is the human-readable heading.
	Title string

	// Content is the full text of this section — not including children's
	// content. Empty for purely structural nodes.
	Content string

	// Children are nested sub-sections, in the order they are stored by
	// the parser.
	Children []Section

	// Metadata is an optional map of structural hints for this section
	// (page range, heading anchor, etc.).
	Metadata map[string]string
}

// Parser is the format-specific contract. One implementation handles one
// input format; a Registry chooses between implementations.
type Parser interface {
	// Name is a short identifier ("markdown", "pdf", …).
	Name() string

	// Accepts returns true if this parser can handle the given
	// content-type (MIME) or filename.
	//
	// When called through Registry.For, contentType has already been
	// lowercased, trimmed, and stripped of any ";"-separated parameters,
	// and filename has already been lowercased.
	Accepts(contentType, filename string) bool

	// Parse reads r until EOF and returns the parsed outline.
	Parse(ctx context.Context, r io.Reader) (*ParsedDoc, error)
}

// Registry picks the right parser for a given input. Parsers are
// consulted in registration order and the first one whose Accepts
// returns true wins.
type Registry struct {
	// parsers is the ordered match list walked by For.
	parsers []Parser
}

// NewRegistry returns a Registry preloaded with ps, consulted in the
// order given.
//
// The parsers are copied into a slice owned by the Registry. Without the
// copy, a caller that spreads an existing slice (NewRegistry(list...))
// would share its backing array with the Registry: a later Register
// could write into spare capacity the caller still uses, and caller-side
// edits to list would silently change the match order.
func NewRegistry(ps ...Parser) *Registry {
	// Appending to a nil slice copies the elements and still yields a
	// nil slice when no parsers were supplied.
	return &Registry{parsers: append([]Parser(nil), ps...)}
}

// Register appends p to the match list. Because For returns the first
// parser that accepts an input, p is consulted only after every parser
// that was registered before it.
func (r *Registry) Register(p Parser) {
	grown := append(r.parsers, p)
	r.parsers = grown
}

// For looks up the parser responsible for an input. The content type is
// normalized and the filename lowercased before each registered parser
// is asked, in registration order, whether it accepts the pair. The
// first parser that says yes is returned; nil means nothing matched.
func (r *Registry) For(contentType, filename string) Parser {
	mediaType := normalizeContentType(contentType)
	lowerName := strings.ToLower(filename)

	// Snapshot the slice header so the walk covers exactly the parsers
	// present when the lookup started.
	candidates := r.parsers
	for i := 0; i < len(candidates); i++ {
		candidate := candidates[i]
		if !candidate.Accepts(mediaType, lowerName) {
			continue
		}
		return candidate
	}
	return nil
}

// Parse selects a parser with For and hands body to it, returning
// whatever that parser produces. When no parser accepts the input the
// result is an error wrapping ErrUnsupported that names the filename
// and content type.
func (r *Registry) Parse(ctx context.Context, contentType, filename string, body io.Reader) (*ParsedDoc, error) {
	if chosen := r.For(contentType, filename); chosen != nil {
		return chosen.Parse(ctx, body)
	}
	return nil, fmt.Errorf("%w: %q (%s)", ErrUnsupported, filename, contentType)
}

// normalizeContentType reduces a Content-Type value to its bare media
// type: surrounding whitespace is trimmed, the value is lowercased, and
// everything from the first ';' onward (parameters such as charset) is
// dropped, along with any whitespace left before it.
func normalizeContentType(ct string) string {
	mediaType := strings.TrimSpace(ct)
	mediaType = strings.ToLower(mediaType)

	semi := strings.IndexByte(mediaType, ';')
	if semi < 0 {
		// No parameters present.
		return mediaType
	}
	return strings.TrimSpace(mediaType[:semi])
}

// HasExt reports whether filename's extension matches one of exts. The
// comparison ignores case, and each entry in exts may be written with or
// without its leading dot ("md" and ".md" are equivalent). With no exts
// the result is always false.
func HasExt(filename string, exts ...string) bool {
	actual := strings.ToLower(filepath.Ext(filename))
	for _, ext := range exts {
		candidate := ext
		if !strings.HasPrefix(ext, ".") {
			// filepath.Ext includes the dot, so add it for comparison.
			candidate = "." + ext
		}
		if strings.ToLower(candidate) == actual {
			return true
		}
	}
	return false
}

// Flatten lists every section of d in depth-first, pre-order: each
// section appears before its children, and children appear in their
// stored order. Each returned Section is a copy of the outline node and
// still carries its own Children. The result is nil when d has no
// sections.
func (d *ParsedDoc) Flatten() []Section {
	var flat []Section

	var visit func(node Section)
	visit = func(node Section) {
		// Emit the node first, then descend.
		flat = append(flat, node)
		for _, child := range node.Children {
			visit(child)
		}
	}

	for _, top := range d.Sections {
		visit(top)
	}
	return flat
}