Skip to content

Commit 71f7cf7

Browse files
committed
feat(document-search): init document-search module with basic RAG capabilities on text
delete file
1 parent 25d8249 commit 71f7cf7

File tree

27 files changed

+670
-0
lines changed

27 files changed

+670
-0
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import asyncio
2+
3+
from ragnarok_document_search import DocumentSearch
4+
from ragnarok_document_search.documents.document import DocumentMeta
5+
from ragnarok_document_search.vector_store.simple import SimpleVectorStore
6+
7+
from ragnarok_common.embeddings.litellm import LiteLLMEmbeddings
8+
9+
# Sample documents for the example: each literal string becomes one text document.
documents = [
    DocumentMeta.create_text_document_from_literal(joke)
    for joke in (
        "RIP boiled water. You will be mist.",
        "Why doesn't James Bond fart in bed? Because it would blow his cover.",
        "Why programmers don't like to swim? Because they're scared of the floating points.",
    )
]
18+
19+
20+
async def main():
    """Ingest the sample documents, run a single search and print its results."""
    embedder = LiteLLMEmbeddings()
    store = SimpleVectorStore()
    search_engine = DocumentSearch(embedder=embedder, vector_store=store)

    # Documents are ingested one at a time, in list order.
    for doc in documents:
        await search_engine.ingest_document(doc)

    found = await search_engine.search("I'm boiling my water and I need a joke")
    print(found)


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[build-system]
2+
requires = ["setuptools >= 40.9.0", "wheel"]
3+
build-backend = "setuptools.build_meta"
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
[metadata]
2+
name = ragnarok-document-search
3+
# do not change version by hand: use bump_version.sh
4+
version = 0.0.1
5+
description = The ragstack module responsible for fetching data from unstructured data sources.
6+
author = deepsense.ai
7+
author_email = contact@deepsense.ai
8+
license = Other/Proprietary License
9+
license_files = LICENSE.md
10+
classifiers =
11+
Development Status :: 1 - Planning
12+
Environment :: Console
13+
Intended Audience :: Science/Research
14+
License :: Other/Proprietary License
15+
Natural Language :: English
16+
Operating System :: OS Independent
17+
Programming Language :: Python :: 3.10
18+
Programming Language :: Python :: 3.11
19+
Programming Language :: Python :: 3.12
20+
Topic :: Scientific/Engineering :: Artificial Intelligence
21+
Private :: Do Not Upload
22+
23+
[options]
24+
package_dir=
25+
=src
26+
packages=find:
27+
zip_safe = False
28+
platforms = any
29+
include_package_data = True
30+
python_requires = >=3.10
31+
install_requires =
32+
python-dotenv>=0.5.1
33+
litellm>=1.37.9
34+
loguru>=0.7.2
35+
numpy>=1.24.0
36+
pydantic>=2.8.2
37+
38+
[options.packages.find]
39+
where=src
40+
41+
[bdist_wheel]
42+
universal = 1
43+
44+
[aliases]
45+
# Alias `setup.py test` to `setup.py pytest`
46+
test = pytest

packages/ragnarok-document-search/src/py.typed

Whitespace-only changes.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from ._main import DocumentSearch
2+
3+
__all__ = ["DocumentSearch"]
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""Version information."""
2+
3+
__version__ = "0.0.1"
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from ragnarok_document_search.documents.document import DocumentMeta
2+
from ragnarok_document_search.documents.element import Element
3+
from ragnarok_document_search.ingestion.document_processor import DocumentProcessor
4+
from ragnarok_document_search.retrieval.rephrasers.base import QueryRephraser
5+
from ragnarok_document_search.retrieval.rephrasers.noop import NoopQueryRephraser
6+
from ragnarok_document_search.retrieval.rerankers.base import Reranker
7+
from ragnarok_document_search.retrieval.rerankers.noop import NoopReranker
8+
from ragnarok_document_search.vector_store.base import VectorStore
9+
10+
from ragnarok_common.embeddings.base import Embeddings
11+
12+
13+
class DocumentSearch:
    """
    A main entrypoint to the DocumentSearch functionality.

    It provides methods for both ingestion and retrieval.

    Ingestion:

        1. Uses DocumentProcessor to split the document into elements.
        2. Uses Embeddings to embed the key of every element.
        3. Stores the resulting entries in the VectorStore.

    Retrieval:

        1. Uses QueryRephraser to rephrase the query.
        2. Uses VectorStore to retrieve the most relevant chunks.
        3. Uses Reranker to rerank the chunks.
    """

    embedder: Embeddings

    vector_store: VectorStore

    query_rephraser: QueryRephraser
    reranker: Reranker

    def __init__(
        self,
        embedder: Embeddings,
        vector_store: VectorStore,
        query_rephraser: QueryRephraser | None = None,
        reranker: Reranker | None = None,
    ) -> None:
        """
        Create the search entrypoint.

        Args:
            embedder: Used to embed both the ingested elements and the search queries.
            vector_store: Used to store and retrieve the embedded entries.
            query_rephraser: Used to rephrase the query; a NoopQueryRephraser is used when omitted.
            reranker: Used to rerank retrieved chunks; a NoopReranker is used when omitted.
        """
        self.embedder = embedder
        self.vector_store = vector_store
        self.query_rephraser = query_rephraser or NoopQueryRephraser()
        self.reranker = reranker or NoopReranker()

    async def search(self, query: str) -> list[Element]:
        """
        Search for the most relevant chunks for a query.

        Args:
            query: The query to search for.

        Returns:
            A list of chunks.
        """
        queries = self.query_rephraser.rephrase(query)
        chunks: list[Element] = []
        for rephrased_query in queries:
            # The embedder works on batches, so the single query is wrapped in a list
            # and the first (only) vector is used for retrieval.
            search_vector = await self.embedder.embed_text([rephrased_query])
            # TODO: search parameters should be configurable
            entries = await self.vector_store.retrieve(search_vector[0], k=1)
            chunks.extend(Element.from_vector_db_entry(entry) for entry in entries)

        return self.reranker.rerank(chunks)

    async def ingest_document(self, document: DocumentMeta) -> None:
        """
        Ingest a document.

        Args:
            document: The document to ingest.

        Raises:
            ValueError: If the embedder returns a different number of vectors than there are elements.
        """
        # TODO: This is a placeholder implementation. It should be replaced with a real implementation.

        document_processor = DocumentProcessor()
        elements = await document_processor.process(document)
        if not elements:
            # Nothing to embed or store, so skip the embedder and vector store calls entirely.
            return

        vectors = await self.embedder.embed_text([element.get_key() for element in elements])
        # strict=True turns a length mismatch into an error instead of silently dropping elements.
        entries = [
            element.to_vector_db_entry(vector)
            for element, vector in zip(elements, vectors, strict=True)
        ]
        await self.vector_store.store(entries)

packages/ragnarok-document-search/src/ragnarok_document_search/documents/__init__.py

Whitespace-only changes.
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import tempfile
2+
from enum import Enum
3+
from pathlib import Path
4+
from typing import Union
5+
6+
from pydantic import BaseModel, Field
7+
from ragnarok_document_search.documents.sources import LocalFileSource
8+
9+
10+
class DocumentType(str, Enum):
    """
    Supported document types.

    Members are also `str` instances, so each one compares equal to its raw value.
    """

    # Markdown documents.
    MD = "md"
    # Plain-text documents.
    TXT = "txt"
15+
16+
17+
class DocumentMeta(BaseModel):
    """
    Metadata of a document: its type and the source it can be fetched from.
    """

    document_type: DocumentType
    # The concrete source model is selected by its "source_type" field.
    source: Union[LocalFileSource] = Field(..., discriminator="source_type")

    @property
    def id(self) -> str:
        """
        Identifier of the document, as reported by its source.

        Returns:
            The document ID.
        """
        document_id = self.source.get_id()
        return document_id

    async def fetch(self) -> "Document":
        """
        Fetch the document from its source (potentially remote) and wrap it in a document object.

        The concrete class of the returned object depends on the document type.

        Returns:
            The document.
        """
        fetched_path = await self.source.fetch()
        document = Document.from_document_meta(self, fetched_path)
        return document

    @classmethod
    def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
        """
        Build the metadata of a text document out of a literal string.

        The content is written to a temporary file which is kept on disk (delete=False),
        and the returned metadata points to that file.

        Args:
            content: The content of the document.

        Returns:
            The document metadata.
        """
        with tempfile.NamedTemporaryFile(delete=False) as literal_file:
            literal_file.write(content.encode())

        literal_source = LocalFileSource(path=Path(literal_file.name))
        return cls(document_type=DocumentType.TXT, source=literal_source)
64+
65+
66+
class Document(DocumentMeta):
    """
    An object representing a document which is downloaded and stored locally.
    """

    local_path: Path

    @classmethod
    def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
        """
        Create a document from a document metadata.
        Based on the document type, it will return a different object.

        Args:
            document_meta: The document metadata.
            local_path: The local path to the document.

        Returns:
            The document. A TextDocument for the MD and TXT types, otherwise an instance of `cls`.
        """
        # The explicit local_path is merged last so that it wins over a "local_path" key that the
        # dump contains when document_meta is itself a Document.
        new_obj = {**document_meta.model_dump(), "local_path": local_path}

        if document_meta.document_type in (DocumentType.MD, DocumentType.TXT):
            return TextDocument.model_validate(new_obj)
        return cls.model_validate(new_obj)
91+
92+
93+
class TextDocument(Document):
    """
    An object representing a text document.
    """

    @property
    def content(self) -> str:
        """
        Get the content of the document.

        The file at `local_path` is decoded as UTF-8.

        Returns:
            The content of the document.
        """
        # Decode explicitly as UTF-8 instead of the locale-dependent default: documents created
        # from literals are written with `str.encode()`, which produces UTF-8.
        return self.local_path.read_text(encoding="utf-8")
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from abc import ABC, abstractmethod
2+
from typing import ClassVar
3+
4+
from pydantic import BaseModel
5+
from ragnarok_document_search.documents.document import DocumentMeta
6+
from ragnarok_document_search.vector_store.base import VectorDBEntry
7+
8+
9+
class Element(BaseModel, ABC):
    """
    An object representing an element in a document.
    """

    element_type: str
    document: DocumentMeta

    # Maps the default `element_type` of every subclass to that subclass.
    # It is filled in by `__pydantic_init_subclass__` and read by `from_vector_db_entry`.
    _elements_registry: ClassVar[dict[str, type["Element"]]] = {}

    @abstractmethod
    def get_key(self) -> str:
        """
        Get the key of the element which will be used to generate the vector.

        Returns:
            The key.
        """

    @classmethod
    def __pydantic_init_subclass__(cls, **kwargs):  # pylint: disable=unused-argument
        """
        Register the subclass in the elements registry under its default element type.

        Raises:
            ValueError: If the subclass does not declare a string default for `element_type`.
        """
        element_type_default = cls.model_fields["element_type"].default

        # A field declared without a default reports pydantic's "undefined" sentinel rather than
        # None, so check for an actual string instead of comparing against None.
        if not isinstance(element_type_default, str):
            raise ValueError("Element type must be defined")

        Element._elements_registry[element_type_default] = cls

    @classmethod
    def from_vector_db_entry(cls, db_entry: VectorDBEntry) -> "Element":
        """
        Create an element from a vector database entry.

        The element class is looked up in the registry by the "element_type" key of the entry
        metadata and is instantiated with the whole metadata as keyword arguments.

        Args:
            db_entry: The vector database entry.

        Returns:
            The element.

        Raises:
            KeyError: If the metadata has no "element_type" key or the type is not registered.
        """
        meta = db_entry.metadata
        element_type = meta["element_type"]
        element_cls = Element._elements_registry[element_type]

        return element_cls(**meta)

    def to_vector_db_entry(self, vector: list[float]) -> VectorDBEntry:
        """
        Create a vector database entry from the element.

        The full model dump is stored as the entry metadata, which is what
        `from_vector_db_entry` uses to rebuild the element.

        Args:
            vector: The vector.

        Returns:
            The vector database entry
        """
        return VectorDBEntry(
            key=self.get_key(),
            vector=vector,
            metadata=self.model_dump(),
        )
69+
70+
71+
class TextElement(Element):
    """
    A document element that holds a piece of text.
    """

    # Default element type of this class.
    element_type: str = "text"
    content: str

    def get_key(self) -> str:
        """
        Return the text content, which serves as the key used to generate the vector.

        Returns:
            The key.
        """
        key = self.content
        return key

0 commit comments

Comments
 (0)