-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsample.py
More file actions
91 lines (63 loc) · 2.5 KB
/
sample.py
File metadata and controls
91 lines (63 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import json
from typing import Any, Dict, List, Optional, Generator
from llama_index.core.schema import Document
from llama_index.core import VectorStoreIndex, Settings
from llama_index.llms.replicate import Replicate
from transformers import AutoTokenizer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
import os
# Fail fast with a readable message when the Replicate credential is missing.
# Copying os.getenv(...) back into os.environ does nothing when the variable
# is set and raises "TypeError: str expected, not NoneType" when it is not.
if "REPLICATE_API_TOKEN" not in os.environ:
    raise RuntimeError(
        "REPLICATE_API_TOKEN is not set; export it before running this script."
    )

# Process-wide llama_index defaults: embedding model, LLM, tokenizer, chunking.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="Snowflake/snowflake-arctic-embed-l",
    trust_remote_code=True,
)
Settings.llm = Replicate(model="snowflake/snowflake-arctic-instruct")
# Alternative backend:
# Settings.llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# NOTE(review): the tokenizer comes from "huggyllama/llama-7b", not from the
# Arctic model configured above -- presumably an approximation for token
# counting; confirm this is intended.
Settings.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
Settings.context_window = 3072

# Default splitter, followed by the global chunk size / overlap values.
Settings.text_splitter = SentenceSplitter(chunk_size=450)
Settings.chunk_size = 450
Settings.chunk_overlap = 120
def load_json_file(file_path: str) -> Any:
    """
    Load a UTF-8 encoded JSON file and return its decoded content.

    The return value mirrors the top-level JSON value: a ``dict`` for a JSON
    object, a ``list`` for a JSON array, and so on. It is therefore not
    guaranteed to be a dictionary.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        Any: The decoded top-level JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
# resp = Settings.llm.complete("Paul Graham is ")

# Example usage: embed the first few articles and run a single query.
# NOTE: these statements run at import time; the
# `if __name__ == "__main__":` guard is left disabled, as before.
file_path = "res/directory/articles.json"
data = load_json_file(file_path)
# Keep the experiment small: only the first 10 entries are indexed.
data = data[:10]

# One Document per article, with the source URL attached as metadata.
# Each entry is expected to provide the "content" and "url" keys.
data_documents: List[Document] = [
    Document(text=datum['content'], metadata={"url": datum['url']})
    for datum in data
]

# Split the documents into nodes once and build a single index from them,
# so each chunk is embedded only once.
node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
alt_alt_nodes = node_parser.get_nodes_from_documents(data_documents)

vector_index = VectorStoreIndex(alt_alt_nodes)
query_engine = vector_index.as_query_engine()
sample = query_engine.query("FDA")