diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..40f9a2f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+invocation_reasons.yaml
+
diff --git a/README.md b/README.md
index 59887d6..e7aa74c 100644
--- a/README.md
+++ b/README.md
@@ -34,13 +34,35 @@ docker buildx build --platform linux/arm64,linux/amd64 -f mcp-local/Dockerfile -
 For a single-platform build (faster):
 
 ```bash
-docker buildx build -f mcp-local/Dockerfile -t arm-mcp mcp-local
+# NOTE - building locally involves generating the Knowledge Base semantic embeddings and can take up to 20 minutes
+docker buildx build -f mcp-local/Dockerfile -t arm-mcp mcp-local --load
 ```
 
 ### 2. Configure Your MCP Client
 
 Choose the configuration that matches your MCP client:
 
+#### Claude Code
+
+Add to `.mcp.json` in your project:
+
+```json
+{
+  "mcpServers": {
+    "arm-mcp": {
+      "command": "docker",
+      "args": [
+        "run",
+        "--rm",
+        "-i",
+        "-v", "/path/to/your/workspace:/workspace",
+        "arm-mcp"
+      ]
+    }
+  }
+}
+```
+
 #### GitHub Copilot (VS Code)
 
 Add to `.vscode/mcp.json` in your project, or globally at `~/Library/Application Support/Code/User/mcp.json` (macOS):
diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
index 7f777fe..6bd2845 100644
--- a/embedding-generation/generate-chunks.py
+++ b/embedding-generation/generate-chunks.py
@@ -12,12 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-from bs4 import BeautifulSoup
 import argparse
-import sys, os
+import sys
+import os
 import re
 import uuid
 import yaml
@@ -25,6 +22,14 @@
 import datetime
 import json
 
+import boto3
+from botocore.exceptions import NoCredentialsError, ClientError
+from bs4 import BeautifulSoup
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+
 # Create a session with retry logic for resilient HTTP requests
 def create_retry_session(retries=5, backoff_factor=1, status_forcelist=(500, 502, 503, 504)):
     """Create a requests session with automatic retry on failures."""
@@ -45,10 +50,6 @@ def create_retry_session(retries=5, backoff_factor=1, status_forcelist=(500, 502
 # Global session for all HTTP requests
 http_session = create_retry_session()
 
-# Boto3 for S3 operations
-import boto3
-from botocore.exceptions import NoCredentialsError, ClientError
-
 
 def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
                                     s3_bucket='arm-github-copilot-extension',
@@ -57,7 +58,6 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
     Ensure the local 'intrinsic_chunks' folder exists and is populated with files from S3.
     If the folder does not exist, create it and download all files from the S3 prefix.
     """
-    import os
     if not os.path.exists(local_folder):
         os.makedirs(local_folder, exist_ok=True)
         print(f"Created local folder: {local_folder}")
@@ -86,10 +86,8 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
 To fix:
 1. Prevent multiple learning paths from being used (compare URLs to existing chunks OR delete overlaps)
 2. Learning Path titles must come from index page...send through function along with Graviton.
-
 '''
 
-
 yaml_dir = 'yaml_data'
 details_file = 'info/chunk_details.csv'
 
@@ -98,7 +96,6 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
 
 # Global var to prevent duplication entries from cross platform learning paths
 cross_platform_lps_dont_duplicate = []
-
 # Increase the file size limit, which defaults to '131,072'
 csv.field_size_limit(10**9) #1,000,000,000 (1 billion), smaller than 64-bit space but avoids 'python overflowerror'
 
@@ -196,7 +193,6 @@ def createTextSnippet(main_row):
     return
 
 
-
 def createIntrinsicsDatabaseChunks():
     def htmlToMarkdown(html_string):
         # Step 0: Remove '...Operation...' as it isn't needed
@@ -315,7 +311,6 @@ def htmlToMarkdown(html_string):
     '''
 
 
-
 def processLearningPath(url,type):
     github_raw_link = "https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/production/content"
     site_link = "https://learn.arm.com"
@@ -462,6 +457,7 @@ def readInCSV(csv_file):
 
     return csv_dict, csv_length
 
+
 def getMarkdownGitHubURLsFromPage(url):
     GH_urls = []
     SITE_urls = []
@@ -523,6 +519,7 @@ def obtainMarkdownContentFromGitHubMDFile(gh_url):
 
     return md_content
 
+
 def obtainTextSnippets__Markdown(content, min_words=300, max_words=500, min_final_words=200):
     """Split content into chunks based on headers and word count constraints."""
 
@@ -620,6 +617,7 @@ def createChunk(text_snippet,WEBSITE_url,keywords,title):
 
     return chunk
 
+
 def printChunks(chunks):
     for chunk_dict in chunks:
         print('='*100)
@@ -690,7 +688,6 @@ def recordChunk():
     print(f"{file_name} === {chunk.title}")
 
 
-
 def main():
diff --git a/embedding-generation/local_vectorstore_creation.py b/embedding-generation/local_vectorstore_creation.py
index 45e6ec5..08f5899 100644
--- a/embedding-generation/local_vectorstore_creation.py
+++ b/embedding-generation/local_vectorstore_creation.py
@@ -24,6 +24,7 @@
 from sentence_transformers import SentenceTransformer
 from usearch.index import Index
 
+
 def load_local_yaml_files() -> List[Dict]:
     """Load locally stored YAML files and return their contents as a list of dictionaries."""
     print("Loading local YAML files")
@@ -63,6 +64,7 @@ def load_local_yaml_files() -> List[Dict]:
     print(f"Successfully loaded {len(yaml_contents)} YAML files")
     return yaml_contents
 
+
 def create_embeddings(contents: List[str], model_name: str = 'all-MiniLM-L6-v2') -> np.ndarray:
     """Create embeddings for the given contents using SentenceTransformers."""
     print(f"Creating embeddings using model: {model_name}")
@@ -71,6 +73,7 @@ def create_embeddings(contents: List[str], model_name: str = 'all-MiniLM-L6-v2')
     print(f"Created embeddings with shape: {embeddings.shape}")
     return embeddings
 
+
 def create_usearch_index(embeddings: np.ndarray, metadata: List[Dict]) -> Tuple[Index, List[Dict]]:
     """Create a USearch index with the given embeddings and metadata."""
     print("Creating USearch index")
@@ -100,6 +103,7 @@ def create_usearch_index(embeddings: np.ndarray, metadata: List[Dict]) -> Tuple[
     print(f"Added {len(index)} vectors to the index")
     return index, metadata
 
+
 def main():
     print("Starting the USearch datastore creation process")