Commit 6d06c32

feat: Improve experience
1 parent 16b554f commit 6d06c32

20 files changed: +365 −729 lines
Lines changed: 11 additions & 75 deletions
```diff
@@ -1,80 +1,16 @@
-# AWS Configuration
-# AWS region for cloud services
-AWS_DEFAULT_REGION=eu-central-1
-# AWS access key for authentication
-AWS_ACCESS_KEY=""
-# AWS secret key for authentication
-AWS_SECRET_KEY=""
-# ARN for AWS cross-account access role
-AWS_CROSS_ACCOUNT_ROLE_ARN=""
-# Name of the S3 bucket for storing application data
-AWS_S3_BUCKET_NAME="decodingml-public-data"
-AWS_S3_PREFIX="second_brain_course/notion"
-AWS_S3_NOSIGN_REQUEST=True
+# --- Mandatory settings ---
 
-# CometML Configuration
-# API key for CometML integration
-COMET_API_KEY=""
-# CometML project name for tracking experiments
-COMET_PROJECT_NAME=""
+# OpenAI Config (starting with Lesson 2)
+OPENAI_API_KEY=
 
-# Data Fetching Limits
-# Maximum number of documents to fetch from the database
-# MAX_FETCH_LIMIT=50
+# Hugging Face Config (starting with Lesson 3)
+HUGGINGFACE_ACCESS_TOKEN=
 
-# Default Genre for Querying
-# DEFAULT_GENRE="Western"
+# Comet ML and Opik Config (starting with Lesson 4)
+COMET_API_KEY=
 
-# Docker and Network Configuration
-# Flag to indicate if the application is running inside a Docker container
-# IS_RUNNING_IN_DOCKER=True
-# Docker network for inter-container communication
-# DOCKER_NETWORK_NAME="zenml_network"
+# --- Optional settings ---
 
-# Enable Configurations
-# Flag to enable ingestion from MongoDB Atlas
-# ENABLE_MONGODB_ATLAS_INGESTION=False
-# Flag to enable offline mode (disables online ingestion)
-ENABLE_OFFLINE_MODE=True
-# Enable or disable structured logging
-# ENABLE_STRUCTURED_LOGGING=false
-
-# GROQ Configuration
-# API key for accessing GROQ services
-# GROQ_API_KEY=""
-
-# Hugging Face Configuration
-# Token for Hugging Face API
-# HUGGINGFACE_ACCESS_TOKEN=""
-HF_API_KEY=
-
-# Local Data Files
-# Path to the local JSON file for offline processing
-# LOCAL_JSON_FILE_PATH="data/sample_data_set.json"
-
-# MongoDB Configuration
-# Name of the database
-MONGODB_DATABASE_NAME=second_brain
-# Connection URI for local MongoDB instance
-MONGODB_OFFLINE_URI=mongodb://decodingml:decodingml@localhost:27017/?directConnection=true
-# MongoDB Atlas URI for cloud connection
-MONGODB_ONLINE_URI="" # Leave empty or add your cloud MongoDB URI when needed
-MONGODB_CTOR_SEARCH_INDEX_NAME=vector_index
-MONGODB_COLLECTION_NAME_RAG=rag_data
-
-# Optional MongoDB Settings
-# Name of the offline database
-# MONGODB_OFFLINE_DATABASE="rag_pipeline"
-# Name of the collection in the offline database
-# MONGODB_OFFLINE_COLLECTION="offline_documents"
-
-# Notion API Configuration
-# Secret key for accessing Notion API
-# Optional, set this key if you plan on using your own personal notion and S3 bucket.
-NOTION_SECRET_KEY=""
-
-# OpenAI API Configuration
-# API key for accessing OpenAI services
-OPENAI_API_KEY=""
-# Model identifier for OpenAI
-OPENAI_MODEL_ID="gpt-4o-mini"
+# Notion Config (starting with Lesson 1)
+# In case you want to collect data from your personal Notion database
+NOTION_SECRET_KEY=
```
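The trimmed-down environment template keeps only the keys a reader actually needs, introduced lesson by lesson. For context, here is a minimal sketch of how such a template is typically loaded with `pydantic-settings`, the same `BaseSettings`/`Field` pattern used by `config.py` later in this commit. The `env_file` wiring and the exact optionality of each field are assumptions; only the variable names come from the diff.

```python
# Minimal sketch of loading the simplified .env template with pydantic-settings.
# Assumption: the real Settings class in config.py is configured similarly; only
# the variable names below are confirmed by this diff, not the env_file wiring.
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class EnvSketch(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # --- Mandatory settings ---
    OPENAI_API_KEY: str = Field(description="OpenAI key, needed starting with Lesson 2.")
    HUGGINGFACE_ACCESS_TOKEN: str | None = Field(default=None, description="Needed starting with Lesson 3.")
    COMET_API_KEY: str | None = Field(default=None, description="Comet ML / Opik key, Lesson 4 onward.")

    # --- Optional settings ---
    NOTION_SECRET_KEY: str | None = Field(default=None, description="Only for collecting your own Notion data.")


if __name__ == "__main__":
    settings = EnvSketch()
    print("OpenAI key loaded:", bool(settings.OPENAI_API_KEY))
```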

apps/second-brain-offline/Makefile

Lines changed: 20 additions & 13 deletions
```diff
@@ -10,6 +10,7 @@ export PYTHONPATH = .
 # --- Default Values ---
 
 CHECK_DIRS := .
+AWS_S3_BUCKET_NAME := decodingml-public-data
 NOTION_LOCAL_DATA_PATH := data/notion
 CRAWLED_LOCAL_DATA_PATH := data/crawled
 
@@ -43,27 +44,31 @@ local-infrastructure-stop: local-docker-infrastructure-stop local-zenml-server-s
 
 # --- AWS ---
 
-s3-upload-raw-dataset: # Upload raw Notion dataset from local folder to S3
+validate_aws_boto3:
+	@echo "Validating AWS Boto3 credentials..."
+	uv run python -m tools.validate_aws_boto3
+
+s3-upload-notion-dataset: # Upload raw Notion dataset from local folder to S3
 	@echo "Uploading raw Notion dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion"
 	uv run python -m tools.use_s3 upload $(NOTION_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/notion
 
-s3-download-raw-dataset: # Download raw Notion dataset from S3 to local folder
+s3-download-notion-dataset: # Download raw Notion dataset from S3 to local folder
 	@echo "Downloading raw Notion dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion/notion.zip"
-	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH)
+	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH) --no-sign-request
 
 s3-upload-crawled-dataset: # Upload processed crawled dataset from local folder to S3
 	@echo "Uploading crawled dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled"
 	uv run python -m tools.use_s3 upload $(CRAWLED_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/crawled
 
 s3-download-crawled-dataset: # Download processed crawled dataset from S3 to local folder
 	@echo "Downloading crawled dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled/crawled.zip"
-	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/crawled/crawled.zip $(CRAWLED_LOCAL_DATA_PATH)
+	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/crawled/crawled.zip $(CRAWLED_LOCAL_DATA_PATH) --no-sign-request
 
-download-raw-dataset: s3-download-raw-dataset
+download-notion-dataset: s3-download-notion-dataset
 
 download-crawled-dataset: s3-download-crawled-dataset
 
-# --- Pipelines ---
+# --- Offline ML Pipelines ---
 
 collect-notion-data-pipeline:
 	uv run python -m tools.run --run-collect-notion-data-pipeline --no-cache
@@ -82,18 +87,20 @@ compute-rag-vector-index-pipeline:
 
 # --- Tests ---
 
-test-download-raw-dataset:
-	uv run pytest tests/test_download_raw_dataset.py -v
-
-test-etl-pipeline:
-	uv run pytest tests/test_etl_pipeline.py -v
+test-s3-download:
+	uv run pytest tests/test_s3.py -v
 
-test-rag-vector-index-pipeline:
-	uv run pytest tests/test_rag_vector_index_pipeline.py -v
+test-download-notion-dataset:
+	uv run pytest tests/test_download_notion_dataset.py -v
 
+test-download-crawled-dataset:
+	uv run pytest tests/test_download_crawled_dataset.py -v
 
 # --- QA ---
 
+test:
+	uv run pytest tests -v
+
 format-fix:
 	uv run ruff format $(CHECK_DIRS)
 	uv run ruff check --select I --fix
```
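The download targets now pass `--no-sign-request` to `tools.use_s3`, the bucket name becomes a Makefile default, and a `validate_aws_boto3` target checks credentials before any upload. The `tools/use_s3.py` module itself is not part of this commit, so the following is only a hedged sketch of how the flag could be forwarded to `S3Client(no_sign_request=...)`; the choice of `click` for the CLI is an assumption.

```python
# Hypothetical sketch of a tools/use_s3.py download command; the real module is
# not shown in this commit. Only the S3Client(no_sign_request=...) parameter is
# confirmed by the s3.py diff further down.
import click

from second_brain_offline.infrastructure.aws.s3 import S3Client


@click.group()
def cli() -> None:
    """Upload/download helpers for the course S3 bucket."""


@cli.command()
@click.argument("bucket_name")
@click.argument("s3_prefix")
@click.argument("local_path")
@click.option("--no-sign-request", is_flag=True, default=False,
              help="Access the public bucket without AWS credentials.")
def download(bucket_name: str, s3_prefix: str, local_path: str, no_sign_request: bool) -> None:
    """Download a zipped dataset from S3 and extract it locally."""
    client = S3Client(bucket_name=bucket_name, no_sign_request=no_sign_request)
    client.download_folder(s3_prefix=s3_prefix, local_path=local_path)


if __name__ == "__main__":
    cli()
```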

apps/second-brain-offline/README.md

Lines changed: 3 additions & 5 deletions
````diff
@@ -66,14 +66,13 @@ No code to run for this lesson. Read the lesson to understand the problem and ov
 
 Download our prepared Notion dataset from S3 (recommended):
 ```bash
-make download-raw-dataset
-# Validate using test: make test-download-raw-dataset
+make download-notion-dataset
+# Validate using test: make test-download-notion-dataset
 ```
 
 Or if you want to prepare your own Notion data (optional - if you want to use your own data):
 ```bash
 make collect-notion-data-pipeline
-# Validate using test: make test-download-raw-dataset
 ```
 
 ### Run the ETL pipeline
@@ -89,8 +88,8 @@ Running time: ~30 minutes
 If you want to avoid any costs or waiting times, you can use our pre-computed dataset to populate MongoDB. Also, as crawling can often fail, you can use this dataset to skip the crawling step:
 ```bash
 make download-crawled-dataset
+# Validate using test: make test-download-crawled-dataset
 make etl-precomputed-pipeline
-# Validate using test: make test-etl-pipeline
 ```
 
 ## Lesson 3: Generate Fine-tuning Dataset
@@ -118,7 +117,6 @@ This time we will use Notebooks, as they are popular when it comes to LLM fine-t
 
 ```bash
 make compute-rag-vector-index-pipeline
-# Validate using test: make test-rag-vector-index-pipeline
 ```
 
 ## Lesson 6: Agentic App
````
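The README now points each download target at a matching pytest check (`make test-download-notion-dataset`, `make test-download-crawled-dataset`). Those test files are not included in this excerpt; below is a hedged sketch of what such a check could look like, assuming it only verifies that the dataset landed under the Makefile's `data/notion` default path.

```python
# Hypothetical sketch of tests/test_download_notion_dataset.py; the real test is
# not shown in this excerpt. Assumption: it only asserts that the downloaded
# dataset exists and is non-empty under the Makefile's NOTION_LOCAL_DATA_PATH.
from pathlib import Path

NOTION_LOCAL_DATA_PATH = Path("data/notion")


def test_notion_dataset_downloaded() -> None:
    assert NOTION_LOCAL_DATA_PATH.exists(), (
        "Run `make download-notion-dataset` before running this test."
    )
    files = [p for p in NOTION_LOCAL_DATA_PATH.rglob("*") if p.is_file()]
    assert len(files) > 0, "The Notion dataset folder is empty."
```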

apps/second-brain-offline/configs/collect_notion_data.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,4 +9,4 @@ parameters:
   - f54dbddcaa4c43c7ae17935716761536
   - 31fcaab5a9404d41b922897d32b901b3
   data_dir: data/
-  to_s3: true
+  to_s3: false
```
Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 parameters:
   data_dir: data/
   load_collection_name: raw
-  to_s3: true
+  to_s3: false
   max_workers: 4
   quality_agent_model_id: gpt-4o-mini
   quality_agent_mock: false
```
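Both pipeline configs now default `to_s3: false`, so readers no longer need S3 write access; uploading to the public bucket becomes opt-in. The pipeline step that consumes this parameter is not shown in this commit; the sketch below only illustrates how such a flag could gate the upload, reusing `S3Client.upload_folder` from the diff further down. The `maybe_upload` helper and the ZenML step structure are assumptions.

```python
# Hypothetical sketch of honoring `to_s3` from the YAML configs; the real
# pipeline step is not shown in this commit. Only the YAML keys and
# S3Client.upload_folder are confirmed by the diffs in this page.
import yaml

from second_brain_offline.infrastructure.aws.s3 import S3Client


def maybe_upload(config_path: str, bucket_name: str = "decodingml-public-data") -> None:
    with open(config_path) as f:
        params = yaml.safe_load(f)["parameters"]

    if not params.get("to_s3", False):
        print("to_s3 is false; skipping S3 upload.")
        return

    # Requires AWS credentials with write access to the bucket.
    client = S3Client(bucket_name=bucket_name)
    client.upload_folder(local_path=params["data_dir"], s3_prefix="second_brain_course/notion")


maybe_upload("configs/collect_notion_data.yaml")
```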

apps/second-brain-offline/src/second_brain_offline/config.py

Lines changed: 1 addition & 23 deletions
```diff
@@ -27,10 +27,6 @@ class Settings(BaseSettings):
         default="decodingml-public-data",
         description="Name of the S3 bucket for storing application data.",
     )
-    AWS_S3_NOSIGN_REQUEST: bool = Field(
-        default=False,
-        description="Flag to enable unauthenticated S3 bucket access. If True, bypasses AWS authentication.",
-    )
 
     # --- Comet ML & Opik Configuration ---
     COMET_API_KEY: str | None = Field(
@@ -57,14 +53,10 @@ class Settings(BaseSettings):
         default="second_brain_course",
         description="Name of the MongoDB database.",
     )
-    MONGODB_OFFLINE_URI: str = Field(
+    MONGODB_URI: str = Field(
         default="mongodb://decodingml:decodingml@localhost:27017/?directConnection=true",
         description="Connection URI for the local MongoDB Atlas instance.",
     )
-    MONGODB_ONLINE_URI: str | None = Field(
-        default=None,
-        description="Connection URI for the Cloud MongoDB Atlas instance.",
-    )
 
     # --- Notion API Configuration ---
     NOTION_SECRET_KEY: str | None = Field(
@@ -81,20 +73,6 @@ class Settings(BaseSettings):
         description="API key for OpenAI service authentication.",
     )
 
-    @property
-    def MONGODB_URI(self) -> str:
-        """
-        Returns the appropriate MongoDB URI based on ENABLE_OFFLINE_MODE.
-        """
-        if self.IS_OFFLINE_MODE is True:
-            return self.MONGODB_OFFLINE_URI
-
-        assert self.MONGODB_ONLINE_URI is not None, (
-            "MONGODB_ONLINE_URI is not set, while ENABLE_OFFLINE_MODE is False."
-        )
-
-        return self.MONGODB_ONLINE_URI
-
     @field_validator("OPENAI_API_KEY")
     @classmethod
     def check_not_empty(cls, value: str, info) -> str:
```
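With the offline/online split removed, `MONGODB_URI` is now a plain field defaulting to the local Docker instance, and the property that switched between `MONGODB_OFFLINE_URI` and `MONGODB_ONLINE_URI` is gone. Callers can read the field directly; here is a minimal sketch using `pymongo`, assuming the module exports a `settings` instance (the consumer code itself is not part of this diff).

```python
# Minimal sketch of consuming the simplified settings; assumes `settings` is the
# Settings instance exported by second_brain_offline.config (the consumer code
# itself is not shown in this commit).
from pymongo import MongoClient

from second_brain_offline.config import settings

# One URI for everyone: local MongoDB by default, or whatever MONGODB_URI
# points at in .env (e.g. a MongoDB Atlas connection string).
client = MongoClient(settings.MONGODB_URI)
db = client[settings.MONGODB_DATABASE_NAME]
print(db.list_collection_names())
```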

apps/second-brain-offline/src/second_brain_offline/infrastructure/aws/s3.py

Lines changed: 36 additions & 17 deletions
```diff
@@ -16,21 +16,23 @@ class S3Client:
     def __init__(
         self,
         bucket_name: str,
+        no_sign_request: bool = False,
         region: str = settings.AWS_DEFAULT_REGION,
-        aws_s3_no_sign_request: bool = settings.AWS_S3_NOSIGN_REQUEST,
     ) -> None:
         """Initialize S3 client and bucket name.
 
         Args:
-            bucket_name: Name of the S3 bucket
-            region: AWS region (defaults to AWS_DEFAULT_REGION or AWS_REGION env var, or 'us-east-1')
-            aws_s3_no_sign_request: if True will access S3 un-authenticated for public buckets, if False will use the AWS credentials set by the user
+            bucket_name (str): Name of the S3 bucket
+            no_sign_request (bool, optional): If True will access S3 un-authenticated for public buckets.
+                If False will use the AWS credentials set by the user. Defaults to False.
+            region (str, optional): AWS region. Defaults to AWS_DEFAULT_REGION or AWS_REGION env var,
+                or 'us-east-1'.
         """
 
         self.region = region
         self.bucket_name = bucket_name
-        self.aws_s3_no_sign_request = aws_s3_no_sign_request
-        if self.aws_s3_no_sign_request:
+        self.no_sign_request = no_sign_request
+        if self.no_sign_request:
             # Use unsigned mode for public buckets
             self.s3_client = boto3.client(
                 "s3",
@@ -42,12 +44,15 @@ def __init__(
             self.s3_client = boto3.client("s3", region_name=self.region)
 
     def upload_folder(self, local_path: Union[str, Path], s3_prefix: str = "") -> None:
-        """
-        Upload a local folder as a zip file to S3.
+        """Upload a local folder as a zip file to S3.
 
         Args:
-            local_path: Path to the local folder
-            s3_prefix: Optional prefix (folder path) in S3 bucket
+            local_path (Union[str, Path]): Path to the local folder
+            s3_prefix (str, optional): Optional prefix (folder path) in S3 bucket. Defaults to "".
+
+        Raises:
+            FileNotFoundError: If the local path does not exist
+            NotADirectoryError: If the local path is not a directory
         """
         # Ensure bucket exists before proceeding
         self.__create_bucket_if_doesnt_exist()
@@ -84,9 +89,10 @@ def upload_folder(self, local_path: Union[str, Path], s3_prefix: str = "") -> No
             os.unlink(temp_zip.name)
 
     def __create_bucket_if_doesnt_exist(self) -> None:
-        """
-        Check if bucket exists and create it if it doesn't.
-        Raises permission-related exceptions if user lacks necessary permissions.
+        """Check if bucket exists and create it if it doesn't.
+
+        Raises:
+            Exception: If bucket creation fails or if user lacks necessary permissions
         """
         try:
             self.s3_client.head_bucket(Bucket=self.bucket_name)
@@ -108,12 +114,11 @@ def __create_bucket_if_doesnt_exist(self) -> None:
             raise
 
     def download_folder(self, s3_prefix: str, local_path: Union[str, Path]) -> None:
-        """
-        Download a zipped folder from S3 and extract it to local storage.
+        """Download a zipped folder from S3 and extract it to local storage.
 
         Args:
-            s3_prefix: Prefix (folder path) in S3 bucket
-            local_path: Local path where files should be extracted
+            s3_prefix (str): Prefix (folder path) in S3 bucket pointing to the zip file
+            local_path (Union[str, Path]): Local path where files should be extracted
         """
         local_path = Path(local_path)
 
@@ -135,3 +140,17 @@ def download_folder(self, s3_prefix: str, local_path: Union[str, Path]) -> None:
 
         # Clean up temporary zip file
         os.unlink(temp_zip.name)
+
+    def download_file(self, s3_prefix: str, local_path: Union[str, Path]) -> None:
+        """Download a file from S3 to local storage.
+
+        Args:
+            s3_prefix (str): Path to the file in S3 bucket
+            local_path (Union[str, Path]): Local directory path where the file should be downloaded
+        """
+
+        local_path = Path(local_path)
+        local_path.mkdir(parents=True, exist_ok=True)
+
+        target_file = local_path / Path(s3_prefix).name
+        self.s3_client.download_file(self.bucket_name, s3_prefix, str(target_file))
```
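Taken together, the class now supports unsigned access to the public course bucket and single-file downloads. A short usage example based on the code above; the bucket name and prefixes are taken from the Makefile defaults, and the import path follows this file's location in the repo.

```python
# Usage example for the updated S3Client, based on the diff above.
from second_brain_offline.infrastructure.aws.s3 import S3Client

# Public course bucket: no AWS credentials required thanks to no_sign_request.
client = S3Client(bucket_name="decodingml-public-data", no_sign_request=True)

# Download and extract the zipped Notion dataset (the same call the Makefile's
# `s3-download-notion-dataset` target performs through tools.use_s3).
client.download_folder(
    s3_prefix="second_brain_course/notion/notion.zip",
    local_path="data/notion",
)

# New in this commit: fetch a single object without unzipping it.
client.download_file(
    s3_prefix="second_brain_course/notion/notion.zip",
    local_path="data/downloads",
)
```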

apps/second-brain-offline/tests/pytest.ini

Lines changed: 0 additions & 5 deletions
This file was deleted.
