Commit 6d06c32

feat: Improve experience
1 parent 16b554f commit 6d06c32

20 files changed: +365 −729 lines
Lines changed: 11 additions & 75 deletions
```diff
@@ -1,80 +1,16 @@
-# AWS Configuration
-# AWS region for cloud services
-AWS_DEFAULT_REGION=eu-central-1
-# AWS access key for authentication
-AWS_ACCESS_KEY=""
-# AWS secret key for authentication
-AWS_SECRET_KEY=""
-# ARN for AWS cross-account access role
-AWS_CROSS_ACCOUNT_ROLE_ARN=""
-# Name of the S3 bucket for storing application data
-AWS_S3_BUCKET_NAME="decodingml-public-data"
-AWS_S3_PREFIX="second_brain_course/notion"
-AWS_S3_NOSIGN_REQUEST=True
+# --- Mandatory settings ---
 
-# CometML Configuration
-# API key for CometML integration
-COMET_API_KEY=""
-# CometML project name for tracking experiments
-COMET_PROJECT_NAME=""
+# OpenAI Config (starting with Lesson 2)
+OPENAI_API_KEY=
 
-# Data Fetching Limits
-# Maximum number of documents to fetch from the database
-# MAX_FETCH_LIMIT=50
+# Hugging Face Config (starting with Lesson 3)
+HUGGINGFACE_ACCESS_TOKEN=
 
-# Default Genre for Querying
-# DEFAULT_GENRE="Western"
+# Comet ML and Opik Config (starting with Lesson 4)
+COMET_API_KEY=
 
-# Docker and Network Configuration
-# Flag to indicate if the application is running inside a Docker container
-# IS_RUNNING_IN_DOCKER=True
-# Docker network for inter-container communication
-# DOCKER_NETWORK_NAME="zenml_network"
+# --- Optional settings ---
 
-# Enable Configurations
-# Flag to enable ingestion from MongoDB Atlas
-# ENABLE_MONGODB_ATLAS_INGESTION=False
-# Flag to enable offline mode (disables online ingestion)
-ENABLE_OFFLINE_MODE=True
-# Enable or disable structured logging
-# ENABLE_STRUCTURED_LOGGING=false
-
-# GROQ Configuration
-# API key for accessing GROQ services
-# GROQ_API_KEY=""
-
-# Hugging Face Configuration
-# Token for Hugging Face API
-# HUGGINGFACE_ACCESS_TOKEN=""
-HF_API_KEY=
-
-# Local Data Files
-# Path to the local JSON file for offline processing
-# LOCAL_JSON_FILE_PATH="data/sample_data_set.json"
-
-# MongoDB Configuration
-# Name of the database
-MONGODB_DATABASE_NAME=second_brain
-# Connection URI for local MongoDB instance
-MONGODB_OFFLINE_URI=mongodb://decodingml:decodingml@localhost:27017/?directConnection=true
-# MongoDB Atlas URI for cloud connection
-MONGODB_ONLINE_URI="" # Leave empty or add your cloud MongoDB URI when needed
-MONGODB_CTOR_SEARCH_INDEX_NAME=vector_index
-MONGODB_COLLECTION_NAME_RAG=rag_data
-
-# Optional MongoDB Settings
-# Name of the offline database
-# MONGODB_OFFLINE_DATABASE="rag_pipeline"
-# Name of the collection in the offline database
-# MONGODB_OFFLINE_COLLECTION="offline_documents"
-
-# Notion API Configuration
-# Secret key for accessing Notion API
-# Optional, set this key if you plan on using your own personal notion and S3 bucket.
-NOTION_SECRET_KEY=""
-
-# OpenAI API Configuration
-# API key for accessing OpenAI services
-OPENAI_API_KEY=""
-# Model identifier for OpenAI
-OPENAI_MODEL_ID="gpt-4o-mini"
+# Notion Config (starting with Lesson 1)
+# In case you want to collect data from your personal Notion database
+NOTION_SECRET_KEY=
```
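The trimmed-down environment template keeps only the keys a reader actually needs, introduced lesson by lesson. For context, here is a minimal sketch of how such a template is typically loaded with `pydantic-settings`, the same `BaseSettings`/`Field` pattern used by `config.py` later in this commit. The `env_file` wiring and the exact optionality of each field are assumptions; only the variable names come from the diff.

```python
# Minimal sketch of loading the simplified .env template with pydantic-settings.
# Assumption: the real Settings class in config.py is configured similarly; only
# the variable names below are confirmed by this diff, not the env_file wiring.
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class EnvSketch(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # --- Mandatory settings ---
    OPENAI_API_KEY: str = Field(description="OpenAI key, needed starting with Lesson 2.")
    HUGGINGFACE_ACCESS_TOKEN: str | None = Field(default=None, description="Needed starting with Lesson 3.")
    COMET_API_KEY: str | None = Field(default=None, description="Comet ML / Opik key, Lesson 4 onward.")

    # --- Optional settings ---
    NOTION_SECRET_KEY: str | None = Field(default=None, description="Only for collecting your own Notion data.")


if __name__ == "__main__":
    settings = EnvSketch()
    print("OpenAI key loaded:", bool(settings.OPENAI_API_KEY))
```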

apps/second-brain-offline/Makefile

Lines changed: 20 additions & 13 deletions
```diff
@@ -10,6 +10,7 @@ export PYTHONPATH = .
 # --- Default Values ---
 
 CHECK_DIRS := .
+AWS_S3_BUCKET_NAME := decodingml-public-data
 NOTION_LOCAL_DATA_PATH := data/notion
 CRAWLED_LOCAL_DATA_PATH := data/crawled
 
@@ -43,27 +44,31 @@ local-infrastructure-stop: local-docker-infrastructure-stop local-zenml-server-s
 
 # --- AWS ---
 
-s3-upload-raw-dataset: # Upload raw Notion dataset from local folder to S3
+validate_aws_boto3:
+	@echo "Validating AWS Boto3 credentials..."
+	uv run python -m tools.validate_aws_boto3
+
+s3-upload-notion-dataset: # Upload raw Notion dataset from local folder to S3
 	@echo "Uploading raw Notion dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion"
 	uv run python -m tools.use_s3 upload $(NOTION_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/notion
 
-s3-download-raw-dataset: # Download raw Notion dataset from S3 to local folder
+s3-download-notion-dataset: # Download raw Notion dataset from S3 to local folder
 	@echo "Downloading raw Notion dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion/notion.zip"
-	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH)
+	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH) --no-sign-request
 
 s3-upload-crawled-dataset: # Upload processed crawled dataset from local folder to S3
 	@echo "Uploading crawled dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled"
 	uv run python -m tools.use_s3 upload $(CRAWLED_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/crawled
 
 s3-download-crawled-dataset: # Download processed crawled dataset from S3 to local folder
 	@echo "Downloading crawled dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled/crawled.zip"
-	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/crawled/crawled.zip $(CRAWLED_LOCAL_DATA_PATH)
+	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/crawled/crawled.zip $(CRAWLED_LOCAL_DATA_PATH) --no-sign-request
 
-download-raw-dataset: s3-download-raw-dataset
+download-notion-dataset: s3-download-notion-dataset
 
 download-crawled-dataset: s3-download-crawled-dataset
 
-# --- Pipelines ---
+# --- Offline ML Pipelines ---
 
 collect-notion-data-pipeline:
 	uv run python -m tools.run --run-collect-notion-data-pipeline --no-cache
@@ -82,18 +87,20 @@ compute-rag-vector-index-pipeline:
 
 # --- Tests ---
 
-test-download-raw-dataset:
-	uv run pytest tests/test_download_raw_dataset.py -v
-
-test-etl-pipeline:
-	uv run pytest tests/test_etl_pipeline.py -v
+test-s3-download:
+	uv run pytest tests/test_s3.py -v
 
-test-rag-vector-index-pipeline:
-	uv run pytest tests/test_rag_vector_index_pipeline.py -v
+test-download-notion-dataset:
+	uv run pytest tests/test_download_notion_dataset.py -v
 
+test-download-crawled-dataset:
+	uv run pytest tests/test_download_crawled_dataset.py -v
 
 # --- QA ---
 
+test:
+	uv run pytest tests -v
+
 format-fix:
 	uv run ruff format $(CHECK_DIRS)
 	uv run ruff check --select I --fix
```
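The download targets now pass `--no-sign-request` to `tools.use_s3`, the bucket name becomes a Makefile default, and a `validate_aws_boto3` target checks credentials before any upload. The `tools/use_s3.py` module itself is not part of this commit, so the following is only a hedged sketch of how the flag could be forwarded to `S3Client(no_sign_request=...)`; the choice of `click` for the CLI is an assumption.

```python
# Hypothetical sketch of a tools/use_s3.py download command; the real module is
# not shown in this commit. Only the S3Client(no_sign_request=...) parameter is
# confirmed by the s3.py diff further down.
import click

from second_brain_offline.infrastructure.aws.s3 import S3Client


@click.group()
def cli() -> None:
    """Upload/download helpers for the course S3 bucket."""


@cli.command()
@click.argument("bucket_name")
@click.argument("s3_prefix")
@click.argument("local_path")
@click.option("--no-sign-request", is_flag=True, default=False,
              help="Access the public bucket without AWS credentials.")
def download(bucket_name: str, s3_prefix: str, local_path: str, no_sign_request: bool) -> None:
    """Download a zipped dataset from S3 and extract it locally."""
    client = S3Client(bucket_name=bucket_name, no_sign_request=no_sign_request)
    client.download_folder(s3_prefix=s3_prefix, local_path=local_path)


if __name__ == "__main__":
    cli()
```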

apps/second-brain-offline/README.md

Lines changed: 3 additions & 5 deletions
````diff
@@ -66,14 +66,13 @@ No code to run for this lesson. Read the lesson to understand the problem and ov
 
 Download our prepared Notion dataset from S3 (recommended):
 ```bash
-make download-raw-dataset
-# Validate using test: make test-download-raw-dataset
+make download-notion-dataset
+# Validate using test: make test-download-notion-dataset
 ```
 
 Or if you want to prepare your own Notion data (optional - if you want to use your own data):
 ```bash
 make collect-notion-data-pipeline
-# Validate using test: make test-download-raw-dataset
 ```
 
 ### Run the ETL pipeline
@@ -89,8 +88,8 @@ Running time: ~30 minutes
 If you want to avoid any costs or waiting times, you can use our pre-computed dataset to populate MongoDB. Also, as crawling can often fail, you can use this dataset to skip the crawling step:
 ```bash
 make download-crawled-dataset
+# Validate using test: make test-download-crawled-dataset
 make etl-precomputed-pipeline
-# Validate using test: make test-etl-pipeline
 ```
 
 ## Lesson 3: Generate Fine-tuning Dataset
@@ -118,7 +117,6 @@ This time we will use Notebooks, as they are popular when it comes to LLM fine-t
 
 ```bash
 make compute-rag-vector-index-pipeline
-# Validate using test: make test-rag-vector-index-pipeline
 ```
 
 ## Lesson 6: Agentic App
````
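The README now points each download target at a matching pytest check (`make test-download-notion-dataset`, `make test-download-crawled-dataset`). Those test files are not included in this excerpt; below is a hedged sketch of what such a check could look like, assuming it only verifies that the dataset landed under the Makefile's `data/notion` default path.

```python
# Hypothetical sketch of tests/test_download_notion_dataset.py; the real test is
# not shown in this excerpt. Assumption: it only asserts that the downloaded
# dataset exists and is non-empty under the Makefile's NOTION_LOCAL_DATA_PATH.
from pathlib import Path

NOTION_LOCAL_DATA_PATH = Path("data/notion")


def test_notion_dataset_downloaded() -> None:
    assert NOTION_LOCAL_DATA_PATH.exists(), (
        "Run `make download-notion-dataset` before running this test."
    )
    files = [p for p in NOTION_LOCAL_DATA_PATH.rglob("*") if p.is_file()]
    assert len(files) > 0, "The Notion dataset folder is empty."
```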

apps/second-brain-offline/configs/collect_notion_data.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,4 +9,4 @@ parameters:
   - f54dbddcaa4c43c7ae17935716761536
   - 31fcaab5a9404d41b922897d32b901b3
   data_dir: data/
-  to_s3: true
+  to_s3: false
```
Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 parameters:
   data_dir: data/
   load_collection_name: raw
-  to_s3: true
+  to_s3: false
   max_workers: 4
   quality_agent_model_id: gpt-4o-mini
   quality_agent_mock: false
```
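Both pipeline configs now default `to_s3: false`, so readers no longer need S3 write access; uploading to the public bucket becomes opt-in. The pipeline step that consumes this parameter is not shown in this commit; the sketch below only illustrates how such a flag could gate the upload, reusing `S3Client.upload_folder` from the diff further down. The `maybe_upload` helper and the ZenML step structure are assumptions.

```python
# Hypothetical sketch of honoring `to_s3` from the YAML configs; the real
# pipeline step is not shown in this commit. Only the YAML keys and
# S3Client.upload_folder are confirmed by the diffs in this page.
import yaml

from second_brain_offline.infrastructure.aws.s3 import S3Client


def maybe_upload(config_path: str, bucket_name: str = "decodingml-public-data") -> None:
    with open(config_path) as f:
        params = yaml.safe_load(f)["parameters"]

    if not params.get("to_s3", False):
        print("to_s3 is false; skipping S3 upload.")
        return

    # Requires AWS credentials with write access to the bucket.
    client = S3Client(bucket_name=bucket_name)
    client.upload_folder(local_path=params["data_dir"], s3_prefix="second_brain_course/notion")


maybe_upload("configs/collect_notion_data.yaml")
```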

apps/second-brain-offline/src/second_brain_offline/config.py

Lines changed: 1 addition & 23 deletions
```diff
@@ -27,10 +27,6 @@ class Settings(BaseSettings):
         default="decodingml-public-data",
         description="Name of the S3 bucket for storing application data.",
     )
-    AWS_S3_NOSIGN_REQUEST: bool = Field(
-        default=False,
-        description="Flag to enable unauthenticated S3 bucket access. If True, bypasses AWS authentication.",
-    )
 
     # --- Comet ML & Opik Configuration ---
     COMET_API_KEY: str | None = Field(
@@ -57,14 +53,10 @@ class Settings(BaseSettings):
         default="second_brain_course",
         description="Name of the MongoDB database.",
     )
-    MONGODB_OFFLINE_URI: str = Field(
+    MONGODB_URI: str = Field(
         default="mongodb://decodingml:decodingml@localhost:27017/?directConnection=true",
         description="Connection URI for the local MongoDB Atlas instance.",
     )
-    MONGODB_ONLINE_URI: str | None = Field(
-        default=None,
-        description="Connection URI for the Cloud MongoDB Atlas instance.",
-    )
 
     # --- Notion API Configuration ---
     NOTION_SECRET_KEY: str | None = Field(
@@ -81,20 +73,6 @@ class Settings(BaseSettings):
         description="API key for OpenAI service authentication.",
     )
 
-    @property
-    def MONGODB_URI(self) -> str:
-        """
-        Returns the appropriate MongoDB URI based on ENABLE_OFFLINE_MODE.
-        """
-        if self.IS_OFFLINE_MODE is True:
-            return self.MONGODB_OFFLINE_URI
-
-        assert self.MONGODB_ONLINE_URI is not None, (
-            "MONGODB_ONLINE_URI is not set, while ENABLE_OFFLINE_MODE is False."
-        )
-
-        return self.MONGODB_ONLINE_URI
-
     @field_validator("OPENAI_API_KEY")
     @classmethod
     def check_not_empty(cls, value: str, info) -> str:
```
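With the offline/online split removed, `MONGODB_URI` is now a plain field defaulting to the local Docker instance, and the property that switched between `MONGODB_OFFLINE_URI` and `MONGODB_ONLINE_URI` is gone. Callers can read the field directly; here is a minimal sketch using `pymongo`, assuming the module exports a `settings` instance (the consumer code itself is not part of this diff).

```python
# Minimal sketch of consuming the simplified settings; assumes `settings` is the
# Settings instance exported by second_brain_offline.config (the consumer code
# itself is not shown in this commit).
from pymongo import MongoClient

from second_brain_offline.config import settings

# One URI for everyone: local MongoDB by default, or whatever MONGODB_URI
# points at in .env (e.g. a MongoDB Atlas connection string).
client = MongoClient(settings.MONGODB_URI)
db = client[settings.MONGODB_DATABASE_NAME]
print(db.list_collection_names())
```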

apps/second-brain-offline/src/second_brain_offline/infrastructure/aws/s3.py

Lines changed: 36 additions & 17 deletions
```diff
@@ -16,21 +16,23 @@ class S3Client:
     def __init__(
         self,
         bucket_name: str,
+        no_sign_request: bool = False,
         region: str = settings.AWS_DEFAULT_REGION,
-        aws_s3_no_sign_request: bool = settings.AWS_S3_NOSIGN_REQUEST,
     ) -> None:
         """Initialize S3 client and bucket name.
 
         Args:
-            bucket_name: Name of the S3 bucket
-            region: AWS region (defaults to AWS_DEFAULT_REGION or AWS_REGION env var, or 'us-east-1')
-            aws_s3_no_sign_request: if True will access S3 un-authenticated for public buckets, if False will use the AWS credentials set by the user
+            bucket_name (str): Name of the S3 bucket
+            no_sign_request (bool, optional): If True will access S3 un-authenticated for public buckets.
+                If False will use the AWS credentials set by the user. Defaults to False.
+            region (str, optional): AWS region. Defaults to AWS_DEFAULT_REGION or AWS_REGION env var,
+                or 'us-east-1'.
         """
 
         self.region = region
         self.bucket_name = bucket_name
-        self.aws_s3_no_sign_request = aws_s3_no_sign_request
-        if self.aws_s3_no_sign_request:
+        self.no_sign_request = no_sign_request
+        if self.no_sign_request:
             # Use unsigned mode for public buckets
             self.s3_client = boto3.client(
                 "s3",
@@ -42,12 +44,15 @@ def __init__(
             self.s3_client = boto3.client("s3", region_name=self.region)
 
     def upload_folder(self, local_path: Union[str, Path], s3_prefix: str = "") -> None:
-        """
-        Upload a local folder as a zip file to S3.
+        """Upload a local folder as a zip file to S3.
 
         Args:
-            local_path: Path to the local folder
-            s3_prefix: Optional prefix (folder path) in S3 bucket
+            local_path (Union[str, Path]): Path to the local folder
+            s3_prefix (str, optional): Optional prefix (folder path) in S3 bucket. Defaults to "".
+
+        Raises:
+            FileNotFoundError: If the local path does not exist
+            NotADirectoryError: If the local path is not a directory
         """
         # Ensure bucket exists before proceeding
         self.__create_bucket_if_doesnt_exist()
@@ -84,9 +89,10 @@ def upload_folder(self, local_path: Union[str, Path], s3_prefix: str = "") -> No
             os.unlink(temp_zip.name)
 
     def __create_bucket_if_doesnt_exist(self) -> None:
-        """
-        Check if bucket exists and create it if it doesn't.
-        Raises permission-related exceptions if user lacks necessary permissions.
+        """Check if bucket exists and create it if it doesn't.
+
+        Raises:
+            Exception: If bucket creation fails or if user lacks necessary permissions
         """
         try:
             self.s3_client.head_bucket(Bucket=self.bucket_name)
@@ -108,12 +114,11 @@ def __create_bucket_if_doesnt_exist(self) -> None:
             raise
 
     def download_folder(self, s3_prefix: str, local_path: Union[str, Path]) -> None:
-        """
-        Download a zipped folder from S3 and extract it to local storage.
+        """Download a zipped folder from S3 and extract it to local storage.
 
         Args:
-            s3_prefix: Prefix (folder path) in S3 bucket
-            local_path: Local path where files should be extracted
+            s3_prefix (str): Prefix (folder path) in S3 bucket pointing to the zip file
+            local_path (Union[str, Path]): Local path where files should be extracted
         """
         local_path = Path(local_path)
 
@@ -135,3 +140,17 @@ def download_folder(self, s3_prefix: str, local_path: Union[str, Path]) -> None:
 
         # Clean up temporary zip file
         os.unlink(temp_zip.name)
+
+    def download_file(self, s3_prefix: str, local_path: Union[str, Path]) -> None:
+        """Download a file from S3 to local storage.
+
+        Args:
+            s3_prefix (str): Path to the file in S3 bucket
+            local_path (Union[str, Path]): Local directory path where the file should be downloaded
+        """
+
+        local_path = Path(local_path)
+        local_path.mkdir(parents=True, exist_ok=True)
+
+        target_file = local_path / Path(s3_prefix).name
+        self.s3_client.download_file(self.bucket_name, s3_prefix, str(target_file))
```
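Taken together, the class now supports unsigned access to the public course bucket and single-file downloads. A short usage example based on the code above; the bucket name and prefixes are taken from the Makefile defaults, and the import path follows this file's location in the repo.

```python
# Usage example for the updated S3Client, based on the diff above.
from second_brain_offline.infrastructure.aws.s3 import S3Client

# Public course bucket: no AWS credentials required thanks to no_sign_request.
client = S3Client(bucket_name="decodingml-public-data", no_sign_request=True)

# Download and extract the zipped Notion dataset (the same call the Makefile's
# `s3-download-notion-dataset` target performs through tools.use_s3).
client.download_folder(
    s3_prefix="second_brain_course/notion/notion.zip",
    local_path="data/notion",
)

# New in this commit: fetch a single object without unzipping it.
client.download_file(
    s3_prefix="second_brain_course/notion/notion.zip",
    local_path="data/downloads",
)
```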

apps/second-brain-offline/tests/pytest.ini

Lines changed: 0 additions & 5 deletions
This file was deleted.
