Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
0a5b1d0
Merge pull request #43 from starfishdata/main
SoulEvill May 8, 2025
d51efb9
Switch to Async queue
jjiang10-sv May 12, 2025
b181c2e
add release pipeline
jjiang10-sv May 13, 2025
73af01e
add condition
jjiang10-sv May 13, 2025
ccc043a
add condition
jjiang10-sv May 13, 2025
aa6cf3c
add release pipeline (#47)
jjiang10-sv May 13, 2025
fcdf3d6
debug on pipeline
jjiang10-sv May 13, 2025
0da875e
Merge branch 'dev' of https://github.com/starfishdata/starfish into dev
jjiang10-sv May 13, 2025
799821c
fix merge
jjiang10-sv May 13, 2025
9aa87e3
Data Ingest first version
jjiang10-sv May 13, 2025
41f9eb4
prevents deadlocks when the same thread
SoulEvill May 21, 2025
50b4683
Update by using pydantic as source of default
SoulEvill May 21, 2025
c41f667
typo correction
SoulEvill May 21, 2025
d841922
first data gen template
SoulEvill May 21, 2025
2ab79e9
input validate
SoulEvill May 21, 2025
40d69c4
pydantic as source of truth
SoulEvill May 21, 2025
8b3386a
update topic generate template
jjiang10-sv May 21, 2025
8fde84b
F/data mcp (#50)
jjiang10-sv May 22, 2025
9362478
F/pypi release (#48)
jjiang10-sv May 22, 2025
1e400d0
update test pypi workflow
jjiang10-sv May 22, 2025
3ac73de
Add data gen template john (#52)
jjiang10-sv May 22, 2025
580866a
merge from dev
jjiang10-sv May 22, 2025
534db8e
merge code
jjiang10-sv May 22, 2025
f73c149
update data_template
jjiang10-sv May 23, 2025
1e2ed6e
update the template
jjiang10-sv May 23, 2025
4362195
merge from dev
jjiang10-sv May 23, 2025
0fb5aed
Add data gen template
SoulEvill May 23, 2025
b909564
rename and commit api gen data template
SoulEvill May 23, 2025
01cacd1
data_template/data_ingest (#54)
jjiang10-sv May 24, 2025
0696d89
update version
jjiang10-sv May 24, 2025
3c8755b
update version
jjiang10-sv May 24, 2025
2def288
trigger pipeline
jjiang10-sv May 24, 2025
5fc1821
trigger pipeline
jjiang10-sv May 24, 2025
1eb12e3
trigger pipeline
jjiang10-sv May 24, 2025
1f8ae75
trigger pipeline
jjiang10-sv May 24, 2025
9584d73
prepare for prod release
jjiang10-sv May 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ TELEMETRY_ENABLED=true

# Logging
LOG_LEVEL=INFO
# STARFISH_LOCAL_STORAGE_DIR=
# STARFISH_LOCAL_STORAGE_DIR=
JINA_AI_API_KEY=jina_api_key
10 changes: 6 additions & 4 deletions .github/workflows/lint-and-test.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
name: Starfish testing workflow

on:
push:
branches:
- main
- dev
# push:
# branches:
# - main
# - dev
pull_request:
branches:
- main
- dev
- '!f/pypi_release'

jobs:
test-integration:
if: github.event.pull_request.head.ref != 'f/pypi_release'
runs-on: ubuntu-latest

steps:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ on:
push:
tags:
- 'v*'
# branches:
# - 'main'

jobs:
deploy:
Expand All @@ -12,6 +14,14 @@ jobs:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Verify tag is on main branch
run: |
TAG_NAME=${GITHUB_REF#refs/tags/}
COMMIT=$(git rev-parse $TAG_NAME)
if ! git branch --contains $COMMIT | grep -qw main; then
echo "::error::Tag $TAG_NAME must be created from main branch"
exit 1
fi
- name: Set up Python
uses: actions/setup-python@v4
with:
Expand Down
107 changes: 107 additions & 0 deletions .github/workflows/publish_testpypi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
---
name: Publish to Test PyPI

# Triggered by pushing a `test-v*` tag, or by any push to the
# f/pypi_release working branch.
on:
  push:
    tags:
      - 'test-v*'
    branches:
      - 'f/pypi_release'

jobs:
  deploy_testpypi:
    # if: true
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0  # Required for full commit history check
      - name: Verify tag is on dev branch
        # Guard: this validation only makes sense for tag pushes. On a
        # branch push GITHUB_REF is refs/heads/..., the tag-prefix strip
        # below would be a no-op, and `git rev-parse` would fail the job.
        if: startsWith(github.ref, 'refs/tags/test-v')
        run: |
          TAG_NAME=${GITHUB_REF#refs/tags/}
          COMMIT=$(git rev-parse "$TAG_NAME")
          # Use remote-tracking branches: actions/checkout creates no local
          # branch named `dev`, so a plain `git branch --contains` would
          # never match it.
          if ! git branch -r --contains "$COMMIT" | grep -qw 'origin/dev'; then
            echo "::error::Tag $TAG_NAME must be created from dev branch"
            exit 1
          fi
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine
      - name: Build and publish
        env:
          # TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
          # ACTIONS_STEP_DEBUG: true
        run: |
          # echo "TWINE_PASSWORD first 5 chars: ${TWINE_PASSWORD:0:184}"
          # echo "TWINE_PASSWORD length: ${#TWINE_PASSWORD}"
          python -m build
          twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/*

  test-colab:
    needs: deploy_testpypi
    runs-on: ubuntu-latest
    # A public "Colab-like" image
    container:
      image: jupyter/minimal-notebook:latest
      options: --user root  # Run as root to avoid permission issues
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v3
        with:
          sparse-checkout: |
            tests/*
            examples/data_factory_release_check.ipynb
          sparse-checkout-cone-mode: false
      - name: Update system packages
        run: |
          apt-get update
          apt-get install -y libssl3  # No sudo needed: running as root
      - name: Print Python and Jupyter versions
        run: |
          python --version
          pip list | grep -E 'jupyter|ipykernel|nbconvert|notebook'
      # Authenticate to GCP
      # - name: Authenticate to GCP
      #   uses: google-github-actions/auth@v1
      #   with:
      #     credentials_json: ${{ secrets.GCP_SA_KEY }}

      # # Configure Docker to use GCR credentials
      # - name: Configure Docker for GCR
      #   uses: google-github-actions/docker-auth@v1

      # # Now you can pull the image
      # - name: Use Colab base image
      #   run: docker pull gcr.io/colab-images/base:latest

      # --no-prompt --no-input \ suppress the output
      - name: Run Colab-style tests
        # NOTE(review): `--inplace` and `--stdout` are normally mutually
        # exclusive in nbconvert — confirm this combination behaves as
        # intended on the pinned nbconvert version.
        run: |
          if ! jupyter nbconvert --execute --to notebook --inplace \
            --ExecutePreprocessor.kernel_name=python3 \
            --ExecutePreprocessor.timeout=120 \
            --no-prompt --no-input \
            --stdout \
            examples/data_factory_release_check.ipynb; then
            echo "::error::Notebook execution failed"
            exit 1
          fi
          echo "Notebook executed successfully. Summary:" && \
          jupyter nbconvert --to markdown --stdout \
            examples/data_factory_release_check.ipynb | \
            grep -E '^#|^##' || true

      # Add tag deletion step
      - name: Delete triggering tag after successful test
        if: startsWith(github.ref, 'refs/tags/test-v')
        # NOTE(review): this job runs inside jupyter/minimal-notebook, which
        # may not ship the `gh` CLI — confirm it is available, or install it
        # in a prior step.
        run: |
          gh api -X DELETE /repos/$GITHUB_REPOSITORY/git/refs/tags/${GITHUB_REF#refs/tags/}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# Adhoc stuff
input.json
output.json
.serena/
docs/
/vibe_coding/response.md
/dev/
todo
.local/
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "internal"]
path = internal
url = https://github.com/starfishdata/starfish_internal.git
[submodule "docs_mintlify"]
path = docs_mintlify
url = https://github.com/starfishdata/docs.git
20 changes: 16 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,21 @@ docstring:
test:
poetry run pytest tests/

install:
@echo "Installing dependencies..."
poetry install
poetry run pre-commit install --install-hooks
install: install-extras

#poetry install --extras "code_execution vllm" --with dev
# Install with specific extras
#make install EXTRAS="pdf"
# Install all extras
#make install EXTRAS="all"
# Install without extras (default)
#make install
install-extras:
@echo "Installing dependencies with extras: $(EXTRAS)"
poetry install $(if $(EXTRAS),--extras "$(EXTRAS)",) --with dev

start-client_claude:
python src/starfish/data_mcp/client_claude.py src/starfish/data_mcp/server.py

start-client_openai:
python src/starfish/data_mcp/client_openai.py
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,22 @@ Key Features:
pip install starfish-core
```

### Optional Dependencies

Starfish supports optional dependencies for specific file parsers. Install only what you need:

```bash
# Install specific parsers
pip install "starfish-core[pdf]" # PDF support
pip install "starfish-core[docx]" # Word document support
pip install "starfish-core[ppt]" # PowerPoint support
pip install "starfish-core[excel]" # Excel support
pip install "starfish-core[youtube]" # YouTube support

# Install all parser dependencies
pip install "starfish-core[all]"
```

## Configuration

Starfish uses environment variables for configuration. We provide a `.env.template` file to help you get started quickly:
Expand Down
1 change: 1 addition & 0 deletions docs_mintlify
Submodule docs_mintlify added at 6ad0ad
4 changes: 2 additions & 2 deletions examples/data_factory.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "starfish-T7IInzTH-py3.11",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
Expand All @@ -673,7 +673,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
"version": "3.11.4"
}
},
"nbformat": 4,
Expand Down
Loading