tomasruizt
diff --git a/‎README.md‎
Lines changed: 6 additions & 0 deletions b/‎README.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎llmlib/llmlib/llama3/.gitignore‎
Lines changed: 0 additions & 1 deletion b/‎llmlib/llmlib/llama3/.gitignore‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎llmlib/llmlib/llama3/README.md‎
Lines changed: 0 additions & 5 deletions b/‎llmlib/llmlib/llama3/README.md‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎llmlib/llmlib/llama3/__init__.py‎
Lines changed: 0 additions & 3 deletions b/‎llmlib/llmlib/llama3/__init__.py‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎llmlib/llmlib/llama3/llama3_vision_8b.py‎
Lines changed: 0 additions & 67 deletions b/‎llmlib/llmlib/llama3/llama3_vision_8b.py‎
Lines changed: 0 additions & 67 deletions
diff --git a/‎llmlib/llmlib/runtime.py‎
Lines changed: 0 additions & 2 deletions b/‎llmlib/llmlib/runtime.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎tests/test_llama3.py‎
Lines changed: 0 additions & 17 deletions b/‎tests/test_llama3.py‎
Lines changed: 0 additions & 17 deletions
@@ -19,3 +19,9 @@ docker compose up rest_api
 * The open-source implementations are based on the `transformers` library. I have experimented with `vLLM`, but it made the GPU run out of memory (OOM). More fiddling is needed.
 * I have extracted a REST API using `FastAPI` to decouple the frontend streamlit code from the inference server.
 * The app supports small open-source models at the moment, because the inference server runs on a single GPU with 24GB of VRAM. We will hopefully scale this backend up soon.
+
+## Archive: Installation Tips
+Installation for the quantized model in `llama_cpp`:
+```shell
+CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCUDA_PATH=/usr/local/cuda-12.5 -DCUDAToolkit_ROOT=/usr/local/cuda-12.5 -DCUDAToolkit_INCLUDE_DIR=/usr/local/cuda-12/include -DCUDAToolkit_LIBRARY_DIR=/usr/local/cuda-12.5/lib64" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
+```
@@ -2,7 +2,6 @@
 from .gemini.gemini_code import GeminiAPI
 from .gemma import PaliGemma2
 from .minicpm import MiniCPM
-from .llama3 import LLama3Vision8B
 from .model_registry import ModelEntry, ModelRegistry, model_entries_from_mult_ids
 from .openai.openai_completion import OpenAIModel
 from .phi3.phi3 import Phi3Vision
@@ -14,7 +13,6 @@ def filled_model_registry() -> ModelRegistry:
             *model_entries_from_mult_ids(MiniCPM),
             ModelEntry.from_cls_with_id(Apollo7B),
             ModelEntry.from_cls_with_id(Phi3Vision),
-            ModelEntry.from_cls_with_id(LLama3Vision8B),
             ModelEntry.from_cls_with_id(PaliGemma2),
             *model_entries_from_mult_ids(OpenAIModel),
             *model_entries_from_mult_ids(GeminiAPI),