CogStack
diff --git a/‎medcat-v1/.github/workflows/main.yml‎
Lines changed: 1 addition & 1 deletion b/‎medcat-v1/.github/workflows/main.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎medcat-v1/.github/workflows/production.yml‎
Lines changed: 1 addition & 1 deletion b/‎medcat-v1/.github/workflows/production.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎medcat-v1/.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎medcat-v1/.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎medcat-v1/.readthedocs.yaml‎
Lines changed: 2 additions & 2 deletions b/‎medcat-v1/.readthedocs.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎medcat-v1/README.md‎
Lines changed: 5 additions & 1 deletion b/‎medcat-v1/README.md‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎medcat-v1/docs/requirements.txt‎
Lines changed: 2 additions & 2 deletions b/‎medcat-v1/docs/requirements.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎medcat-v1/install_requires.txt‎
Lines changed: 1 addition & 1 deletion b/‎medcat-v1/install_requires.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎medcat-v1/medcat/cat.py‎
Lines changed: 1 addition & 1 deletion b/‎medcat-v1/medcat/cat.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎medcat-v1/medcat/config_rel_cat.py‎
Lines changed: 85 additions & 13 deletions b/‎medcat-v1/medcat/config_rel_cat.py‎
Lines changed: 85 additions & 13 deletions
diff --git a/‎medcat-v1/medcat/meta_cat.py‎
Lines changed: 4 additions & 3 deletions b/‎medcat-v1/medcat/meta_cat.py‎
Lines changed: 4 additions & 3 deletions
@@ -76,7 +76,7 @@ jobs:
       github.ref == 'refs/heads/master' &&
       github.event_name == 'push' &&
       startsWith(github.ref, 'refs/tags') != true
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     timeout-minutes: 45
     concurrency: publish-to-test-pypi
     needs: [build]
 
@@ -8,7 +8,7 @@ on:
 
 jobs:
   build-n-publish-to-pypi:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     concurrency: build-n-publish-to-pypi
     if: github.repository == 'CogStack/MedCAT'
 
 
@@ -55,3 +55,4 @@ tests/model_creator/output/*
 docs/auto/
 docs/_build
 
+models/
@@ -15,5 +15,5 @@ sphinx:
 python:
   install:
     - requirements: docs/requirements.txt
-    - method: setuptools
-      path: .
+    - method: pip
+      path: .
@@ -1,5 +1,9 @@
 # Medical  <img src="https://raw.githubusercontent.com/CogStack/MedCAT/master/media/cat-logo.png" width=45> oncept Annotation Tool
 
+**NB! [MedCAT v2](https://github.com/CogStack/MedCAT2) will soon be released.**
+**MedCAT v1.16.0 will be the last MedCAT v1 minor release.**
+**However, we will likely continue to support v1.16.x with patch releases for some time.**
+
 [![Build Status](https://github.com/CogStack/MedCAT/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/CogStack/MedCAT/actions/workflows/main.yml?query=branch%3Amaster)
 [![Documentation Status](https://readthedocs.org/projects/medcat/badge/?version=latest)](https://medcat.readthedocs.io/en/latest/?badge=latest)
 [![Latest release](https://img.shields.io/github/v/release/CogStack/MedCAT)](https://github.com/CogStack/MedCAT/releases/latest)
@@ -47,7 +51,7 @@ Normal installations of MedCAT will install torch-gpu and all relevant dependanc
 
 To install the latest version of MedCAT without torch GPU support run the following command:
 ```
-pip install medcat --extra_index_url https://download.pytorch.org/whl/cpu/
+pip install medcat --extra-index-url https://download.pytorch.org/whl/cpu/
 ```
 ## Demo
 A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT.
 
@@ -30,7 +30,7 @@ flake8==7.0.0
 frozenlist==1.5.0
 fsspec==2024.6.1
 gensim==4.3.3
-huggingface-hub==0.27.1
+huggingface-hub==0.30.2
 idna==3.10
 ipython==8.31.0
 ipywidgets==8.1.5
@@ -90,7 +90,7 @@ tomli==2.2.1
 torch==2.5.1
 tqdm==4.67.1
 traitlets==5.14.3
-transformers==4.47.1
+transformers==4.51.3
 triton==3.1.0
 typer==0.15.1
 types-PyYAML==6.0.3
 
@@ -3,7 +3,7 @@
 'gensim>=4.3.0,<5.0.0'  # 4.3.0 is first to support 3.11; avoid major version bump
 'spacy>=3.6.0,<4.0.0'  # avoid major bump
 'scipy>=1.9.2,<1.14.0'  # 1.9.2 is first to support 3.11; 1.14.0 does not support 3.9
-'transformers>=4.34.0,<5.0.0'  # avoid major version bump
+'transformers>=4.48.1,<5.0.0'  # avoid major version bump
 'accelerate>=0.23.0' # required by Trainer class in de-id
 'torch>=2.4.0,<3.0.0' # 2.4.0 is first to support 3.12; avoid major 3.0.0 for now
 'tqdm>=4.27'
 
@@ -143,7 +143,7 @@ def _create_pipeline(self, config: Config):
             self.pipe.add_meta_cat(meta_cat, meta_cat.config.general.category_name)
 
         for rel_cat in self._rel_cats:
-            self.pipe.add_rel_cat(rel_cat, "_".join(list(rel_cat.config.general["labels2idx"].keys())))
+            self.pipe.add_rel_cat(rel_cat, "_".join(list(rel_cat.component.relcat_config.general["labels2idx"].keys())))
 
         # Set max document length
         self.pipe.spacy_nlp.max_length = config.preprocessing.max_document_length
 
@@ -1,5 +1,6 @@
+import os
 import logging
-from typing import Dict, Any, List
+from typing import Any, Dict, List, Tuple, Union, cast
 from medcat.config import MixingConfig, BaseModel, Optional
 
 
@@ -21,10 +22,14 @@ class General(MixingConfig, BaseModel):
     window_size: int = 300
     """Max acceptable distance between entities (in characters); take care when using this, as it can produce sentences that are over 512 tokens (limit is given by tokenizer)"""
 
-    mct_export_max_non_rel_sample_size:int = 200
+    limit_samples_per_class: int = -1
+    """Number of samples per class, this limit is applied for train samples, so if train samples are 100 then test would be 20."""
+    addl_rels_max_sample_size:int = 200
     """Limit the number of 'Other' samples selected for training/test. This is applied per encountered medcat project, sample_size/num_projects. """
-    mct_export_create_addl_rels: bool = False
-    """When processing relations from a MedCAT export, relations labeled as 'Other' are created from all the annotations pairs available"""
+    create_addl_rels: bool = False
+    """When processing relations from a MedCAT export/docs, relations labeled as 'Other' are created from all the annotations pairs available"""
+    create_addl_rels_by_type: bool = False
+    """When creating the 'Other' relation class, actually split this class into subclasses based on concept types"""
 
     tokenizer_name: str = "bert"
     """The name of the tokenizer used.
@@ -46,21 +51,47 @@ class General(MixingConfig, BaseModel):
     """Tokenizer.
 
     NB! For these changes to take effect, the pipe would need to be recreated."""
-    annotation_schema_tag_ids: List = []
+    annotation_schema_tag_ids: List = [30522, 30523, 30524, 30525]
     """If a foreign non-MCAT trainer dataset is used, you can insert your own Rel entity token delimiters into the tokenizer, \
-    copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce"""
-    labels2idx: Dict = {}
-    idx2labels: Dict = {}
+    copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce
+    for example: 30522 - [s1], 30523 - [e1], 30524 - [s2], 30525 - [e2], 30526 - [BLANK], 30527 - [ENT1], 30528 - [ENT2], 30529 - [/ENT1], 30530 - [/ENT2]
+    Please note that the tokenizer special tokens are supposed to be in pairs of two for example [s1] and [e1], [s2] and [e2], the [BLANK] is just an example placeholder token
+    If you have more than four tokens here then you need to make sure they are present in the text, 
+    otherwise the pipeline will throw an error in the get_annotation_schema_tag() function.
+    """
+
+    tokenizer_relation_annotation_special_tokens_tags: List[str] = ["[s1]", "[e1]", "[s2]", "[e2]"]
+
+    tokenizer_other_special_tokens: Dict[str, str] = {"pad_token": "[PAD]"}
+    """
+    The special tokens used by the tokenizer. The [PAD] token is for the Llama tokenizer."""
+
+    labels2idx: Dict[str, int] = {}
+    idx2labels: Dict[int, str] = {}
+
     pin_memory: bool = True
+    """If True the data loader will copy the tensors to the GPU pinned memory"""
+
     seed: int = 13
     """The seed for random number generation.
 
-    NOTE: If used along MetaCAT or additional NER, only one of the seeds will take effect
     NB! For these changes to take effect, the pipe would need to be recreated."""
     task: str = "train"
-    """The task for RelCAT.
+    """The task for RelCAT."""
 
-    NB! For these changes to take effect, the pipe would need to be recreated."""
+    language: str = "en"
+    """Used for Spacy lang setting"""
+
+    @classmethod
+    def convert_keys_to_int(cls, value):
+        if isinstance(value, dict):
+            return {int(k): v for k, v in value.items()}
+        return value
+
+    def __setattr__(self, key: str, value: Any):
+        if key == "idx2labels" and isinstance(value, dict):
+            value = self.convert_keys_to_int(value)  # Ensure conversion
+        super().__setattr__(key, value)
 
 
 class Model(MixingConfig, BaseModel):
@@ -82,12 +113,18 @@ class Model(MixingConfig, BaseModel):
     num_directions: int = 2
     """2 - bidirectional model, 1 - unidirectional"""
 
+    freeze_layers: bool = True
+    """If True, the model layers are frozen, i.e. their weights are not updated during training"""
+
     padding_idx: int = -1
     emb_grad: bool = True
     """If True the embeddings will also be trained"""
     ignore_cpos: bool = False
     """If set to True center positions will be ignored when calculating representation"""
 
+    llama_use_pooled_output: bool = False
+    """If set to True, used only in Llama model, it will add the extra tensor formed from selecting the max of the last hidden layer"""
+
     class Config:
         extra = 'allow'
         validate_assignment = True
@@ -98,9 +135,24 @@ class Train(MixingConfig, BaseModel):
     nclasses: int = 2
     """Number of classes that this model will output"""
     batch_size: int = 25
+    """batch size"""
     nepochs: int = 1
+    """Epochs"""
     lr: float = 1e-4
-    adam_epsilon: float = 1e-4
+    """Learning rate"""
+    stratified_batching: bool = False
+    """Train the model with stratified batching"""
+    batching_samples_per_class: list = []
+    """Number of samples per class in each batch
+    example for batch size 64: [6,6,6,8,8,8,6,8,8]"""
+    batching_minority_limit: Union[List[int], int] = 0
+    """Maximum number of samples the minority class can have.
+    Since the minority class elements need to be repeated, this is used to facilitate that
+    example: batching_samples_per_class - [6,6,6,8,8,8,6,8,8]
+             batching_minority_limit - 6"""
+    adam_betas: Tuple[float, float] = (0.9, 0.999)
+    adam_weight_decay: float = 0
+    adam_epsilon: float = 1e-8
     test_size: float = 0.2
     gradient_acc_steps: int = 1
     multistep_milestones: List[int] = [
@@ -109,7 +161,8 @@ class Train(MixingConfig, BaseModel):
     max_grad_norm: float = 1.0
     shuffle_data: bool = True
     """Used only during training, if set the dataset will be shuffled before train/test split"""
-    class_weights: Optional[Any] = None
+    class_weights: Union[List[float], None] = None
+    enable_class_weights: bool = False
     score_average: str = "weighted"
     """What to use for averaging F1/P/R across labels"""
     auto_save_model: bool = True
@@ -129,3 +182,22 @@ class ConfigRelCAT(MixingConfig, BaseModel):
     class Config:
         extra = 'allow'
         validate_assignment = True
+
+    @classmethod
+    def load(cls, load_path: str = "./") -> "ConfigRelCAT":
+        """Load the config from a file.
+
+        Args:
+            load_path (str): Path to RelCAT config. Defaults to "./".
+
+        Returns:
+            ConfigRelCAT: The loaded config.
+        """
+        config = cls()
+        if os.path.exists(load_path):
+            if "config.json" not in load_path:
+                load_path = os.path.join(load_path, "config.json")
+            config = cast(ConfigRelCAT, super().load(load_path))
+            logging.info("Loaded config.json")
+
+        return config
@@ -95,8 +95,9 @@ def get_model(self, embeddings: Optional[Tensor]) -> nn.Module:
             if not config.model.model_freeze_layers:
                 peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=16,
                                          target_modules=["query", "value"], lora_dropout=0.2)
-
-                model = get_peft_model(model, peft_config)
+                # Not sure what changed between transformers 4.50.3 and 4.50.1 that made this
+                # fail for mypy. But as best as I can tell, it still works just the same
+                model = get_peft_model(model, peft_config)  # type: ignore
                 # model.print_trainable_parameters()
 
             logger.info("BERT model used for classification")
@@ -412,7 +413,7 @@ def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "MetaCA
             tokenizer = TokenizerWrapperBPE.load(save_dir_path)
         elif config.general['tokenizer_name'] == 'bert-tokenizer':
             from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT
-            tokenizer = TokenizerWrapperBERT.load(save_dir_path, config.model['model_variant'])
+            tokenizer = TokenizerWrapperBERT.load(save_dir_path, config.model.model_variant)
 
         # Create meta_cat
         meta_cat = cls(tokenizer=tokenizer, embeddings=None, config=config)
Original file line number	Diff line number	Diff line change
`@@ -55,3 +55,4 @@ tests/model_creator/output/*`
`55`	`55`	`docs/auto/`
`56`	`56`	`docs/_build`
`57`	`57`
	`58`	`+models/`