Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class Settings(BaseSettings): # type: ignore
TRAINING_METRICS_LOGGING_INTERVAL: int = 5 # the number of steps after which training metrics will be collected
TRAINING_SAFE_MODEL_SERIALISATION: str = "false" # if "true", serialise the trained model using safe tensors
TRAINING_CACHE_DIR: str = os.path.join(os.path.abspath(os.path.dirname(__file__)), "cms_cache") # the directory to cache the intermediate files created during training
TRAINING_HF_TAGGING_SCHEME: str = "flat" # the tagging scheme during the Hugging Face NER model training, either "flat", "iob" or "iobes"
HF_PIPELINE_AGGREGATION_STRATEGY: str = "simple" # the strategy used for aggregating the predictions of the Hugging Face NER model
LOG_PER_CONCEPT_ACCURACIES: str = "false" # if "true", per-concept accuracies will be exposed to the metrics scraper. Switch this on with caution due to the potentially high number of concepts
MEDCAT2_MAPPED_ONTOLOGIES: str = "" # the comma-separated names of ontologies for MedCAT2 to map to
Expand Down
10 changes: 10 additions & 0 deletions app/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ class Device(str, Enum):
MPS = "mps"


class TaggingScheme(str, Enum):
    """Tagging schemes accepted for Hugging Face NER model training.

    The member values are the lowercase strings "flat", "iob" and "iobes".
    Because the enum also derives from ``str``, each member compares equal
    to its plain string value, and ``TaggingScheme("iobes")`` resolves a
    string to the matching member.
    """

    # NOTE(review): presumably one label per concept with no positional prefix — confirm with the trainer.
    FLAT = "flat"
    # NOTE(review): presumably Inside/Outside/Beginning prefixed labels — confirm with the trainer.
    IOB = "iob"
    # NOTE(review): presumably IOB extended with End/Single prefixes — confirm with the trainer.
    IOBES = "iobes"


class HfTransformerBackbone(Enum):
ALBERT = "albert"
BIG_BIRD = "bert"
Expand Down Expand Up @@ -110,20 +116,24 @@ class LlmEngine(Enum):
CMS = "CMS"
VLLM = "vLLM"


class LlmRole(Enum):
    """Role identifiers with the values "system", "user", "assistant" and "tool".

    NOTE(review): presumably the role attached to each message of an LLM
    chat exchange — confirm against the callers that build prompts.
    """

    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"
    TOOL = "tool"


class LlmTrainerType(Enum):
    """Identifiers of the LLM trainer types, with the values "grpo" and "ppo".

    NOTE(review): presumably selects the reinforcement-learning algorithm
    used by the LLM trainer — confirm against the trainer implementation.
    """

    GRPO = "grpo"
    PPO = "ppo"


class LlmDatasetType(Enum):
    """Identifiers of the LLM dataset types, with the values "json" and "csv".

    NOTE(review): presumably the file format of the training dataset —
    confirm against the code that loads the data.
    """

    JSON = "json"
    CSV = "csv"


class Annotation(BaseModel):
doc_name: Optional[str] = Field(default=None, description="The name of the document to which the annotation belongs")
start: int = Field(description="The start index of the annotation span")
Expand Down
3 changes: 3 additions & 0 deletions app/envs/.env
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ TRAINING_SAFE_MODEL_SERIALISATION=false
# The strategy used for aggregating the predictions of the Hugging Face NER model
HF_PIPELINE_AGGREGATION_STRATEGY=simple

# The tagging scheme used when training the Hugging Face NER model: one of "flat", "iob" or "iobes"
TRAINING_HF_TAGGING_SCHEME=flat

# The comma-separated names of ontologies for MedCAT2 to map to
MEDCAT2_MAPPED_ONTOLOGIES=opcs4,icd10

Expand Down
25 changes: 16 additions & 9 deletions app/model_services/huggingface_llm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from app.exception import ConfigurationException
from app.model_services.base import AbstractModelService
from app.trainers.huggingface_llm_trainer import HuggingFaceLlmSupervisedTrainer
from app.domain import ModelCard, ModelType, Annotation
from app.domain import ModelCard, ModelType, Annotation, Device
from app.config import Settings
from app.utils import (
get_settings,
Expand Down Expand Up @@ -157,9 +157,19 @@ def load_model(
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)
if get_settings().DEVICE == Device.DEFAULT.value:
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
if get_settings().DEVICE == Device.DEFAULT.value:
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
ensure_tensor_contiguity(model)
tokenizer = AutoTokenizer.from_pretrained(
model_path,
Expand Down Expand Up @@ -242,8 +252,7 @@ def generate(
self.model.eval()

inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
if non_default_device_is_available(self._config.DEVICE):
inputs.to(get_settings().DEVICE)
inputs.to(self.model.device)

generation_kwargs = dict(
inputs=inputs.input_ids,
Expand Down Expand Up @@ -291,8 +300,7 @@ async def generate_async(
self.model.eval()

inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
if non_default_device_is_available(self._config.DEVICE):
inputs.to(get_settings().DEVICE)
inputs.to(self.model.device)

streamer = TextIteratorStreamer(
self.tokenizer,
Expand Down Expand Up @@ -363,8 +371,7 @@ def create_embeddings(
truncation=True,
)

if non_default_device_is_available(self._config.DEVICE):
inputs.to(get_settings().DEVICE)
inputs.to(self.model.device)

with torch.no_grad():
outputs = self.model(**inputs, output_hidden_states=True)
Expand Down
44 changes: 33 additions & 11 deletions app/model_services/huggingface_ner_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from app.exception import ConfigurationException
from app.model_services.base import AbstractModelService
from app.trainers.huggingface_ner_trainer import HuggingFaceNerUnsupervisedTrainer, HuggingFaceNerSupervisedTrainer
from app.domain import ModelCard, ModelType, Annotation
from app.domain import ModelCard, ModelType, Annotation, Device, TaggingScheme
from app.config import Settings
from app.utils import (
get_settings,
Expand All @@ -27,6 +27,7 @@
get_model_data_package_base_name,
load_pydantic_object_from_dict,
)
from app.processors.tagging import TagProcessor

logger = logging.getLogger("cms")

Expand All @@ -41,7 +42,7 @@ def __init__(
enable_trainer: Optional[bool] = None,
model_name: Optional[str] = None,
base_model_file: Optional[str] = None,
confidence_threshold: float = 0.5,
confidence_threshold: float = 0.7,
) -> None:
"""
Initialises the HuggingFace NER model service with specified configurations.
Expand All @@ -52,7 +53,7 @@ def __init__(
enable_trainer (Optional[bool]): The flag to enable or disable trainers. Defaults to None.
model_name (Optional[str]): The name of the model. Defaults to None.
base_model_file (Optional[str]): The model package file name. Defaults to None.
confidence_threshold (float): The threshold for the confidence score. Defaults to 0.5.
confidence_threshold (float): The threshold for the confidence score. Defaults to 0.7.
"""

super().__init__(config)
Expand Down Expand Up @@ -123,19 +124,20 @@ def from_model(cls, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase)
HuggingFaceNerModel: A HuggingFace NER model service.
"""

model_service = cls(get_settings(), enable_trainer=False)
_config = get_settings()
model_service = cls(_config, enable_trainer=False)
model_service.model = model
model_service.tokenizer = tokenizer
_pipeline = partial(
pipeline,
task="ner",
model=model_service.model,
tokenizer=model_service.tokenizer,
stride=10,
aggregation_strategy=get_settings().HF_PIPELINE_AGGREGATION_STRATEGY,
stride=32,
aggregation_strategy=_config.HF_PIPELINE_AGGREGATION_STRATEGY,
)
if non_default_device_is_available(get_settings().DEVICE):
model_service._ner_pipeline = _pipeline(device=get_hf_pipeline_device_id(get_settings().DEVICE))
if non_default_device_is_available(_config.DEVICE):
model_service._ner_pipeline = _pipeline(device=get_hf_pipeline_device_id(_config.DEVICE))
else:
model_service._ner_pipeline = _pipeline()
return model_service
Expand All @@ -160,7 +162,10 @@ def load_model(model_file_path: str, *args: Tuple, **kwargs: Dict[str, Any]) ->
model_path = os.path.join(os.path.dirname(model_file_path), get_model_data_package_base_name(model_file_path))
if unpack_model_data_package(model_file_path, model_path):
try:
model = AutoModelForTokenClassification.from_pretrained(model_path)
if get_settings().DEVICE == Device.DEFAULT.value:
model = AutoModelForTokenClassification.from_pretrained(model_path, device_map="auto")
else:
model = AutoModelForTokenClassification.from_pretrained(model_path)
ensure_tensor_contiguity(model)
tokenizer = AutoTokenizer.from_pretrained(
model_path,
Expand Down Expand Up @@ -197,7 +202,7 @@ def init_model(self, *args: Any, **kwargs: Any) -> None:
task="ner",
model=self._model,
tokenizer=self._tokenizer,
stride=10,
stride=32,
aggregation_strategy=self._config.HF_PIPELINE_AGGREGATION_STRATEGY,
)
if non_default_device_is_available(get_settings().DEVICE):
Expand Down Expand Up @@ -233,12 +238,29 @@ def annotate(self, text: str) -> List[Annotation]:
List[Annotation]: A list of annotations containing the extracted named entities.
"""

entities = self._ner_pipeline(text)
if TaggingScheme(self._config.TRAINING_HF_TAGGING_SCHEME.lower()) == TaggingScheme.IOBES:
entities = self._ner_pipeline(text, aggregation_strategy="none")
else:
entities = self._ner_pipeline(text)
df = pd.DataFrame(entities)

if df.empty:
columns = ["label_name", "label_id", "start", "end", "accuracy"]
df = pd.DataFrame(columns=(columns + ["text"]) if self._config.INCLUDE_SPAN_TEXT == "true" else columns)
elif TaggingScheme(self._config.TRAINING_HF_TAGGING_SCHEME.lower()) == TaggingScheme.IOBES:
aggregated_entities = TagProcessor.aggregate_bioes_predictions(
df,
text,
self._config.INCLUDE_SPAN_TEXT == "true",
)
df = pd.DataFrame(aggregated_entities)
if df.empty:
columns = ["label_name", "label_id", "start", "end", "accuracy"]
df = pd.DataFrame(
columns=(columns + ["text"]) if self._config.INCLUDE_SPAN_TEXT == "true" else columns
)
else:
df = df[df["accuracy"] >= self._confidence_threshold]
else:
for idx, row in df.iterrows():
df.loc[idx, "label_id"] = row["entity_group"]
Expand Down
Loading