diff --git a/validmind/__init__.py b/validmind/__init__.py
index 55b2dd1d2..c99f3a537 100644
--- a/validmind/__init__.py
+++ b/validmind/__init__.py
@@ -43,7 +43,7 @@
 warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)
 
 from .__version__ import __version__  # noqa: E402
-from .api_client import init, log_metric, reload
+from .api_client import init, log_metric, log_text, reload
 from .client import (  # noqa: E402
     get_test_suite,
     init_dataset,
@@ -125,4 +125,5 @@ def check_version():
     "tests",
     "unit_metrics",
     "test_suites",
+    "log_text",
 ]
diff --git a/validmind/api_client.py b/validmind/api_client.py
index 7abd6374a..c5755daaa 100644
--- a/validmind/api_client.py
+++ b/validmind/api_client.py
@@ -18,11 +18,12 @@
 import aiohttp
 import requests
 from aiohttp import FormData
+from ipywidgets import HTML, Accordion
 
 from .client_config import client_config
 from .errors import MissingAPICredentialsError, MissingModelIdError, raise_api_error
 from .logging import get_logger, init_sentry, send_single_error
-from .utils import NumpyEncoder, run_async
+from .utils import NumpyEncoder, is_html, md_to_html, run_async
 from .vm_models import Figure
 
 logger = get_logger(__name__)
@@ -407,6 +408,44 @@ def log_input(input_id: str, type: str, metadata: Dict[str, Any]) -> Dict[str, A
     return run_async(alog_input, input_id, type, metadata)
 
 
+def log_text(
+    content_id: str, text: str, _json: Optional[Dict[str, Any]] = None
+) -> Accordion:
+    """Log free-form text to the ValidMind API.
+
+    Text that `is_html` does not recognize as HTML is converted with
+    `md_to_html(text, mathml=True)` before it is logged.
+
+    Args:
+        content_id (str): Unique content identifier for the text.
+        text (str): The text to log. Converted to HTML (with MathML support)
+            unless it is already HTML.
+        _json (dict, optional): Additional metadata to associate with the text.
+            Defaults to None.
+
+    Raises:
+        ValueError: If content_id or text are empty or not strings.
+        Exception: Any error raised by the underlying API call is propagated.
+
+    Returns:
+        ipywidgets.Accordion: An accordion widget containing the logged text as HTML.
+    """
+    if not content_id or not isinstance(content_id, str):
+        raise ValueError("`content_id` must be a non-empty string")
+    if not text or not isinstance(text, str):
+        raise ValueError("`text` must be a non-empty string")
+
+    if not is_html(text):
+        text = md_to_html(text, mathml=True)
+
+    # Named `logged` so the local does not shadow this function's own name.
+    logged = run_async(alog_metadata, content_id, text, _json)
+
+    return Accordion(
+        children=[HTML(logged["text"])],
+        titles=[f"Text Block: '{logged['content_id']}'"],
+    )
+
+
 async def alog_metric(
     key: str,
     value: Union[int, float],
diff --git a/validmind/utils.py b/validmind/utils.py
index a3d2444e4..5d8306a05 100644
--- a/validmind/utils.py
+++ b/validmind/utils.py
@@ -20,6 +20,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+from bs4 import BeautifulSoup
 from IPython.core import getipython
 from IPython.display import HTML
 from IPython.display import display as ipy_display
@@ -576,6 +577,64 @@ def md_to_html(md: str, mathml=False) -> str:
     return html
 
 
+def is_html(text: str) -> bool:
+    """Check whether a string looks like HTML.
+
+    The check is heuristic. After trimming whitespace the text must start with
+    "<" and end with ">", and must then contain a well-known HTML tag, contain
+    at least two tag-like tokens, or yield at least one tag when parsed by
+    BeautifulSoup.
+
+    Args:
+        text (str): The string to check
+
+    Returns:
+        bool: True if the string likely contains HTML, False otherwise
+    """
+    # Strip whitespace first
+    text = text.strip()
+
+    # Basic check: Must at least start with < and end with >
+    if not (text.startswith("<") and text.endswith(">")):
+        return False
+
+    # Look for common HTML tags
+    common_html_patterns = [
+        r"<html.*?>",  # HTML tag
+        r"<body.*?>",  # Body tag
+        r"<div.*?>",  # Div tag
+        r"<p>.*?</p>",  # Paragraph with content
+        r"<h[1-6]>.*?</h[1-6]>",  # Headers
+        r"<script.*?>",  # Script tags
+        r"<style.*?>",  # Style tags
+        r"<a href=.*?>",  # Links
+        r"<img.*?>",  # Images
+        r"<table.*?>",  # Tables
+        r"<!DOCTYPE html>",  # DOCTYPE declaration
+    ]
+
+    if any(
+        re.search(pattern, text, re.IGNORECASE | re.DOTALL)
+        for pattern in common_html_patterns
+    ):
+        return True
+
+    # If we have at least 2 matching tags, it's likely HTML
+    # This helps detect custom elements or patterns not in our list
+    tags = re.findall(r"</?[a-zA-Z][a-zA-Z0-9]*.*?>", text)
+    if len(tags) >= 2:
+        return True
+
+    # Last resort: let BeautifulSoup parse the text and accept it as HTML if the
+    # parser finds at least one tag. Parser failures are logged and treated as
+    # "not HTML" so that a parser error does not propagate to the caller.
+    try:
+        return len(BeautifulSoup(text, "html.parser").find_all()) > 0
+    except Exception as e:
+        logger.error(f"Error checking if text is HTML: {e}")
+        return False
+
+
 def inspect_obj(obj):
     # Filtering only attributes
     print(len("Attributes:") * "-")