3 changes: 3 additions & 0 deletions nemoguardrails/library/gcp_moderate_text/actions.py
@@ -142,3 +142,6 @@ async def call_gcp_text_moderation_api(
        violations_dict[violation.name] = violation.confidence

    return {"max_risk_score": max_risk_score, "violations": violations_dict}


call_gcp_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
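
For reference, the `tags` entry added here is how the server-side changes below discover moderation-capable actions. A minimal sketch of the idiom on a hypothetical action function, assuming the `@action` decorator attaches an `action_meta` dict as it does for registered actions:

# Hypothetical action function, used only to illustrate the tagging idiom.
def my_action():
    ...

# The @action decorator normally attaches this metadata dict.
my_action.action_meta = {"name": "my_action"}

# setdefault returns the existing "tags" list (or installs an empty one),
# so the append mutates the metadata in place without clobbering other tags.
my_action.action_meta.setdefault("tags", []).append("moderation")
assert my_action.action_meta["tags"] == ["moderation"]
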
14 changes: 14 additions & 0 deletions nemoguardrails/library/openai_moderate_text/__init__.py
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
84 changes: 84 additions & 0 deletions nemoguardrails/library/openai_moderate_text/actions.py
@@ -0,0 +1,84 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Optional

try:
    from openai import OpenAI
except ImportError:
    # The openai package is optional; a descriptive error is raised at call time.
    pass


from nemoguardrails.actions import action

log = logging.getLogger(__name__)


def openai_text_moderation_mapping(result: dict) -> bool:
    """Determines whether the output from the OpenAI moderation API should be blocked.

    Returns True (i.e. block the output) if any of the conditions are met.
    """
    # The action returns a single flattened result, so read the fields directly.
    if result.get("flagged", False):
        return True

    # Block if any individual category score exceeds the confidence threshold.
    confidence_threshold = 0.8
    scores = result.get("category_scores", {})

    if any(score > confidence_threshold for score in scores.values()):
        return True

    # If none of the thresholds are exceeded, allow the output.
    return False


@action(
    name="openai_moderation_api",
    is_system_action=True,
    output_mapping=openai_text_moderation_mapping,
)
async def call_openai_text_moderation_api(
    context: Optional[dict] = None, **kwargs
) -> dict:
    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError(
            "Could not import openai, please install it with `pip install openai`."
        )

    user_message = (context or {}).get("user_message")
    client = OpenAI()

    response = client.moderations.create(
        model="omni-moderation-latest",
        input=user_message,
    )

    # Flatten the first result and coerce the values to plain Python types.
    result = response.model_dump()["results"][0]
    result["categories"] = dict(result.get("categories", {}))
    result["category_scores"] = {
        str(k): float(v) for k, v in result.get("category_scores", {}).items()
    }
    result["flagged"] = bool(result.get("flagged", False))

    return result


call_openai_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
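
A quick sanity check of the mapping logic above, using a hand-built result in the same flattened shape the action returns (the scores are made up for illustration):

sample = {
    "flagged": False,
    "categories": {"violence": False, "hate": False},
    "category_scores": {"violence": 0.91, "hate": 0.02},
}

# Not flagged outright, but "violence" exceeds the 0.8 threshold,
# so the mapping blocks the output.
assert openai_text_moderation_mapping(sample) is True
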
93 changes: 93 additions & 0 deletions nemoguardrails/library/openai_moderate_text/flows.co
@@ -0,0 +1,93 @@
"""
https://platform.openai.com/docs/guides/moderation

Supported categories:

- harassment: Content that expresses, incites, or promotes harassing language towards any target.
- harassment/threatening: Harassment content that also includes violence or serious harm towards any target.
- hate: Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste. Hateful content aimed at non-protected groups (e.g., chess players) is harassment.
- hate/threatening: Hateful content that also includes violence or serious harm towards the targeted group based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste.
- illicit: Content that gives advice or instruction on how to commit illicit acts. A phrase like "how to shoplift" would fit this category.
- illicit/violent: The same types of content flagged by the illicit category, but also includes references to violence or procuring a weapon.
- self-harm: Content that promotes, encourages, or depicts acts of self-harm, such as suicide, cutting, and eating disorders.
- self-harm/intent: Content where the speaker expresses that they are engaging or intend to engage in acts of self-harm, such as suicide, cutting, and eating disorders.
- self-harm/instructions: Content that encourages performing acts of self-harm, such as suicide, cutting, and eating disorders, or that gives instructions or advice on how to commit such acts.
- sexual: Content meant to arouse sexual excitement, such as the description of sexual activity, or that promotes sexual services (excluding sex education and wellness).
- sexual/minors: Sexual content that includes an individual who is under 18 years old.
- violence: Content that depicts death, violence, or physical injury.
- violence/graphic: Content that depicts death, violence, or physical injury in graphic detail.
"""

define subflow openai moderation
  """Guardrail based on model classification of potentially harmful content."""
  $result = execute openai_moderation_api

  if $result.get("flagged", False)
    bot refuse to respond
    stop

define subflow openai moderation detailed
  """Guardrail based on individual risk scores."""
  $result = execute openai_moderation_api

  if $result.categories.get("sexual", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("sexual/minors", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("harassment", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("harassment/threatening", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("hate", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("hate/threatening", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("illicit", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("illicit/violent", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("self-harm", False)
    bot inform cannot engage with sensitive content
    stop

  if $result.categories.get("self-harm/intent", False)
    bot inform cannot engage with sensitive content
    stop

  if $result.categories.get("self-harm/instructions", False)
    bot inform cannot engage with sensitive content
    stop

  if $result.categories.get("violence", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("violence/graphic", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

define bot inform cannot engage in abusive or harmful behavior
  "I will not engage in any abusive or harmful behavior."

define bot inform cannot engage in inappropriate content
  "I will not engage with inappropriate content."

define bot inform cannot engage with sensitive content
  "I will not engage with sensitive content."
127 changes: 124 additions & 3 deletions nemoguardrails/server/api.py
@@ -20,11 +20,12 @@
import os.path
import re
import time
import uuid
import warnings
from contextlib import asynccontextmanager
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union

-from fastapi import FastAPI, Request
+from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, root_validator, validator
from starlette.responses import StreamingResponse
@@ -46,7 +47,7 @@
# backends and storage engines.
registered_loggers = []

api_description = """Guardrails Sever API."""
api_description = """Guardrails Server API."""

# The headers for each request
api_request_headers = contextvars.ContextVar("headers")
@@ -576,6 +577,126 @@ class GuardrailsConfigurationError(Exception):
    pass


class ModerationsRequest(BaseModel):
    input: Union[str, List[str]]
    model: Optional[str] = None
    config_id: Optional[str] = Field(default=os.getenv("DEFAULT_CONFIG_ID", None))
    config_ids: Optional[List[str]] = None

    @root_validator(pre=True)
    def normalize_keys(cls, data):
        # Accept a capitalized "Model" key as an alias for "model".
        if isinstance(data, dict) and "Model" in data and "model" not in data:
            data["model"] = data.pop("Model")
        return data

    @validator("config_ids", pre=True, always=True)
    def ensure_config_ids(cls, v, values):
        # Fall back to the single config_id when config_ids is not provided.
        if v is None and values.get("config_id"):
            return [values["config_id"]]
        return v


class ModerationsResponse(BaseModel):
    id: str
    model: str
    results: List[dict]


def _moderation_actions(dispatcher) -> List[str]:
    """Return the names of all registered actions tagged as moderation actions."""
    out = []
    for name, fn in dispatcher.registered_actions.items():
        meta = getattr(fn, "action_meta", {}) or {}
        tags = set(meta.get("tags", []))
        if "moderation" in tags:
            out.append(name)
    return sorted(out)


def _resolve_moderation_action(dispatcher, model_or_action: Optional[str]) -> str:
    """Map the requested model name to a registered moderation action."""
    mods = _moderation_actions(dispatcher)
    if not mods:
        raise HTTPException(
            status_code=500, detail="No moderation actions are registered."
        )

    if not model_or_action:
        return mods[0]

    wanted = model_or_action.strip()
    if dispatcher.has_registered(wanted) and wanted in mods:
        return wanted

    def norm(s: str) -> str:
        return s.lower().replace(" ", "")

    # Allow case- and whitespace-insensitive matches.
    for m in mods:
        if norm(m) == norm(wanted):
            return m

    raise HTTPException(
        status_code=400,
        detail=f"Unknown moderation action '{model_or_action}'. Available: {', '.join(mods)}",
    )
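
Worth noting what the fallback matcher above does and does not tolerate; a small self-contained check (the helper is copied here only for the demonstration):

def norm(s: str) -> str:
    return s.lower().replace(" ", "")

# Case differences and stray spaces are tolerated...
assert norm("Openai_Moderation_Api ") == "openai_moderation_api"
# ...but underscores are not stripped, so a spaced-out name such as
# "OpenAI Moderation API" does not match "openai_moderation_api".
assert norm("OpenAI Moderation API") != norm("openai_moderation_api")
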


@app.post("/v1/moderations", response_model=ModerationsResponse)
async def create_moderations(body: ModerationsRequest, request: Request):
    for logger in registered_loggers:
        asyncio.get_event_loop().create_task(
            logger({"endpoint": "/v1/moderations", "body": body.json()})
        )
    api_request_headers.set(request.headers)

    config_ids = body.config_ids or ([body.config_id] if body.config_id else None)
    if not config_ids:
        if app.default_config_id:
            config_ids = [app.default_config_id]
        elif app.single_config_mode and app.single_config_id:
            config_ids = [app.single_config_id]
        else:
            raise GuardrailsConfigurationError(
                "No 'config_id' provided and no default configuration is set for the server."
            )

    try:
        llm_rails = _get_rails(config_ids)
    except Exception:
        raise HTTPException(
            status_code=500, detail="Could not load guardrails configuration."
        )

    dispatcher = llm_rails.runtime.action_dispatcher
    action_name = _resolve_moderation_action(dispatcher, body.model)

    texts = body.input if isinstance(body.input, list) else [body.input]
    results = []

    for text in texts:
        params = {"context": {"user_message": text}}
        try:
            result = await dispatcher.execute_action(action_name, params)
        except Exception:
            raise HTTPException(
                status_code=502, detail=f"Moderation action '{action_name}' failed."
            )

        # execute_action may return either the value or a (value, status) tuple.
        value, status = (
            result
            if isinstance(result, tuple) and len(result) == 2
            else (result, "success")
        )
        payload = getattr(value, "return_value", value) or {}
        if not isinstance(payload, dict):
            payload = {"value": payload}
        results.append(payload)

    return ModerationsResponse(
        id=f"modr-{uuid.uuid4()}",
        model=action_name,
        results=results,
    )


# # Register a nicer error message for 422 error
# def register_exception(app: FastAPI):
# @app.exception_handler(RequestValidationError)
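
To exercise the new endpoint end to end, the request body mirrors the OpenAI moderations payload. A sketch using httpx; the host, port, and config id are assumptions:

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/moderations",
    json={
        "input": ["how to shoplift", "hello there"],
        "model": "openai_moderation_api",
        "config_id": "my_config",  # assumed config id
    },
)
resp.raise_for_status()
data = resp.json()
print(data["id"], data["model"])
for item in data["results"]:
    print(item.get("flagged"), item.get("category_scores", {}))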