From d128f8a7ea9099a36d122117e092a1a27b9c0840 Mon Sep 17 00:00:00 2001
From: amberjblue
Date: Thu, 18 Sep 2025 08:53:17 -0400
Subject: [PATCH 1/2] Add moderations server endpoint.

---
 .../library/gcp_moderate_text/actions.py     |   3 +
 .../library/openai_moderate_text/__init__.py |  14 ++
 .../library/openai_moderate_text/actions.py  |  84 ++++++++++++
 .../library/openai_moderate_text/flows.co    |  93 +++++++++++++
 nemoguardrails/server/api.py                 | 127 +++++++++++++++++-
 5 files changed, 318 insertions(+), 3 deletions(-)
 create mode 100644 nemoguardrails/library/openai_moderate_text/__init__.py
 create mode 100644 nemoguardrails/library/openai_moderate_text/actions.py
 create mode 100644 nemoguardrails/library/openai_moderate_text/flows.co

diff --git a/nemoguardrails/library/gcp_moderate_text/actions.py b/nemoguardrails/library/gcp_moderate_text/actions.py
index afb7004f0..eba21fa49 100644
--- a/nemoguardrails/library/gcp_moderate_text/actions.py
+++ b/nemoguardrails/library/gcp_moderate_text/actions.py
@@ -142,3 +142,6 @@ async def call_gcp_text_moderation_api(
             violations_dict[violation.name] = violation.confidence
 
     return {"max_risk_score": max_risk_score, "violations": violations_dict}
+
+
+call_gcp_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
diff --git a/nemoguardrails/library/openai_moderate_text/__init__.py b/nemoguardrails/library/openai_moderate_text/__init__.py
new file mode 100644
index 000000000..9ba9d4310
--- /dev/null
+++ b/nemoguardrails/library/openai_moderate_text/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemoguardrails/library/openai_moderate_text/actions.py b/nemoguardrails/library/openai_moderate_text/actions.py
new file mode 100644
index 000000000..126d2e57f
--- /dev/null
+++ b/nemoguardrails/library/openai_moderate_text/actions.py
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Optional
+
+try:
+    from openai import OpenAI
+except ImportError:
+    pass
+
+
+from nemoguardrails.actions import action
+
+log = logging.getLogger(__name__)
+
+
+def openai_text_moderation_mapping(result: dict) -> bool:
+    """Determines whether the output from the OpenAI moderation API should be blocked.
+    Returns True (i.e. block the output) if any of the conditions are met.
+    """
+    # Block the output if the moderation API flagged it.
+    flagged = result.get("flagged", False)
+
+    if flagged:
+        return True
+
+    confidence_threshold = 0.8
+    scores = result.get("category_scores", {})
+
+    if any(score > confidence_threshold for score in scores.values()):
+        return True
+
+    # If none of the thresholds are exceeded, allow the output.
+    return False
+
+
+@action(
+    name="openai_moderation_api",
+    is_system_action=True,
+    output_mapping=openai_text_moderation_mapping,
+)
+async def call_openai_text_moderation_api(
+    context: Optional[dict] = None, **kwargs
+) -> dict:
+    try:
+        from openai import OpenAI
+
+    except ImportError:
+        raise ImportError(
+            "Could not import openai, please install it with `pip install openai`."
+        )
+
+    user_message = context.get("user_message")
+    client = OpenAI()
+
+    response = client.moderations.create(
+        model="omni-moderation-latest",
+        input=user_message,
+    )
+
+    result = response.model_dump()["results"][0]
+    result["categories"] = dict(result.get("categories", {}))
+    result["category_scores"] = {
+        str(k): float(v) for k, v in result.get("category_scores", {}).items()
+    }
+    result["flagged"] = bool(result.get("flagged", False))
+
+    return result
+
+
+call_openai_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
diff --git a/nemoguardrails/library/openai_moderate_text/flows.co b/nemoguardrails/library/openai_moderate_text/flows.co
new file mode 100644
index 000000000..bce66e6a5
--- /dev/null
+++ b/nemoguardrails/library/openai_moderate_text/flows.co
@@ -0,0 +1,93 @@
+"""
+https://platform.openai.com/docs/guides/moderation
+
+Supported Categories:
+
+Category                 Description
+harassment               Content that expresses, incites, or promotes harassing language towards any target.
+harassment/threatening   Harassment content that also includes violence or serious harm towards any target.
+hate                     Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste. Hateful content aimed at non-protected groups (e.g., chess players) is harassment.
+hate/threatening         Hateful content that also includes violence or serious harm towards the targeted group based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste.
+illicit                  Content that gives advice or instruction on how to commit illicit acts. A phrase like "how to shoplift" would fit this category.
+illicit/violent          The same types of content flagged by the illicit category, but also includes references to violence or procuring a weapon.
+self-harm                Content that promotes, encourages, or depicts acts of self-harm, such as suicide, cutting, and eating disorders.
+self-harm/intent         Content where the speaker expresses that they are engaging or intend to engage in acts of self-harm, such as suicide, cutting, and eating disorders.
+self-harm/instructions   Content that encourages performing acts of self-harm, such as suicide, cutting, and eating disorders, or that gives instructions or advice on how to commit such acts.
+sexual                   Content meant to arouse sexual excitement, such as the description of sexual activity, or that promotes sexual services (excluding sex education and wellness).
+sexual/minors            Sexual content that includes an individual who is under 18 years old.
+violence                 Content that depicts death, violence, or physical injury.
+violence/graphic         Content that depicts death, violence, or physical injury in graphic detail.
+"""
+
+define subflow openai moderation
+    """Guardrail based on model classification of potentially harmful content"""
+    $result = execute openai_moderation_api
+
+    if $result.get("flagged", False)
+        bot refuse to respond
+        stop
+
+define subflow openai moderation detailed
+    """Guardrail based on individual risk scores."""
+    $result = execute openai_moderation_api
+
+    if $result.categories.get("sexual", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("sexual/minors", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("harassment", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("harassment/threatening", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("hate", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("hate/threatening", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("illicit", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("illicit/violent", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("self-harm", False)
+        bot inform cannot engage with sensitive content
+        stop
+
+    if $result.categories.get("self-harm/intent", False)
+        bot inform cannot engage with sensitive content
+        stop
+
+    if $result.categories.get("self-harm/instructions", False)
+        bot inform cannot engage with sensitive content
+        stop
+
+    if $result.categories.get("violence", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("violence/graphic", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+define bot inform cannot engage in abusive or harmful behavior
+    "I will not engage in any abusive or harmful behavior."
+
+define bot inform cannot engage in inappropriate content
+    "I will not engage with inappropriate content."
+
+define bot inform cannot engage with sensitive content
+    "I will not engage with sensitive content."
diff --git a/nemoguardrails/server/api.py b/nemoguardrails/server/api.py
index d07cb63df..285caef3e 100644
--- a/nemoguardrails/server/api.py
+++ b/nemoguardrails/server/api.py
@@ -20,11 +20,12 @@
 import os.path
 import re
 import time
+import uuid
 import warnings
 from contextlib import asynccontextmanager
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
 
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field, root_validator, validator
 from starlette.responses import StreamingResponse
@@ -46,7 +47,7 @@
 # backends and storage engines.
 registered_loggers = []
 
-api_description = """Guardrails Sever API."""
+api_description = """Guardrails Server API."""
 
 # The headers for each request
 api_request_headers = contextvars.ContextVar("headers")
@@ -576,6 +577,126 @@
 class GuardrailsConfigurationError(Exception):
     pass
 
+
+class ModerationsRequest(BaseModel):
+    input: Union[str, List[str]]
+    model: Optional[str] = None
+    config_id: Optional[str] = Field(default=os.getenv("DEFAULT_CONFIG_ID", None))
+    config_ids: Optional[List[str]] = None
+
+    @root_validator(pre=True)
+    def normalize_keys(cls, data):
+        if isinstance(data, dict) and "Model" in data and "model" not in data:
+            data["model"] = data.pop("Model")
+        return data
+
+    @validator("config_ids", pre=True, always=True)
+    def ensure_config_ids(cls, v, values):
+        if v is None and values.get("config_id"):
+            return [values["config_id"]]
+        return v
+
+
+class ModerationsResponse(BaseModel):
+    id: str
+    model: str
+    results: List[dict]
+
+
+def _moderation_actions(dispatcher) -> List[str]:
+    out = []
+    for name, fn in dispatcher.registered_actions.items():
+        meta = getattr(fn, "action_meta", {}) or {}
+        tags = set(meta.get("tags", []))
+        if "moderation" in tags:
+            out.append(name)
+    return sorted(out)
+
+
+def _resolve_moderation_action(dispatcher, model_or_action: Optional[str]) -> str:
+    mods = _moderation_actions(dispatcher)
+    if not mods:
+        raise HTTPException(
+            status_code=500, detail="No moderation actions are registered."
+        )
+
+    if not model_or_action:
+        return mods[0]
+
+    wanted = model_or_action.strip()
+    if dispatcher.has_registered(wanted) and wanted in mods:
+        return wanted
+
+    def norm(s: str) -> str:
+        return s.lower().replace(" ", "")
+
+    for m in mods:
+        if norm(m) == norm(wanted):
+            return m
+
+    raise HTTPException(
+        status_code=400,
+        detail=f"Unknown moderation action '{model_or_action}'. Available: {', '.join(mods)}",
+    )
+
+
+@app.post("/v1/moderations", response_model=ModerationsResponse)
+async def create_moderations(body: ModerationsRequest, request: Request):
+    for logger in registered_loggers:
+        asyncio.get_event_loop().create_task(
+            logger({"endpoint": "/v1/moderations", "body": body.json()})
+        )
+    api_request_headers.set(request.headers)
+
+    config_ids = body.config_ids or ([body.config_id] if body.config_id else None)
+    if not config_ids:
+        if app.default_config_id:
+            config_ids = [app.default_config_id]
+        elif app.single_config_mode and app.single_config_id:
+            config_ids = [app.single_config_id]
+        else:
+            raise GuardrailsConfigurationError(
+                "No 'config_id' provided and no default configuration is set for the server."
+            )
+
+    try:
+        llm_rails = _get_rails(config_ids)
+    except Exception:
+        raise HTTPException(
+            status_code=500, detail="Could not load guardrails configuration."
+        )
+
+    dispatcher = llm_rails.runtime.action_dispatcher
+    action_name = _resolve_moderation_action(dispatcher, body.model)
+
+    texts = body.input if isinstance(body.input, list) else [body.input]
+    results = []
+
+    for text in texts:
+        params = {"context": {"user_message": text}}
+        try:
+            result = await dispatcher.execute_action(action_name, params)
+        except Exception:
+            raise HTTPException(
+                status_code=502, detail=f"Moderation action '{action_name}' failed."
+ ) + + value, status = ( + result + if isinstance(result, tuple) and len(result) == 2 + else (result, "success") + ) + payload = getattr(value, "return_value", value) or {} + if not isinstance(payload, dict): + payload = {"value": payload} + results.append(payload) + + return ModerationsResponse( + id=f"modr-{uuid.uuid4()}", + model=action_name, + results=results, + ) + + # # Register a nicer error message for 422 error # def register_exception(app: FastAPI): # @app.exception_handler(RequestValidationError) From d300a7c8e17c35b41eb93993ca4bddcc545b625e Mon Sep 17 00:00:00 2001 From: amberjblue Date: Thu, 2 Oct 2025 15:26:34 -0400 Subject: [PATCH 2/2] Update docs and add community user guide --- .../community/openai-moderations-api.md | 157 ++++++++++++++++++ docs/user-guides/guardrails-library.md | 17 +- 2 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 docs/user-guides/community/openai-moderations-api.md diff --git a/docs/user-guides/community/openai-moderations-api.md b/docs/user-guides/community/openai-moderations-api.md new file mode 100644 index 000000000..abf694a70 --- /dev/null +++ b/docs/user-guides/community/openai-moderations-api.md @@ -0,0 +1,157 @@ +# OpenAI Moderations API + +NeMo Guardrails supports using the [OpenAI Moderations API](https://platform.openai.com/docs/guides/moderation) as an input or output rail out-of-the-box. You need to have the `OPENAI_API_KEY` environment variable set or configure it in your OpenAI client. + +## Basic Usage + +```yaml +rails: + input: + flows: + # The simplified version using the flagged response + - openai moderation + + # The detailed version with individual category checks + # - openai moderation detailed +``` + +The `openai moderation` flow uses OpenAI's built-in flagging system to decide if the input should be allowed or not. The `openai moderation detailed` flow checks individual violation categories with custom logic. + +## Using the Moderation API Endpoint + +You can also use the moderation endpoint directly: + +```yaml +rails: + config: + moderation: + providers: + - id: openai-moderation + provider: openai + model: omni-moderation-latest + action: openai_moderation_api + default: true +``` + +Then call the endpoint: +```bash +curl -X POST http://localhost:8000/v1/moderations \ + -H "Content-Type: application/json" \ + -d '{ + "input": "...text to classify goes here..." 
+ }' +``` + +## Supported Categories + +OpenAI's moderation API detects the following categories: + +| Category | Description | +|----------|-------------| +| `harassment` | Content that expresses, incites, or promotes harassing language towards any target | +| `harassment/threatening` | Harassment content that also includes violence or serious harm towards any target | +| `hate` | Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste | +| `hate/threatening` | Hateful content that also includes violence or serious harm towards the targeted group | +| `illicit` | Content that gives advice or instruction on how to commit illicit acts | +| `illicit/violent` | Illicit content that also includes references to violence or procuring a weapon | +| `self-harm` | Content that promotes, encourages, or depicts acts of self-harm | +| `self-harm/intent` | Content where the speaker expresses intent to engage in acts of self-harm | +| `self-harm/instructions` | Content that encourages or gives instructions on acts of self-harm | +| `sexual` | Content meant to arouse sexual excitement or that promotes sexual services | +| `sexual/minors` | Sexual content that includes an individual who is under 18 years old | +| `violence` | Content that depicts death, violence, or physical injury | +| `violence/graphic` | Content that depicts death, violence, or physical injury in graphic detail | + +## Customizing Thresholds + +To customize the behavior, you can overwrite the [default flows](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/library/openai_moderate_text/flows.co) in your config. For example, to create a custom moderation flow: + +```colang +define subflow custom openai moderation + """Custom guardrail with specific threshold logic.""" + $result = execute openai_moderation_api + + # Block if OpenAI flags it as harmful + if $result.get("flagged", False) + bot refuse to respond + stop + + # Custom threshold checks on category scores + if $result.category_scores.get("violence", 0) > 0.5 + bot inform cannot engage in violent content + stop +``` + +## Detailed Category Handling + +Using OpenAI Text Moderation, you can control various violation categories individually. The API returns both boolean flags for each category and confidence scores. Here's an example of a detailed input moderation flow: + +```colang +define flow openai input moderation detailed + $result = execute openai_moderation_api(text=$user_message) + + if $result.categories.get("harassment", False) + bot inform cannot engage in abusive or harmful behavior + stop + + if $result.categories.get("hate", False) + bot inform cannot engage in abusive or harmful behavior + stop + + if $result.categories.get("sexual", False) + bot inform cannot engage in inappropriate content + stop + + if $result.categories.get("violence", False) + bot inform cannot engage in abusive or harmful behavior + stop + +define bot inform cannot engage in abusive or harmful behavior + "I will not engage in any abusive or harmful behavior." + +define bot inform cannot engage in inappropriate content + "I will not engage with inappropriate content." 
+```
+
+## Using with Category Scores
+
+You can also use the confidence scores for more nuanced control:
+
+```colang
+define subflow openai score based moderation
+    """Moderation based on confidence scores rather than binary flags."""
+    $result = execute openai_moderation_api
+
+    # Custom thresholds for different categories
+    if $result.category_scores.get("harassment", 0) > 0.7
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.category_scores.get("hate", 0) > 0.6
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.category_scores.get("violence", 0) > 0.8
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+```
+
+## Environment Setup
+
+Make sure you have your OpenAI API key configured:
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+```
+
+Or you can configure it in your application code when initializing the OpenAI client.
+
+## Installation
+
+The OpenAI moderation integration requires the `openai` package:
+
+```bash
+pip install openai
+```
+
+This is typically included when you install NeMo Guardrails with OpenAI support.
diff --git a/docs/user-guides/guardrails-library.md b/docs/user-guides/guardrails-library.md
index 0215b20d4..453f56c91 100644
--- a/docs/user-guides/guardrails-library.md
+++ b/docs/user-guides/guardrails-library.md
@@ -28,7 +28,7 @@ NeMo Guardrails comes with a library of built-in guardrails that you can easily
   - [Prompt Security Protection](#prompt-security-protection)
   - [Pangea AI Guard](#pangea-ai-guard)
   - [Trend Micro Vision One AI Application Security](#trend-micro-vision-one-ai-application-security)
-  - OpenAI Moderation API - *[COMING SOON]*
+  - [OpenAI Moderations API](#openai-moderations-api)
 
 4. Other
   - [Jailbreak Detection](#jailbreak-detection)
@@ -937,6 +937,21 @@ rails:
 
 For more details, check out the [Trend Micro Vision One AI Application Security](./community/trend-micro.md) page.
 
+### OpenAI Moderations API
+
+NeMo Guardrails supports using the [OpenAI Moderation API](https://platform.openai.com/docs/guides/moderation) for input and output content moderation. You need to have the `OPENAI_API_KEY` environment variable set.
+
+#### Example usage
+
+```yaml
+rails:
+  input:
+    flows:
+      - openai moderation
+```
+
+For more details, check out the [OpenAI Moderations API Integration](./community/openai-moderations-api.md) page.
+
 ## Other
 
 ### Jailbreak Detection