From d128f8a7ea9099a36d122117e092a1a27b9c0840 Mon Sep 17 00:00:00 2001
From: amberjblue
Date: Thu, 18 Sep 2025 08:53:17 -0400
Subject: [PATCH 1/2] Add moderations server endpoint.

---
 .../library/gcp_moderate_text/actions.py     |   3 +
 .../library/openai_moderate_text/__init__.py |  14 ++
 .../library/openai_moderate_text/actions.py  |  84 ++++++++++++
 .../library/openai_moderate_text/flows.co    |  93 +++++++++++++
 nemoguardrails/server/api.py                 | 127 +++++++++++++++++-
 5 files changed, 318 insertions(+), 3 deletions(-)
 create mode 100644 nemoguardrails/library/openai_moderate_text/__init__.py
 create mode 100644 nemoguardrails/library/openai_moderate_text/actions.py
 create mode 100644 nemoguardrails/library/openai_moderate_text/flows.co

diff --git a/nemoguardrails/library/gcp_moderate_text/actions.py b/nemoguardrails/library/gcp_moderate_text/actions.py
index afb7004f0..eba21fa49 100644
--- a/nemoguardrails/library/gcp_moderate_text/actions.py
+++ b/nemoguardrails/library/gcp_moderate_text/actions.py
@@ -142,3 +142,6 @@ async def call_gcp_text_moderation_api(
             violations_dict[violation.name] = violation.confidence
 
     return {"max_risk_score": max_risk_score, "violations": violations_dict}
+
+
+call_gcp_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
diff --git a/nemoguardrails/library/openai_moderate_text/__init__.py b/nemoguardrails/library/openai_moderate_text/__init__.py
new file mode 100644
index 000000000..9ba9d4310
--- /dev/null
+++ b/nemoguardrails/library/openai_moderate_text/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemoguardrails/library/openai_moderate_text/actions.py b/nemoguardrails/library/openai_moderate_text/actions.py
new file mode 100644
index 000000000..126d2e57f
--- /dev/null
+++ b/nemoguardrails/library/openai_moderate_text/actions.py
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Optional
+
+try:
+    from openai import OpenAI
+except ImportError:
+    pass
+
+
+from nemoguardrails.actions import action
+
+log = logging.getLogger(__name__)
+
+
+def openai_text_moderation_mapping(result: dict) -> bool:
+    """Determines whether the output from the OpenAI moderation API should be blocked.
+    Returns True (i.e. block the output) if any of the conditions are met.
+    """
+    # Block the output if the moderation API flagged it.
+    flagged = result.get("flagged", False)
+
+    if flagged:
+        return True
+
+    confidence_threshold = 0.8
+    scores = result.get("category_scores", {})
+
+    if any(score > confidence_threshold for score in scores.values()):
+        return True
+
+    # If none of the thresholds are exceeded, allow the output.
+    return False
+
+
+@action(
+    name="openai_moderation_api",
+    is_system_action=True,
+    output_mapping=openai_text_moderation_mapping,
+)
+async def call_openai_text_moderation_api(
+    context: Optional[dict] = None, **kwargs
+) -> dict:
+    try:
+        from openai import OpenAI
+
+    except ImportError:
+        raise ImportError(
+            "Could not import openai, please install it with `pip install openai`."
+        )
+
+    user_message = context.get("user_message")
+    client = OpenAI()
+
+    response = client.moderations.create(
+        model="omni-moderation-latest",
+        input=user_message,
+    )
+
+    result = response.model_dump()["results"][0]
+    result["categories"] = dict(result.get("categories", {}))
+    result["category_scores"] = {
+        str(k): float(v) for k, v in result.get("category_scores", {}).items()
+    }
+    result["flagged"] = bool(result.get("flagged", False))
+
+    return result
+
+
+call_openai_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
diff --git a/nemoguardrails/library/openai_moderate_text/flows.co b/nemoguardrails/library/openai_moderate_text/flows.co
new file mode 100644
index 000000000..bce66e6a5
--- /dev/null
+++ b/nemoguardrails/library/openai_moderate_text/flows.co
@@ -0,0 +1,93 @@
+"""
+https://platform.openai.com/docs/guides/moderation
+
+Supported Categories:
+
+Category                 Description
+harassment               Content that expresses, incites, or promotes harassing language towards any target.
+harassment/threatening   Harassment content that also includes violence or serious harm towards any target.
+hate                     Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste. Hateful content aimed at non-protected groups (e.g., chess players) is harassment.
+hate/threatening         Hateful content that also includes violence or serious harm towards the targeted group based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste.
+illicit                  Content that gives advice or instruction on how to commit illicit acts. A phrase like "how to shoplift" would fit this category.
+illicit/violent          The same types of content flagged by the illicit category, but also includes references to violence or procuring a weapon.
+self-harm                Content that promotes, encourages, or depicts acts of self-harm, such as suicide, cutting, and eating disorders.
+self-harm/intent         Content where the speaker expresses that they are engaging or intend to engage in acts of self-harm, such as suicide, cutting, and eating disorders.
+self-harm/instructions   Content that encourages performing acts of self-harm, such as suicide, cutting, and eating disorders, or that gives instructions or advice on how to commit such acts.
+sexual                   Content meant to arouse sexual excitement, such as the description of sexual activity, or that promotes sexual services (excluding sex education and wellness).
+sexual/minors            Sexual content that includes an individual who is under 18 years old.
+violence                 Content that depicts death, violence, or physical injury.
+violence/graphic         Content that depicts death, violence, or physical injury in graphic detail.
+"""
+
+define subflow openai moderation
+    """Guardrail based on model classification of potentially harmful content"""
+    $result = execute openai_moderation_api
+
+    if $result.get("flagged", False)
+        bot refuse to respond
+        stop
+
+define subflow openai moderation detailed
+    """Guardrail based on individual risk scores."""
+    $result = execute openai_moderation_api
+
+    if $result.categories.get("sexual", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("sexual/minors", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("harassment", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("harassment/threatening", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("hate", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("hate/threatening", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("illicit", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("illicit/violent", False)
+        bot inform cannot engage in inappropriate content
+        stop
+
+    if $result.categories.get("self-harm", False)
+        bot inform cannot engage with sensitive content
+        stop
+
+    if $result.categories.get("self-harm/intent", False)
+        bot inform cannot engage with sensitive content
+        stop
+
+    if $result.categories.get("self-harm/instructions", False)
+        bot inform cannot engage with sensitive content
+        stop
+
+    if $result.categories.get("violence", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.categories.get("violence/graphic", False)
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+define bot inform cannot engage in abusive or harmful behavior
+    "I will not engage in any abusive or harmful behavior."
+
+define bot inform cannot engage in inappropriate content
+    "I will not engage with inappropriate content."
+
+define bot inform cannot engage with sensitive content
+    "I will not engage with sensitive content."
diff --git a/nemoguardrails/server/api.py b/nemoguardrails/server/api.py
index d07cb63df..285caef3e 100644
--- a/nemoguardrails/server/api.py
+++ b/nemoguardrails/server/api.py
@@ -20,11 +20,12 @@
 import os.path
 import re
 import time
+import uuid
 import warnings
 from contextlib import asynccontextmanager
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
 
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field, root_validator, validator
 from starlette.responses import StreamingResponse
@@ -46,7 +47,7 @@
 # backends and storage engines.
 registered_loggers = []
 
-api_description = """Guardrails Sever API."""
+api_description = """Guardrails Server API."""
 
 # The headers for each request
 api_request_headers = contextvars.ContextVar("headers")
@@ -576,6 +577,126 @@
 class GuardrailsConfigurationError(Exception):
     pass
 
+
+class ModerationsRequest(BaseModel):
+    input: Union[str, List[str]]
+    model: Optional[str] = None
+    config_id: Optional[str] = Field(default=os.getenv("DEFAULT_CONFIG_ID", None))
+    config_ids: Optional[List[str]] = None
+
+    @root_validator(pre=True)
+    def normalize_keys(cls, data):
+        if isinstance(data, dict) and "Model" in data and "model" not in data:
+            data["model"] = data.pop("Model")
+        return data
+
+    @validator("config_ids", pre=True, always=True)
+    def ensure_config_ids(cls, v, values):
+        if v is None and values.get("config_id"):
+            return [values["config_id"]]
+        return v
+
+
+class ModerationsResponse(BaseModel):
+    id: str
+    model: str
+    results: List[dict]
+
+
+def _moderation_actions(dispatcher) -> List[str]:
+    out = []
+    for name, fn in dispatcher.registered_actions.items():
+        meta = getattr(fn, "action_meta", {}) or {}
+        tags = set(meta.get("tags", []))
+        if "moderation" in tags:
+            out.append(name)
+    return sorted(out)
+
+
+def _resolve_moderation_action(dispatcher, model_or_action: Optional[str]) -> str:
+    mods = _moderation_actions(dispatcher)
+    if not mods:
+        raise HTTPException(
+            status_code=500, detail="No moderation actions are registered."
+        )
+
+    if not model_or_action:
+        return mods[0]
+
+    wanted = model_or_action.strip()
+    if dispatcher.has_registered(wanted) and wanted in mods:
+        return wanted
+
+    def norm(s: str) -> str:
+        return s.lower().replace(" ", "")
+
+    for m in mods:
+        if norm(m) == norm(wanted):
+            return m
+
+    raise HTTPException(
+        status_code=400,
+        detail=f"Unknown moderation action '{model_or_action}'. Available: {', '.join(mods)}",
+    )
+
+
+@app.post("/v1/moderations", response_model=ModerationsResponse)
+async def create_moderations(body: ModerationsRequest, request: Request):
+    for logger in registered_loggers:
+        asyncio.get_event_loop().create_task(
+            logger({"endpoint": "/v1/moderations", "body": body.json()})
+        )
+    api_request_headers.set(request.headers)
+
+    config_ids = body.config_ids or ([body.config_id] if body.config_id else None)
+    if not config_ids:
+        if app.default_config_id:
+            config_ids = [app.default_config_id]
+        elif app.single_config_mode and app.single_config_id:
+            config_ids = [app.single_config_id]
+        else:
+            raise GuardrailsConfigurationError(
+                "No 'config_id' provided and no default configuration is set for the server."
+            )
+
+    try:
+        llm_rails = _get_rails(config_ids)
+    except Exception:
+        raise HTTPException(
+            status_code=500, detail="Could not load guardrails configuration."
+        )
+
+    dispatcher = llm_rails.runtime.action_dispatcher
+    action_name = _resolve_moderation_action(dispatcher, body.model)
+
+    texts = body.input if isinstance(body.input, list) else [body.input]
+    results = []
+
+    for text in texts:
+        params = {"context": {"user_message": text}}
+        try:
+            result = await dispatcher.execute_action(action_name, params)
+        except Exception:
+            raise HTTPException(
+                status_code=502, detail=f"Moderation action '{action_name}' failed."
+ ) + + value, status = ( + result + if isinstance(result, tuple) and len(result) == 2 + else (result, "success") + ) + payload = getattr(value, "return_value", value) or {} + if not isinstance(payload, dict): + payload = {"value": payload} + results.append(payload) + + return ModerationsResponse( + id=f"modr-{uuid.uuid4()}", + model=action_name, + results=results, + ) + + # # Register a nicer error message for 422 error # def register_exception(app: FastAPI): # @app.exception_handler(RequestValidationError) From d300a7c8e17c35b41eb93993ca4bddcc545b625e Mon Sep 17 00:00:00 2001 From: amberjblue Date: Thu, 2 Oct 2025 15:26:34 -0400 Subject: [PATCH 2/2] Update docs and add community user guide --- .../community/openai-moderations-api.md | 157 ++++++++++++++++++ docs/user-guides/guardrails-library.md | 17 +- 2 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 docs/user-guides/community/openai-moderations-api.md diff --git a/docs/user-guides/community/openai-moderations-api.md b/docs/user-guides/community/openai-moderations-api.md new file mode 100644 index 000000000..abf694a70 --- /dev/null +++ b/docs/user-guides/community/openai-moderations-api.md @@ -0,0 +1,157 @@ +# OpenAI Moderations API + +NeMo Guardrails supports using the [OpenAI Moderations API](https://platform.openai.com/docs/guides/moderation) as an input or output rail out-of-the-box. You need to have the `OPENAI_API_KEY` environment variable set or configure it in your OpenAI client. + +## Basic Usage + +```yaml +rails: + input: + flows: + # The simplified version using the flagged response + - openai moderation + + # The detailed version with individual category checks + # - openai moderation detailed +``` + +The `openai moderation` flow uses OpenAI's built-in flagging system to decide if the input should be allowed or not. The `openai moderation detailed` flow checks individual violation categories with custom logic. + +## Using the Moderation API Endpoint + +You can also use the moderation endpoint directly: + +```yaml +rails: + config: + moderation: + providers: + - id: openai-moderation + provider: openai + model: omni-moderation-latest + action: openai_moderation_api + default: true +``` + +Then call the endpoint: +```bash +curl -X POST http://localhost:8000/v1/moderations \ + -H "Content-Type: application/json" \ + -d '{ + "input": "...text to classify goes here..." 
+ }' +``` + +## Supported Categories + +OpenAI's moderation API detects the following categories: + +| Category | Description | +|----------|-------------| +| `harassment` | Content that expresses, incites, or promotes harassing language towards any target | +| `harassment/threatening` | Harassment content that also includes violence or serious harm towards any target | +| `hate` | Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste | +| `hate/threatening` | Hateful content that also includes violence or serious harm towards the targeted group | +| `illicit` | Content that gives advice or instruction on how to commit illicit acts | +| `illicit/violent` | Illicit content that also includes references to violence or procuring a weapon | +| `self-harm` | Content that promotes, encourages, or depicts acts of self-harm | +| `self-harm/intent` | Content where the speaker expresses intent to engage in acts of self-harm | +| `self-harm/instructions` | Content that encourages or gives instructions on acts of self-harm | +| `sexual` | Content meant to arouse sexual excitement or that promotes sexual services | +| `sexual/minors` | Sexual content that includes an individual who is under 18 years old | +| `violence` | Content that depicts death, violence, or physical injury | +| `violence/graphic` | Content that depicts death, violence, or physical injury in graphic detail | + +## Customizing Thresholds + +To customize the behavior, you can overwrite the [default flows](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/library/openai_moderate_text/flows.co) in your config. For example, to create a custom moderation flow: + +```colang +define subflow custom openai moderation + """Custom guardrail with specific threshold logic.""" + $result = execute openai_moderation_api + + # Block if OpenAI flags it as harmful + if $result.get("flagged", False) + bot refuse to respond + stop + + # Custom threshold checks on category scores + if $result.category_scores.get("violence", 0) > 0.5 + bot inform cannot engage in violent content + stop +``` + +## Detailed Category Handling + +Using OpenAI Text Moderation, you can control various violation categories individually. The API returns both boolean flags for each category and confidence scores. Here's an example of a detailed input moderation flow: + +```colang +define flow openai input moderation detailed + $result = execute openai_moderation_api(text=$user_message) + + if $result.categories.get("harassment", False) + bot inform cannot engage in abusive or harmful behavior + stop + + if $result.categories.get("hate", False) + bot inform cannot engage in abusive or harmful behavior + stop + + if $result.categories.get("sexual", False) + bot inform cannot engage in inappropriate content + stop + + if $result.categories.get("violence", False) + bot inform cannot engage in abusive or harmful behavior + stop + +define bot inform cannot engage in abusive or harmful behavior + "I will not engage in any abusive or harmful behavior." + +define bot inform cannot engage in inappropriate content + "I will not engage with inappropriate content." 
+```
+
+## Using with Category Scores
+
+You can also use the confidence scores for more nuanced control:
+
+```colang
+define subflow openai score based moderation
+    """Moderation based on confidence scores rather than binary flags."""
+    $result = execute openai_moderation_api
+
+    # Custom thresholds for different categories
+    if $result.category_scores.get("harassment", 0) > 0.7
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.category_scores.get("hate", 0) > 0.6
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+
+    if $result.category_scores.get("violence", 0) > 0.8
+        bot inform cannot engage in abusive or harmful behavior
+        stop
+```
+
+## Environment Setup
+
+Make sure you have your OpenAI API key configured:
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+```
+
+Or you can configure it in your application code when initializing the OpenAI client.
+
+## Installation
+
+The OpenAI moderation integration requires the `openai` package:
+
+```bash
+pip install openai
+```
+
+This is typically included when you install NeMo Guardrails with OpenAI support.
diff --git a/docs/user-guides/guardrails-library.md b/docs/user-guides/guardrails-library.md
index 0215b20d4..453f56c91 100644
--- a/docs/user-guides/guardrails-library.md
+++ b/docs/user-guides/guardrails-library.md
@@ -28,7 +28,7 @@ NeMo Guardrails comes with a library of built-in guardrails that you can easily
   - [Prompt Security Protection](#prompt-security-protection)
   - [Pangea AI Guard](#pangea-ai-guard)
   - [Trend Micro Vision One AI Application Security](#trend-micro-vision-one-ai-application-security)
-  - OpenAI Moderation API - *[COMING SOON]*
+  - [OpenAI Moderations API](#openai-moderations-api)
 
 4. Other
   - [Jailbreak Detection](#jailbreak-detection)
@@ -937,6 +937,21 @@ rails:
 
 For more details, check out the [Trend Micro Vision One AI Application Security](./community/trend-micro.md) page.
 
+### OpenAI Moderations API
+
+NeMo Guardrails supports using the [OpenAI Moderation API](https://platform.openai.com/docs/guides/moderation) for input and output content moderation. You need to have the `OPENAI_API_KEY` environment variable set.
+
+#### Example usage
+
+```yaml
+rails:
+  input:
+    flows:
+      - openai moderation
+```
+
+For more details, check out the [OpenAI Moderations API Integration](./community/openai-moderations-api.md) page.
+
 ## Other
 
 ### Jailbreak Detection