3 changes: 3 additions & 0 deletions nemoguardrails/library/gcp_moderate_text/actions.py
@@ -142,3 +142,6 @@ async def call_gcp_text_moderation_api(
        violations_dict[violation.name] = violation.confidence

    return {"max_risk_score": max_risk_score, "violations": violations_dict}


call_gcp_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
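
For reference, the `tags` entry added here is how the server-side changes below discover moderation-capable actions. A minimal sketch of the idiom on a hypothetical action function, assuming the `@action` decorator attaches an `action_meta` dict as it does for registered actions:

# Hypothetical action function, used only to illustrate the tagging idiom.
def my_action():
    ...

# The @action decorator normally attaches this metadata dict.
my_action.action_meta = {"name": "my_action"}

# setdefault returns the existing "tags" list (or installs an empty one),
# so the append mutates the metadata in place without clobbering other tags.
my_action.action_meta.setdefault("tags", []).append("moderation")
assert my_action.action_meta["tags"] == ["moderation"]
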
14 changes: 14 additions & 0 deletions nemoguardrails/library/openai_moderate_text/__init__.py
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
84 changes: 84 additions & 0 deletions nemoguardrails/library/openai_moderate_text/actions.py
@@ -0,0 +1,84 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Optional

try:
    from openai import OpenAI
except ImportError:
    # The openai package is optional; a descriptive error is raised at call time.
    pass


from nemoguardrails.actions import action

log = logging.getLogger(__name__)


def openai_text_moderation_mapping(result: dict) -> bool:
    """Determines whether the output from the OpenAI moderation API should be blocked.

    Returns True (i.e. block the output) if any of the conditions are met.
    """
    # The action returns a single flattened result, so read the fields directly.
    if result.get("flagged", False):
        return True

    # Block if any individual category score exceeds the confidence threshold.
    confidence_threshold = 0.8
    scores = result.get("category_scores", {})

    if any(score > confidence_threshold for score in scores.values()):
        return True

    # If none of the thresholds are exceeded, allow the output.
    return False


@action(
    name="openai_moderation_api",
    is_system_action=True,
    output_mapping=openai_text_moderation_mapping,
)
async def call_openai_text_moderation_api(
    context: Optional[dict] = None, **kwargs
) -> dict:
    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError(
            "Could not import openai, please install it with `pip install openai`."
        )

    user_message = (context or {}).get("user_message")
    client = OpenAI()

    response = client.moderations.create(
        model="omni-moderation-latest",
        input=user_message,
    )

    # Flatten the first result and coerce the values to plain Python types.
    result = response.model_dump()["results"][0]
    result["categories"] = dict(result.get("categories", {}))
    result["category_scores"] = {
        str(k): float(v) for k, v in result.get("category_scores", {}).items()
    }
    result["flagged"] = bool(result.get("flagged", False))

    return result


call_openai_text_moderation_api.action_meta.setdefault("tags", []).append("moderation")
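
A quick sanity check of the mapping logic above, using a hand-built result in the same flattened shape the action returns (the scores are made up for illustration):

sample = {
    "flagged": False,
    "categories": {"violence": False, "hate": False},
    "category_scores": {"violence": 0.91, "hate": 0.02},
}

# Not flagged outright, but "violence" exceeds the 0.8 threshold,
# so the mapping blocks the output.
assert openai_text_moderation_mapping(sample) is True
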
93 changes: 93 additions & 0 deletions nemoguardrails/library/openai_moderate_text/flows.co
@@ -0,0 +1,93 @@
"""
https://platform.openai.com/docs/guides/moderation

Supported categories:

- harassment: Content that expresses, incites, or promotes harassing language towards any target.
- harassment/threatening: Harassment content that also includes violence or serious harm towards any target.
- hate: Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste. Hateful content aimed at non-protected groups (e.g., chess players) is harassment.
- hate/threatening: Hateful content that also includes violence or serious harm towards the targeted group based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste.
- illicit: Content that gives advice or instruction on how to commit illicit acts. A phrase like "how to shoplift" would fit this category.
- illicit/violent: The same types of content flagged by the illicit category, but also includes references to violence or procuring a weapon.
- self-harm: Content that promotes, encourages, or depicts acts of self-harm, such as suicide, cutting, and eating disorders.
- self-harm/intent: Content where the speaker expresses that they are engaging or intend to engage in acts of self-harm, such as suicide, cutting, and eating disorders.
- self-harm/instructions: Content that encourages performing acts of self-harm, such as suicide, cutting, and eating disorders, or that gives instructions or advice on how to commit such acts.
- sexual: Content meant to arouse sexual excitement, such as the description of sexual activity, or that promotes sexual services (excluding sex education and wellness).
- sexual/minors: Sexual content that includes an individual who is under 18 years old.
- violence: Content that depicts death, violence, or physical injury.
- violence/graphic: Content that depicts death, violence, or physical injury in graphic detail.
"""

define subflow openai moderation
  """Guardrail based on model classification of potentially harmful content."""
  $result = execute openai_moderation_api

  if $result.get("flagged", False)
    bot refuse to respond
    stop

define subflow openai moderation detailed
  """Guardrail based on individual risk scores."""
  $result = execute openai_moderation_api

  if $result.categories.get("sexual", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("sexual/minors", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("harassment", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("harassment/threatening", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("hate", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("hate/threatening", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("illicit", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("illicit/violent", False)
    bot inform cannot engage in inappropriate content
    stop

  if $result.categories.get("self-harm", False)
    bot inform cannot engage with sensitive content
    stop

  if $result.categories.get("self-harm/intent", False)
    bot inform cannot engage with sensitive content
    stop

  if $result.categories.get("self-harm/instructions", False)
    bot inform cannot engage with sensitive content
    stop

  if $result.categories.get("violence", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

  if $result.categories.get("violence/graphic", False)
    bot inform cannot engage in abusive or harmful behavior
    stop

define bot inform cannot engage in abusive or harmful behavior
  "I will not engage in any abusive or harmful behavior."

define bot inform cannot engage in inappropriate content
  "I will not engage with inappropriate content."

define bot inform cannot engage with sensitive content
  "I will not engage with sensitive content."
127 changes: 124 additions & 3 deletions nemoguardrails/server/api.py
@@ -20,11 +20,12 @@
import os.path
import re
import time
import uuid
import warnings
from contextlib import asynccontextmanager
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union

-from fastapi import FastAPI, Request
+from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, root_validator, validator
from starlette.responses import StreamingResponse
@@ -46,7 +47,7 @@
# backends and storage engines.
registered_loggers = []

api_description = """Guardrails Sever API."""
api_description = """Guardrails Server API."""

# The headers for each request
api_request_headers = contextvars.ContextVar("headers")
@@ -576,6 +577,126 @@ class GuardrailsConfigurationError(Exception):
    pass


class ModerationsRequest(BaseModel):
    input: Union[str, List[str]]
    model: Optional[str] = None
    config_id: Optional[str] = Field(default=os.getenv("DEFAULT_CONFIG_ID", None))
    config_ids: Optional[List[str]] = None

    @root_validator(pre=True)
    def normalize_keys(cls, data):
        # Accept a capitalized "Model" key as an alias for "model".
        if isinstance(data, dict) and "Model" in data and "model" not in data:
            data["model"] = data.pop("Model")
        return data

    @validator("config_ids", pre=True, always=True)
    def ensure_config_ids(cls, v, values):
        # Fall back to the single config_id when config_ids is not provided.
        if v is None and values.get("config_id"):
            return [values["config_id"]]
        return v


class ModerationsResponse(BaseModel):
    id: str
    model: str
    results: List[dict]


def _moderation_actions(dispatcher) -> List[str]:
    """Return the names of all registered actions tagged as moderation actions."""
    out = []
    for name, fn in dispatcher.registered_actions.items():
        meta = getattr(fn, "action_meta", {}) or {}
        tags = set(meta.get("tags", []))
        if "moderation" in tags:
            out.append(name)
    return sorted(out)


def _resolve_moderation_action(dispatcher, model_or_action: Optional[str]) -> str:
    """Map the requested model name to a registered moderation action."""
    mods = _moderation_actions(dispatcher)
    if not mods:
        raise HTTPException(
            status_code=500, detail="No moderation actions are registered."
        )

    if not model_or_action:
        return mods[0]

    wanted = model_or_action.strip()
    if dispatcher.has_registered(wanted) and wanted in mods:
        return wanted

    def norm(s: str) -> str:
        return s.lower().replace(" ", "")

    # Allow case- and whitespace-insensitive matches.
    for m in mods:
        if norm(m) == norm(wanted):
            return m

    raise HTTPException(
        status_code=400,
        detail=f"Unknown moderation action '{model_or_action}'. Available: {', '.join(mods)}",
    )
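
Worth noting what the fallback matcher above does and does not tolerate; a small self-contained check (the helper is copied here only for the demonstration):

def norm(s: str) -> str:
    return s.lower().replace(" ", "")

# Case differences and stray spaces are tolerated...
assert norm("Openai_Moderation_Api ") == "openai_moderation_api"
# ...but underscores are not stripped, so a spaced-out name such as
# "OpenAI Moderation API" does not match "openai_moderation_api".
assert norm("OpenAI Moderation API") != norm("openai_moderation_api")
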


@app.post("/v1/moderations", response_model=ModerationsResponse)
async def create_moderations(body: ModerationsRequest, request: Request):
    for logger in registered_loggers:
        asyncio.get_event_loop().create_task(
            logger({"endpoint": "/v1/moderations", "body": body.json()})
        )
    api_request_headers.set(request.headers)

    config_ids = body.config_ids or ([body.config_id] if body.config_id else None)
    if not config_ids:
        if app.default_config_id:
            config_ids = [app.default_config_id]
        elif app.single_config_mode and app.single_config_id:
            config_ids = [app.single_config_id]
        else:
            raise GuardrailsConfigurationError(
                "No 'config_id' provided and no default configuration is set for the server."
            )

    try:
        llm_rails = _get_rails(config_ids)
    except Exception:
        raise HTTPException(
            status_code=500, detail="Could not load guardrails configuration."
        )

    dispatcher = llm_rails.runtime.action_dispatcher
    action_name = _resolve_moderation_action(dispatcher, body.model)

    texts = body.input if isinstance(body.input, list) else [body.input]
    results = []

    for text in texts:
        params = {"context": {"user_message": text}}
        try:
            result = await dispatcher.execute_action(action_name, params)
        except Exception:
            raise HTTPException(
                status_code=502, detail=f"Moderation action '{action_name}' failed."
            )

        # execute_action may return either the value or a (value, status) tuple.
        value, status = (
            result
            if isinstance(result, tuple) and len(result) == 2
            else (result, "success")
        )
        payload = getattr(value, "return_value", value) or {}
        if not isinstance(payload, dict):
            payload = {"value": payload}
        results.append(payload)

    return ModerationsResponse(
        id=f"modr-{uuid.uuid4()}",
        model=action_name,
        results=results,
    )


# # Register a nicer error message for 422 error
# def register_exception(app: FastAPI):
# @app.exception_handler(RequestValidationError)
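
To exercise the new endpoint end to end, the request body mirrors the OpenAI moderations payload. A sketch using httpx; the host, port, and config id are assumptions:

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/moderations",
    json={
        "input": ["how to shoplift", "hello there"],
        "model": "openai_moderation_api",
        "config_id": "my_config",  # assumed config id
    },
)
resp.raise_for_status()
data = resp.json()
print(data["id"], data["model"])
for item in data["results"]:
    print(item.get("flagged"), item.get("category_scores", {}))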