Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openkiln/commands/skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def _append_config_section(skill_name: str) -> None:
config_templates: dict[str, str] = {
"orbisearch": ('\n[skills.orbisearch]\napi_key = "" # get your free key at orbisearch.com\n'),
"smartlead": ('\n[skills.smartlead]\napi_key = ""\n'),
"cleanco": ('\n[skills.cleanco]\napi_key = "" # OpenAI API key\n'),
}

template = config_templates.get(skill_name)
Expand Down
117 changes: 117 additions & 0 deletions openkiln/skills/cleanco/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# Cleanco Skill

Clean company names for outreach using OpenAI.

## Provides

| Type | Name | Description |
|-----------|-----------------------|---------------------------------------|
| transform | cleanco.company_name | Clean company names for cold email |

## Required config
```bash
# Set via environment variable (recommended)
export OPENAI_API_KEY=your-key-here

# Or via config file (~/.openkiln/config.toml) — used only when OPENAI_API_KEY is not set
[skills.cleanco]
api_key = "your-key-here"
```

## What it cleans

- Legal suffixes: Inc, LLC, Ltd, GmbH, Corp, PLC, etc.
- Parenthetical descriptions: "Tiger Data (creators of TimescaleDB)" -> "Tiger Data"
- Pipe-separated taglines: "Rocketship | Digital Marketing Agency" -> "Rocketship"
- Colon-separated taglines: "eyreACT: AI Act Compliance Platform" -> "eyreACT"
- Preserves brand names: "1Password", "6sense" stay unchanged

Uses gpt-4o-mini for intelligent cleaning. Results are cached in
cleanco.db so each unique name is only cleaned once.

## CLI Commands

### clean

Clean company names in a CSV file.

```bash
# dry run — shows what would be cleaned
openkiln cleanco clean contacts.csv

# clean and write output (defaults to contacts-cleaned.csv)
openkiln cleanco clean contacts.csv --apply

# specify column and output path
openkiln cleanco clean contacts.csv --column company_name --output cleaned.csv --apply

# JSON output
openkiln cleanco clean contacts.csv --apply --json
```

**Flags:**
- `--column`, `-c` — Column name containing company names (default: `company_name`)
- `--output`, `-o` — Output file path (default: `<input>-cleaned.csv`)
- `--apply` — Actually clean and write output (default: dry run)
- `--json` — Output as JSON

### cache

Show cache statistics.

```bash
openkiln cleanco cache
openkiln cleanco cache --json
```

### show

Show cached name changes (where original differs from cleaned).

```bash
openkiln cleanco show
openkiln cleanco show --limit 50
openkiln cleanco show --json
```

**Flags:**
- `--limit`, `-n` — Number of entries to show (default: 20)
- `--json` — Output as JSON

## Example workflow usage

### Pre-import (CLI)
```bash
openkiln cleanco clean contacts.csv --apply
openkiln record import contacts-cleaned.csv --type contact --skill crm --apply
```

### Post-import (workflow)
```yaml
name: clean-validate-push
requires:
  - crm
  - cleanco
  - orbisearch
  - smartlead

source:
  skill: crm
  type: contacts
  filter:
    segment: clay-gtm-ops

transforms:
  - cleanco.company_name
  - orbisearch.validate

filter:
  status: safe

sinks:
  - skill: crm
    action: update
  - skill: smartlead
    action: push
    campaign_id: "3133669"
```
4 changes: 4 additions & 0 deletions openkiln/skills/cleanco/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# openkiln/skills/cleanco/
# Company name cleaning via OpenAI.

# Version string of the cleanco skill package.
__version__ = "0.1.0"
126 changes: 126 additions & 0 deletions openkiln/skills/cleanco/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""
Company name cleaning via OpenAI.

Uses gpt-4o-mini to clean company names for cold email outreach.
Batches names for efficiency.
"""

from __future__ import annotations

import json
import os

import httpx

from openkiln import config

# Root of the OpenAI REST API; endpoint paths such as "/chat/completions"
# are appended to it when a request is built.
BASE_URL = "https://api.openai.com/v1"
# Passed as the `timeout` argument of each httpx request (httpx interprets
# a bare number as seconds).
REQUEST_TIMEOUT = 30.0
# Chat model named in every request payload.
MODEL = "gpt-4o-mini"

# System message sent with every batch. It instructs the model to reply with
# a bare JSON array in input order; CleancoClient.clean_batch relies on that
# shape when it parses the reply and checks the element count.
SYSTEM_PROMPT = """\
You clean company names for use in cold email outreach.

Rules:
- Remove legal suffixes: Inc, Inc., LLC, Ltd, Ltd., Limited, GmbH, Corp, \
Corp., Corporation, PLC, AG, SA, SAS, BV, NV, Pty, Co., Company, Group
- Remove parenthetical descriptions: "Tiger Data (creators of TimescaleDB)" -> "Tiger Data"
- Remove pipe-separated taglines: "Rocketship | Digital Marketing Agency" -> "Rocketship"
- Remove colon-separated taglines: "eyreACT: AI Act Compliance Platform" -> "eyreACT"
- Keep the core brand name as it would appear in casual business conversation
- Preserve capitalisation and special characters that are part of the brand
- If the entire name IS the brand (e.g. "1Password", "6sense"), return it unchanged
- If removing a suffix leaves nothing meaningful, keep the original

Return a JSON array of cleaned names in the same order as the input.
No explanations, just the JSON array.\
"""


class CleancoError(Exception):
    """Raised for any cleanco failure.

    Attributes:
        status_code: HTTP status associated with the failure, or None when
            the error did not come from an HTTP response.
    """

    def __init__(self, message: str, status_code: int | None = None) -> None:
        # Let Exception store the message first, then attach the extra field.
        super().__init__(message)
        self.status_code = status_code


class CleancoClient:
    """Cleans company names via the OpenAI chat completions endpoint.

    The client holds only the API key; every `clean_batch` call issues a
    single HTTP request.
    """

    def __init__(self, api_key: str) -> None:
        self._api_key = api_key

    def clean_batch(self, names: list[str]) -> list[str]:
        """Clean a batch of company names.

        Args:
            names: Raw company names. An empty list returns immediately
                without making a request.

        Returns:
            The cleaned names, same length and order as `names`.

        Raises:
            CleancoError: If the API answers with a status >= 400, or if the
                reply cannot be interpreted as a JSON array of exactly
                `len(names)` strings.

        Transport-level failures raised by httpx (timeouts, connection
        errors) are not intercepted here and propagate unchanged.
        """
        if not names:
            return []

        user_msg = json.dumps(names)

        response = httpx.post(
            f"{BASE_URL}/chat/completions",
            headers={
                "Authorization": f"Bearer {self._api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": MODEL,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                "temperature": 0,
            },
            timeout=REQUEST_TIMEOUT,
        )

        if response.status_code >= 400:
            raise CleancoError(
                f"OpenAI API error: {response.text[:200]}",
                response.status_code,
            )

        # A 2xx reply can still be malformed (non-JSON body, no choices,
        # null content). Surface that as CleancoError rather than leaking a
        # raw KeyError/IndexError/AttributeError to the caller.
        try:
            content = response.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as err:
            raise CleancoError(
                f"Unexpected OpenAI response: {response.text[:200]}",
                response.status_code,
            ) from err
        if not isinstance(content, str):
            raise CleancoError(
                f"Unexpected OpenAI response: {response.text[:200]}",
                response.status_code,
            )

        return self._parse_names(content, len(names))

    @staticmethod
    def _parse_names(content: str, expected: int) -> list[str]:
        """Turn the model's reply text into a validated list of names.

        Args:
            content: Raw message text, optionally wrapped in a markdown
                code fence.
            expected: Number of names that must be present.

        Returns:
            The decoded list of cleaned names.

        Raises:
            CleancoError: If the text is not valid JSON, is not a list of
                `expected` items, or contains non-string items.
        """
        text = content.strip()

        if text.startswith("```"):
            # Multi-line fence: drop the whole opening line (it may carry a
            # language tag such as "json"). Single-line fence: there is no
            # newline to split on, so strip only the three backticks.
            _, newline, remainder = text.partition("\n")
            text = remainder if newline else text[3:]
            text = text.rsplit("```", 1)[0].strip()

        try:
            cleaned = json.loads(text)
        except json.JSONDecodeError as err:
            raise CleancoError(
                f"Could not parse OpenAI response: {text[:200]}"
            ) from err

        if not isinstance(cleaned, list) or len(cleaned) != expected:
            got = len(cleaned) if isinstance(cleaned, list) else "non-list"
            raise CleancoError(f"Expected {expected} names, got {got}")

        # The return type promises strings; reject nulls, numbers or nested
        # structures instead of handing them to callers.
        if not all(isinstance(item, str) for item in cleaned):
            raise CleancoError("Expected every cleaned name to be a string")

        return cleaned


def _resolve_api_key() -> str:
"""Resolve API key from environment or config."""
key = os.environ.get("OPENAI_API_KEY")
if key:
return key

cfg = config.get()
key = cfg.skill_config("cleanco").get("api_key", "")
if key:
return key

raise CleancoError(
"No OpenAI API key configured.\n"
"Set OPENAI_API_KEY or add it to ~/.openkiln/config.toml:\n"
" [skills.cleanco]\n"
' api_key = "your-key-here"'
)


def get_client() -> CleancoClient:
    """Build a CleancoClient from the key returned by `_resolve_api_key`."""
    api_key = _resolve_api_key()
    return CleancoClient(api_key)
Loading
Loading