# Source: secmask/eval_simple.py (AndrewAndrewsen/secmask, main branch)
#!/usr/bin/env python3
"""
Simple evaluation script for SecMask.

Tests the system with various secret types and reports accuracy.
"""
import time
from infer_moe import mask_text_moe

# Evaluation fixtures. Every entry is a 3-tuple laid out as
#   (input text, number of [SECRET] placeholders expected, short label)
TEST_CASES = [
    ("My AWS key is AKIAIOSFODNN7EXAMPLE", 1, "AWS Access Key"),
    (
        "GitHub token: ghp_1234567890abcdefghijklmnopqrstuvwxyz",
        1,
        "GitHub Personal Access Token",
    ),
    (
        "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U",
        1,
        "JWT Token",
    ),
    # Negative control: nothing in this text should be masked.
    ("No secrets here, just normal text", 0, "No secrets (negative case)"),
    (
        "Multiple secrets: AKIAIOSFODNN7EXAMPLE and ghp_abcdef1234567890",
        2,
        "Multiple secrets",
    ),
    # Multi-line PEM block; the whole block is expected to count as one secret.
    (
        "-----BEGIN RSA PRIVATE KEY-----\n"
        "MIIEpAIBAAKCAQEAx5N3nQw8qJ1KZ7h5L9X2tE5F3J8P1K5M9N7O8Q6R7S8T9U\n"
        "VWXYZaBcDeFgHiJkLmNoPqRsTuVwXyZ1A2B3C4D5E6F7G8H9I0J1K2L3M4N5O6\n"
        "-----END RSA PRIVATE KEY-----",
        1,
        "PEM Block",
    ),
    ("API key: sk-proj-1234567890abcdefghijklmnop", 1, "API Key"),
]


def count_secrets_in_masked(masked_text: str) -> int:
    """Return the number of ``[SECRET]`` placeholders found in *masked_text*."""
    placeholder = "[SECRET]"
    # Splitting on the placeholder yields one more piece than there are
    # (non-overlapping) occurrences of it.
    pieces = masked_text.split(placeholder)
    return len(pieces) - 1


def run_evaluation(use_long_model: bool = False):
    """Run every case in ``TEST_CASES`` through the masker and print a report.

    Each input is passed to ``mask_text_moe`` (fast model only, escalation
    disabled). The number of ``[SECRET]`` placeholders in the output is
    compared with the expected count, and a per-test and overall summary is
    printed to stdout.

    Args:
        use_long_model: When true, only a notice is printed; the evaluation
            still runs with the fast model.

    Returns:
        A list with one dict per test case, holding the keys
        ``description``, ``expected``, ``actual``, ``correct`` and
        ``latency_ms``. A test whose masking call raised is recorded with
        ``actual=0``, ``correct=False`` and ``latency_ms=0``.
    """
    banner = "=" * 80
    print(banner)
    print("SecMask Evaluation")
    print(banner)
    print()

    if use_long_model:
        print("⚠️  Note: Long model not available in this test (using fast model only)")
        print()

    results = []
    total_time = 0.0
    # Only runs that completed contribute to the latency average; errored
    # runs add no time, so counting them would drag the mean towards zero.
    timed_runs = 0

    for i, (text, expected_count, description) in enumerate(TEST_CASES, 1):
        print(f"Test {i}/{len(TEST_CASES)}: {description}")
        print(f"  Input: {text[:60]}{'...' if len(text) > 60 else ''}")

        # perf_counter is monotonic and high resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        try:
            masked = mask_text_moe(
                text,
                fast_model_dir="andrewandrewsen/distilbert-secret-masker",
                tau=0.80,
                enable_filters=True,
                enable_escalation=False,  # Don't escalate in simple eval
            )
            latency = (time.perf_counter() - start_time) * 1000  # seconds -> ms
            actual_count = count_secrets_in_masked(masked)
        except Exception as e:
            # Best-effort harness: one failing case must not abort the rest.
            print(f"  ❌ ERROR: {e}")
            results.append({
                'description': description,
                'expected': expected_count,
                'actual': 0,
                'correct': False,
                'latency_ms': 0
            })
        else:
            total_time += latency
            timed_runs += 1

            correct = actual_count == expected_count
            results.append({
                'description': description,
                'expected': expected_count,
                'actual': actual_count,
                'correct': correct,
                'latency_ms': latency
            })

            status = "✅ PASS" if correct else "❌ FAIL"
            print(f"  Output: {masked[:60]}{'...' if len(masked) > 60 else ''}")
            print(f"  Expected: {expected_count} secret(s), Got: {actual_count} secret(s)")
            print(f"  Latency: {latency:.2f}ms")
            print(f"  {status}")

        print()

    # Summary
    print(banner)
    print("Summary")
    print(banner)

    total_tests = len(results)
    passed_tests = sum(1 for r in results if r['correct'])
    accuracy = (passed_tests / total_tests) * 100 if total_tests > 0 else 0
    avg_latency = total_time / timed_runs if timed_runs > 0 else 0

    print(f"Total tests: {total_tests}")
    print(f"Passed: {passed_tests}")
    print(f"Failed: {total_tests - passed_tests}")
    print(f"Accuracy: {accuracy:.1f}%")
    print(f"Average latency: {avg_latency:.2f}ms")
    print()

    if passed_tests == total_tests:
        print("🎉 All tests passed!")
    else:
        print("⚠️  Some tests failed. Review results above.")

    return results


# Script entry point: run the evaluation with its default arguments
# (use_long_model=False) and keep the per-test results at module level.
if __name__ == "__main__":
    results = run_evaluation()