-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpatch_worst_commit.py
More file actions
111 lines (93 loc) · 3.72 KB
/
patch_worst_commit.py
File metadata and controls
111 lines (93 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python3
"""
Patch precomputed.json with worst_commit_toxicity and worst_commit_msg
using already-saved raw_data/<user>/commits.json.
No GitHub API calls needed.
"""
import json
from pathlib import Path
OUTPUT_FILE = Path("precomputed.json")
RAW_DATA_DIR = Path("raw_data")

# Score keys read from each Detoxify prediction, in the order they are scanned.
_AXES = ("toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack")
# Number of commit messages sent to the model per predict() call.
_BATCH_SIZE = 32


def _find_commits_file(raw_data_dir, username):
    """Return the cached commits.json for *username*, or None if there is none.

    The lowercased directory name is tried first, then the exact-case name.
    """
    for dirname in (username.lower(), username):
        candidate = raw_data_dir / dirname / "commits.json"
        if candidate.exists():
            return candidate
    return None


def _load_toxicity_model():
    """Import torch/detoxify on demand and return ``(model, torch_module)``."""
    # Imported here so runs that never need scoring do not pay the import cost
    # (and do not require the packages to be installed).
    import torch
    from detoxify import Detoxify

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Detoxify("original", device=device)
    print(f"Toxicity model loaded on {device}")
    return model, torch


def _score_texts(model, torch, texts):
    """Run *texts* through *model* in batches and return ``{axis: [scores...]}``."""
    scores = {axis: [] for axis in _AXES}
    for start in range(0, len(texts), _BATCH_SIZE):
        batch = texts[start : start + _BATCH_SIZE]
        with torch.no_grad():
            results = model.predict(batch)
        for axis in _AXES:
            scores[axis].extend(results[axis])
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return scores


def _pick_worst(scores, count):
    """Return ``(index, score)`` of the highest score on any axis.

    The index is None when no score exceeds 0.0.
    """
    worst_idx = None
    worst_score = 0.0
    for idx in range(count):
        for axis in _AXES:
            score = float(scores[axis][idx])
            if score > worst_score:
                worst_score = score
                worst_idx = idx
    return worst_idx, worst_score


def main(output_file: Path = OUTPUT_FILE, raw_data_dir: Path = RAW_DATA_DIR) -> None:
    """Add worst_commit_toxicity / worst_commit_msg to every user in the JSON file.

    For each user in *output_file* that does not already have a non-empty
    ``worst_commit_msg``, the cached ``commits.json`` under *raw_data_dir* is
    scored and the message with the highest score on any axis is recorded.
    Users with no cached or empty commits get ``0.0`` / ``""`` defaults.
    The updated mapping is written back to *output_file*, and a
    ``worst_commit.json`` is saved next to the commits file that was scored.

    Args:
        output_file: JSON file mapping username -> info dict; rewritten in place.
        raw_data_dir: Directory holding ``<user>/commits.json`` caches.
    """
    if not output_file.exists():
        print("No precomputed.json found.")
        return

    data = json.loads(output_file.read_text(encoding="utf-8"))
    print(f"Loaded {len(data)} users from {output_file}")

    # The model is loaded lazily, only once a user actually needs scoring.
    model = None
    torch = None
    patched = 0

    for username, info in data.items():
        # Skip if already has worst commit data
        if info.get("worst_commit_msg"):
            print(f" {username}: already has worst_commit_msg, skipping")
            patched += 1
            continue

        commits_file = _find_commits_file(raw_data_dir, username)
        if commits_file is None:
            print(f" {username}: no cached commits found, setting defaults")
            info["worst_commit_toxicity"] = 0.0
            info["worst_commit_msg"] = ""
            continue

        texts = json.loads(commits_file.read_text(encoding="utf-8"))
        if not texts:
            print(f" {username}: empty commits, setting defaults")
            info["worst_commit_toxicity"] = 0.0
            info["worst_commit_msg"] = ""
            continue

        if model is None:
            model, torch = _load_toxicity_model()

        all_scores = _score_texts(model, torch, texts)
        worst_idx, worst_score = _pick_worst(all_scores, len(texts))

        if worst_idx is not None:
            info["worst_commit_toxicity"] = worst_score
            info["worst_commit_msg"] = texts[worst_idx]
            # Also save worst_commit.json
            worst_data = {
                "message": texts[worst_idx],
                "toxicity_score": worst_score,
                "all_scores": {axis: float(all_scores[axis][worst_idx]) for axis in _AXES},
            }
            # Write beside the commits.json that was actually loaded; that may
            # be the lowercased directory rather than the exact-case one.
            worst_file = commits_file.parent / "worst_commit.json"
            worst_file.write_text(json.dumps(worst_data, indent=2), encoding="utf-8")
        else:
            info["worst_commit_toxicity"] = 0.0
            info["worst_commit_msg"] = ""

        patched += 1
        preview = (texts[worst_idx] if worst_idx is not None else "")[:60]
        print(f" {username}: worst={worst_score:.3f} \"{preview}\"")

    output_file.write_text(json.dumps(data, indent=2), encoding="utf-8")
    print(f"\nDone. Patched {patched}/{len(data)} users in {output_file}")
# Run the patch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()