Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions graphify/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ def build_from_json(extraction: dict, *, directed: bool = False) -> nx.Graph:
hyperedges = extraction.get("hyperedges", [])
if hyperedges:
G.graph["hyperedges"] = hyperedges
# Preserve cross-file call resolution stats (degree cap, truncation counts)
# so downstream exporters can surface them in graph.json without breaking
# existing consumers that expect the plain `nodes`/`edges` shape.
stats = extraction.get("cross_file_call_stats")
if stats is not None:
G.graph["cross_file_call_stats"] = stats
return G


Expand Down
3 changes: 3 additions & 0 deletions graphify/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,9 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str) ->
conf = link.get("confidence", "EXTRACTED")
link["confidence_score"] = _CONFIDENCE_SCORE_DEFAULTS.get(conf, 1.0)
data["hyperedges"] = getattr(G, "graph", {}).get("hyperedges", [])
stats = getattr(G, "graph", {}).get("cross_file_call_stats")
if stats is not None:
data["cross_file_call_stats"] = stats
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)

Expand Down
144 changes: 128 additions & 16 deletions graphify/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3027,7 +3027,12 @@ def _check_tree_sitter_version() -> None:
)


def extract(paths: list[Path], cache_root: Path | None = None) -> dict:
def extract(
paths: list[Path],
cache_root: Path | None = None,
*,
max_ambiguity_fanout: int | None = None,
) -> dict:
"""Extract AST nodes and edges from a list of code files.

Two-pass process:
Expand All @@ -3040,6 +3045,15 @@ def extract(paths: list[Path], cache_root: Path | None = None) -> dict:
cache_root: explicit root for graphify-out/cache/ (overrides the
inferred common path prefix). Pass Path('.') when running on a
subdirectory so the cache stays at ./graphify-out/cache/.
max_ambiguity_fanout: cap on the number of AMBIGUOUS edges the
cross-file call resolver will emit for a single callee label.
When a normalised label matches more than this many candidate
nodes, the resolver drops the whole fan-out and records the
label under ``cross_file_call_stats.truncated_examples`` — this
prevents generic verbs (``.get()``, ``.all()``, ``.delete()``)
from inflating the graph with thousands of meaningless edges.
``None`` (default) falls back to the ``GRAPHIFY_MAX_AMBIGUITY_FANOUT``
env var, then to ``20``.
"""
_check_tree_sitter_version()
per_file: list[dict] = []
Expand Down Expand Up @@ -3141,39 +3155,137 @@ def extract(paths: list[Path], cache_root: Path | None = None) -> dict:
# Cross-file call resolution for all languages
# Each extractor saved unresolved calls in raw_calls. Now that we have all
# nodes from all files, resolve any callee that exists in another file.
global_label_to_nid: dict[str, str] = {}
#
# The lookup table is keyed by normalised lowercase label and stores a list
# of candidate nids (not just one). When multiple nodes share the same
# normalised label — which is common for CRUD verbs like `.get()`, `.all()`,
# `.delete()` and for cross-language collisions — a plain ``dict[str, str]``
# silently drops N-1 candidates via dict-overwrite. Preserving every
# candidate is a prerequisite for correct resolution.
#
# Guardrails:
# * Each bucket stores unique nids (defends against the same node id
# being appended twice when two of its label variants normalise to the
# same key, or when `all_nodes` transiently contains duplicates from
# layered extractors).
# * When the candidate pool exceeds `max_ambiguity_fanout`, the whole
# fan-out is dropped instead of emitting a flood of AMBIGUOUS edges.
# Picking a "random" winner at that degree would just resurrect the
# old dict-overwrite bug, and keeping all N edges explodes the graph
# for generic verbs that AST alone cannot disambiguate.
from collections import defaultdict
import os as _os

if max_ambiguity_fanout is None:
_env = _os.environ.get("GRAPHIFY_MAX_AMBIGUITY_FANOUT")
if _env is not None:
try:
max_ambiguity_fanout = int(_env)
except ValueError:
max_ambiguity_fanout = 20
else:
max_ambiguity_fanout = 20

global_label_to_nids: dict[str, list[str]] = defaultdict(list)
_seen_per_bucket: dict[str, set[str]] = defaultdict(set)
for n in all_nodes:
raw = n.get("label", "")
normalised = raw.strip("()").lstrip(".")
if normalised:
global_label_to_nid[normalised.lower()] = n["id"]
key = normalised.lower()
nid = n["id"]
if nid not in _seen_per_bucket[key]:
_seen_per_bucket[key].add(nid)
global_label_to_nids[key].append(nid)

existing_pairs = {(e["source"], e["target"]) for e in all_edges}

resolved_single = 0
resolved_ambiguous = 0
truncated_high_degree = 0
truncated_labels_seen: list[str] = []
_truncated_seen_set: set[str] = set()

for result in per_file:
for rc in result.get("raw_calls", []):
callee = rc.get("callee", "")
if not callee:
continue
tgt = global_label_to_nid.get(callee.lower())
caller = rc["caller_nid"]
if tgt and tgt != caller and (caller, tgt) not in existing_pairs:
existing_pairs.add((caller, tgt))
all_edges.append({
"source": caller,
"target": tgt,
"relation": "calls",
"confidence": "INFERRED",
"confidence_score": 0.8,
"source_file": rc.get("source_file", ""),
"source_location": rc.get("source_location"),
"weight": 1.0,
})
# ``dict.fromkeys`` preserves order while deduping — belt-and-braces
# on top of the per-bucket dedup above. Self-reference is filtered
# out: a node never calls itself through its own normalised label.
candidates = list(dict.fromkeys(
c for c in global_label_to_nids.get(callee.lower(), [])
if c != caller
))
if not candidates:
continue

# Invariant: candidates contains no duplicate nids. If this assert
# ever fires, the upstream per-bucket dedup guarantees have been broken.
assert len(set(candidates)) == len(candidates), (
f"duplicate nids in candidates for callee={callee!r}: {candidates}"
)

if len(candidates) == 1:
tgt = candidates[0]
if (caller, tgt) not in existing_pairs:
existing_pairs.add((caller, tgt))
all_edges.append({
"source": caller,
"target": tgt,
"relation": "calls",
"confidence": "INFERRED",
"confidence_score": 0.8,
"source_file": rc.get("source_file", ""),
"source_location": rc.get("source_location"),
"weight": 1.0,
})
resolved_single += 1
elif len(candidates) > max_ambiguity_fanout:
# Generic verbs (``.get()``, ``.all()``, ``.delete()``) with
# 30+ candidates are AST-undecidable. Drop the fan-out and
# surface the label in stats so downstream tools can audit.
truncated_high_degree += 1
if callee not in _truncated_seen_set:
_truncated_seen_set.add(callee)
if len(truncated_labels_seen) < 5:
truncated_labels_seen.append(callee)
else:
# Multi-candidate → fan out one AMBIGUOUS edge per candidate.
# Picking an arbitrary winner here is indistinguishable from the
# old dict-overwrite bug; the downstream consumers need the
# ``ambiguity_degree`` signal to triage the result.
degree = len(candidates)
for tgt in candidates:
if (caller, tgt) not in existing_pairs:
existing_pairs.add((caller, tgt))
all_edges.append({
"source": caller,
"target": tgt,
"relation": "calls",
"confidence": "AMBIGUOUS",
"confidence_score": 0.2,
"source_file": rc.get("source_file", ""),
"source_location": rc.get("source_location"),
"ambiguity_degree": degree,
"weight": 1.0,
})
resolved_ambiguous += 1

return {
"nodes": all_nodes,
"edges": all_edges,
"input_tokens": 0,
"output_tokens": 0,
"cross_file_call_stats": {
"resolved_single": resolved_single,
"resolved_ambiguous": resolved_ambiguous,
"truncated_high_degree": truncated_high_degree,
"truncated_examples": truncated_labels_seen,
"max_ambiguity_fanout": max_ambiguity_fanout,
},
}


Expand Down
3 changes: 3 additions & 0 deletions graphify/watch.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def _rebuild_code(watch_path: Path, *, follow_symlinks: bool = False) -> bool:
"hyperedges": existing.get("hyperedges", []),
"input_tokens": 0,
"output_tokens": 0,
# Preserve fresh AST-pass cross_file_call_stats; merging
# with a stale semantic pass would misreport degree caps.
"cross_file_call_stats": result.get("cross_file_call_stats"),
}
except Exception:
pass # corrupt graph.json - proceed with AST-only
Expand Down
9 changes: 9 additions & 0 deletions tests/fixtures/collision/a.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Fixture for cross-file call-collision tests — module A."""


def shared():
    """Return module A's marker string.

    The name ``shared`` deliberately collides with ``b.py``'s function of
    the same name, so the cross-file resolver sees two candidates for it.
    """
    return "a"


def only_in_a():
    """Return module A's unique marker string.

    This name exists only in ``a.py``, so a cross-file call to it has
    exactly one candidate and must resolve to a single INFERRED edge.
    """
    return "a-only"
5 changes: 5 additions & 0 deletions tests/fixtures/collision/b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Fixture for cross-file call-collision tests — module B (duplicate name)."""


def shared():
    """Return module B's marker string.

    Duplicate of the name defined in ``a.py`` — the collision is the point
    of this fixture: the resolver must keep both candidates, not overwrite.
    """
    return "b"
9 changes: 9 additions & 0 deletions tests/fixtures/collision/c.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Fixture for cross-file call-collision tests — unique-candidate caller.

c.py calls `only_in_a()` which exists in only one other file (a.py),
so the call must resolve to a single INFERRED edge at 0.8.
"""


def caller_unique():
    """Call ``only_in_a``, which is defined only in ``a.py``.

    The name is undefined in this module (hence the noqa); the cross-file
    resolution pass is expected to emit one INFERRED edge (score 0.8) to
    the single candidate.
    """
    return only_in_a()  # noqa: F821 — resolved via cross-file pass
10 changes: 10 additions & 0 deletions tests/fixtures/collision/d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Fixture for cross-file call-collision tests — ambiguous caller.

d.py calls `shared()` which exists in both a.py and b.py, so the cross-file
resolution pass must emit two AMBIGUOUS edges (score 0.2, ambiguity_degree=2)
instead of silently picking one winner.
"""


def caller_ambiguous():
    """Call ``shared``, which is defined in both ``a.py`` and ``b.py``.

    The name is undefined in this module (hence the noqa); the cross-file
    resolution pass must fan out one AMBIGUOUS edge per candidate (score
    0.2, ambiguity_degree=2) rather than silently picking one winner.
    """
    return shared()  # noqa: F821 — resolved via cross-file pass
Loading