Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions src/data/recurring_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,27 @@ def detect_recurring(
"""
cutoff = date.today() - timedelta(days=lookback_days)

# Group by merchant name
by_merchant: dict[str, list[dict]] = defaultdict(list)
# Group by (merchant name, account id). A merchant string like "Ameriprise"
# can cover independent recurring streams on different accounts (e.g. two
# household members' insurance debits) — merging them produces a polluted
# bag whose median amount and average interval describe neither stream.
by_group: dict[tuple[str, str], list[dict]] = defaultdict(list)
for txn in transactions:
merchant = (txn.get("merchant") or {}).get("name", "")
if not merchant:
continue
account_id = (txn.get("account") or {}).get("id", "")
txn_date_str = txn.get("date", "")
try:
txn_date = date.fromisoformat(txn_date_str[:10])
except (ValueError, TypeError):
continue
if txn_date < cutoff:
continue
by_merchant[merchant].append(txn)
by_group[(merchant, account_id)].append(txn)

items: list[RecurringItem] = []
for merchant, txns in by_merchant.items():
for (merchant, _account_id), txns in by_group.items():
if len(txns) < min_occurrences:
continue

Expand Down
22 changes: 22 additions & 0 deletions tests/test_recurring_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,28 @@ def test_detects_biweekly(self):
assert items[0].frequency == "biweekly"
assert items[0].amount == 2000.0

def test_same_merchant_on_different_accounts_stays_split(self):
    # Regression guard: one merchant name ("Ameriprise") billing two
    # different accounts roughly once a month each. If the detector pooled
    # them by merchant alone, the six charges would interleave: the median
    # amount would be -1050 (matching neither stream) and consecutive gaps
    # would average about 16 days instead of about a month.
    today = date.today()

    # (account id, amount, days before today), oldest first.
    schedule = [
        ("karen", -1000.0, 89),
        ("rex", -1100.0, 66),
        ("karen", -1000.0, 58),
        ("rex", -1100.0, 39),
        ("karen", -1000.0, 30),
        ("rex", -1100.0, 8),
    ]
    txns = [
        _make_txn(
            "Ameriprise",
            amount,
            today - timedelta(days=days_ago),
            account_id=account,
        )
        for account, amount, days_ago in schedule
    ]

    items = detect_recurring(txns)

    # One recurring item per account, each keeping its own cadence and amount.
    assert len(items) == 2
    by_account = {item.account_id: item for item in items}
    for account, expected_amount in (("karen", -1000.0), ("rex", -1100.0)):
        assert by_account[account].frequency == "monthly"
        assert by_account[account].amount == expected_amount

def test_includes_account_info(self):
today = date.today()
txns = [
Expand Down
Loading