From e15b16ac95c7719c14740e29be261df1d4399934 Mon Sep 17 00:00:00 2001 From: Rex Lorenzo Date: Fri, 24 Apr 2026 17:38:18 -0700 Subject: [PATCH] fix(recurring): group by merchant and account, not merchant alone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two genuinely-monthly streams sharing a merchant string but posting to different accounts (e.g. two household members' "Ameriprise" debits) were merged into one bag whose median amount and average interval matched neither stream — surfacing as a fake biweekly row at the median of the two real amounts. --- src/data/recurring_detector.py | 12 ++++++++---- tests/test_recurring_detector.py | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/data/recurring_detector.py b/src/data/recurring_detector.py index 3ceec25..9eca1f5 100644 --- a/src/data/recurring_detector.py +++ b/src/data/recurring_detector.py @@ -27,12 +27,16 @@ def detect_recurring( """ cutoff = date.today() - timedelta(days=lookback_days) - # Group by merchant name - by_merchant: dict[str, list[dict]] = defaultdict(list) + # Group by (merchant name, account id). A merchant string like "Ameriprise" + # can cover independent recurring streams on different accounts (e.g. two + # household members' insurance debits) — merging them produces a polluted + # bag whose median amount and average interval describe neither stream. + by_group: dict[tuple[str, str], list[dict]] = defaultdict(list) for txn in transactions: merchant = (txn.get("merchant") or {}).get("name", "") if not merchant: continue + account_id = (txn.get("account") or {}).get("id", "") txn_date_str = txn.get("date", "") try: txn_date = date.fromisoformat(txn_date_str[:10]) @@ -40,10 +44,10 @@ def detect_recurring( continue if txn_date < cutoff: continue - by_merchant[merchant].append(txn) + by_group[(merchant, account_id)].append(txn) items: list[RecurringItem] = [] - for merchant, txns in by_merchant.items(): + for (merchant, _account_id), txns in by_group.items(): if len(txns) < min_occurrences: continue diff --git a/tests/test_recurring_detector.py b/tests/test_recurring_detector.py index 81d0ad7..66cf8c7 100644 --- a/tests/test_recurring_detector.py +++ b/tests/test_recurring_detector.py @@ -86,6 +86,28 @@ def test_detects_biweekly(self): assert items[0].frequency == "biweekly" assert items[0].amount == 2000.0 + def test_same_merchant_on_different_accounts_stays_split(self): + # Two genuinely-monthly Ameriprise streams on different accounts — + # one on the 16th for $1100, one on the 25th for $1000. Grouping by + # merchant alone would interleave them into a bag whose median is + # $1050 and whose avg interval lands in the biweekly bucket. + today = date.today() + txns = [ + _make_txn("Ameriprise", -1000.0, today - timedelta(days=89), account_id="karen"), + _make_txn("Ameriprise", -1100.0, today - timedelta(days=66), account_id="rex"), + _make_txn("Ameriprise", -1000.0, today - timedelta(days=58), account_id="karen"), + _make_txn("Ameriprise", -1100.0, today - timedelta(days=39), account_id="rex"), + _make_txn("Ameriprise", -1000.0, today - timedelta(days=30), account_id="karen"), + _make_txn("Ameriprise", -1100.0, today - timedelta(days=8), account_id="rex"), + ] + items = detect_recurring(txns) + assert len(items) == 2 + by_account = {item.account_id: item for item in items} + assert by_account["karen"].frequency == "monthly" + assert by_account["karen"].amount == -1000.0 + assert by_account["rex"].frequency == "monthly" + assert by_account["rex"].amount == -1100.0 + def test_includes_account_info(self): today = date.today() txns = [