pyday-iran · tavallaie · Dec 26, 2025 · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025
diff --git a/src/codechallenge2025/participant_solution.py b/src/codechallenge2025/participant_solution.py
@@ -35,35 +35,151 @@ def match_single(
             ...
         ]
     """
-    # TODO: Replace this dummy with your real matching logic!
-    # Example: return empty list (safe default)
-    return []
+    # === Helper: parse alleles safely ===
+    def parse_alleles(val):
+        if pd.isna(val):
+            return None
+        s = str(val).strip()
+        if s in ('-', '', 'nan', 'None'):
+            return None
+        try:
+            if ',' in s:
+                return frozenset(float(x.strip()) for x in s.split(','))
+            return frozenset([float(s)])
+        except (ValueError, TypeError):
+            return None
+
+    # === Build/retrieve cached index (using function attribute) ===
+    db_id = id(database_df)
+    if not hasattr(match_single, '_cache') or match_single._cache.get('db_id') != db_id:
+        # Build allele index and frequency table
+        loci = [c for c in database_df.columns if c != 'PersonID']
+        allele_index = {}  # (locus, allele) -> set of person_ids
+        allele_counts = {}  # (locus, allele) -> count
+        profiles = {}  # person_id -> {locus: frozenset of alleles}
+
+        for _, row in database_df.iterrows():
+            pid = row['PersonID']
+            profile = {}
+            for locus in loci:
+                alleles = parse_alleles(row[locus])
+                profile[locus] = alleles
+                if alleles:
+                    for a in alleles:
+                        key = (locus, a)
+                        if key not in allele_index:
+                            allele_index[key] = set()
+                            allele_counts[key] = 0
+                        allele_index[key].add(pid)
+                        allele_counts[key] += 1
+            profiles[pid] = profile
+
+        # Compute frequencies
+        total_profiles = len(database_df)
+        allele_freqs = {k: v / total_profiles for k, v in allele_counts.items()}
+
+        match_single._cache = {
+            'db_id': db_id,
+            'loci': loci,
+            'allele_index': allele_index,
+            'allele_freqs': allele_freqs,
+            'profiles': profiles
+        }
+
+    cache = match_single._cache
+    loci = cache['loci']
+    allele_index = cache['allele_index']
+    allele_freqs = cache['allele_freqs']
+    profiles = cache['profiles']
 
-    # Helpful tip: you can compute a simple score like number of shared alleles
-    # Example skeleton:
-    """
-    candidates = []
     query_id = query_profile['PersonID']
-
-    for _, candidate in database_df.iterrows():
-        if candidate['PersonID'] == query_id:
-            continue  # skip self
-
-        score = your_scoring_function(query_profile, candidate)
-        if score > threshold:
-            candidates.append({
-                "person_id": candidate['PersonID'],
-                "clr": score,
-                "posterior": 0.99,  # optional
-                "consistent_loci": 18,
-                "mutated_loci": 0,
-                "inconclusive_loci": 3
-            })
-
-    # Sort by CLR descending and take top 10
-    candidates.sort(key=lambda x: x['clr'], reverse=True)
-    return candidates[:10]
-    """
+
+    # Parse query profile
+    query_parsed = {}
+    for locus in loci:
+        query_parsed[locus] = parse_alleles(query_profile.get(locus))
+
+    # === Pre-filter candidates using inverted index ===
+    candidate_match_count = {}
+    for locus in loci:
+        q_alleles = query_parsed[locus]
+        if not q_alleles:
+            continue
+        for allele in q_alleles:
+            key = (locus, allele)
+            if key in allele_index:
+                for pid in allele_index[key]:
+                    if pid != query_id:
+                        candidate_match_count[pid] = candidate_match_count.get(pid, 0) + 1
+
+    # Only consider candidates that share alleles at multiple loci
+    promising = [pid for pid, cnt in candidate_match_count.items() if cnt >= 8]
+
+    # === Score promising candidates ===
+    results = []
+
+    for pid in promising:
+        cand_profile = profiles[pid]
+
+        clr = 1.0
+        consistent = 0
+        mutated = 0
+        inconclusive = 0
+        exclusions = 0
+        identity_count = 0
+        compared = 0
+
+        for locus in loci:
+            q_alleles = query_parsed[locus]
+            c_alleles = cand_profile.get(locus)
+
+            if q_alleles is None or c_alleles is None:
+                inconclusive += 1
+                continue
+
+            compared += 1
+            if q_alleles == c_alleles:
+                identity_count += 1
+
+            shared = q_alleles & c_alleles
+
+            if shared:
+                consistent += 1
+                # Simple LR: transmission_prob / frequency
+                trans = 1.0 if len(c_alleles) == 1 else 0.5
+                clr *= trans / 0.15  # Average frequency
+            else:
+                # Check mutation
+                is_mutation = any(0 < abs(a - b) <= 1.0 for a in q_alleles for b in c_alleles)
+                if is_mutation:
+                    mutated += 1
+                    clr *= 0.01  # Mutation penalty
+                else:
+                    exclusions += 1
+                    clr *= 0.001  # Exclusion penalty
+
+        # Filter criteria
+        # True parent-child should have 0 exclusions (rarely 1 due to mutation)
+        if exclusions > 1:
+            continue
+        if consistent < 8:
+            continue
+        if compared > 0 and identity_count / compared > 0.85:
+            continue
+
+        posterior = clr / (clr + 1.0) if clr > 0 else 0.0
+
+        results.append({
+            "person_id": pid,
+            "clr": clr,
+            "posterior": posterior,
+            "consistent_loci": consistent,
+            "mutated_loci": mutated,
+            "inconclusive_loci": inconclusive
+        })
+
+    results.sort(key=lambda x: -x['clr'])
+    return results[:10]
 
 
 # ============================================================