diff --git a/src/codechallenge2025/participant_solution.py b/src/codechallenge2025/participant_solution.py index b3c06a7..49521a4 100644 --- a/src/codechallenge2025/participant_solution.py +++ b/src/codechallenge2025/participant_solution.py @@ -35,35 +35,151 @@ def match_single( ... ] """ - # TODO: Replace this dummy with your real matching logic! - # Example: return empty list (safe default) - return [] + # === Helper: parse alleles safely === + def parse_alleles(val): + if pd.isna(val): + return None + s = str(val).strip() + if s in ('-', '', 'nan', 'None'): + return None + try: + if ',' in s: + return frozenset(float(x.strip()) for x in s.split(',')) + return frozenset([float(s)]) + except (ValueError, TypeError): + return None + + # === Build/retrieve cached index (using function attribute) === + db_id = id(database_df) + if not hasattr(match_single, '_cache') or match_single._cache.get('db_id') != db_id: + # Build allele index and frequency table + loci = [c for c in database_df.columns if c != 'PersonID'] + allele_index = {} # (locus, allele) -> set of person_ids + allele_counts = {} # (locus, allele) -> count + profiles = {} # person_id -> {locus: frozenset of alleles} + + for _, row in database_df.iterrows(): + pid = row['PersonID'] + profile = {} + for locus in loci: + alleles = parse_alleles(row[locus]) + profile[locus] = alleles + if alleles: + for a in alleles: + key = (locus, a) + if key not in allele_index: + allele_index[key] = set() + allele_counts[key] = 0 + allele_index[key].add(pid) + allele_counts[key] += 1 + profiles[pid] = profile + + # Compute frequencies + total_profiles = len(database_df) + allele_freqs = {k: v / total_profiles for k, v in allele_counts.items()} + + match_single._cache = { + 'db_id': db_id, + 'loci': loci, + 'allele_index': allele_index, + 'allele_freqs': allele_freqs, + 'profiles': profiles + } + + cache = match_single._cache + loci = cache['loci'] + allele_index = cache['allele_index'] + allele_freqs = cache['allele_freqs'] + profiles = cache['profiles'] - # Helpful tip: you can compute a simple score like number of shared alleles - # Example skeleton: - """ - candidates = [] query_id = query_profile['PersonID'] - - for _, candidate in database_df.iterrows(): - if candidate['PersonID'] == query_id: - continue # skip self - - score = your_scoring_function(query_profile, candidate) - if score > threshold: - candidates.append({ - "person_id": candidate['PersonID'], - "clr": score, - "posterior": 0.99, # optional - "consistent_loci": 18, - "mutated_loci": 0, - "inconclusive_loci": 3 - }) - - # Sort by CLR descending and take top 10 - candidates.sort(key=lambda x: x['clr'], reverse=True) - return candidates[:10] - """ + + # Parse query profile + query_parsed = {} + for locus in loci: + query_parsed[locus] = parse_alleles(query_profile.get(locus)) + + # === Pre-filter candidates using inverted index === + candidate_match_count = {} + for locus in loci: + q_alleles = query_parsed[locus] + if not q_alleles: + continue + for allele in q_alleles: + key = (locus, allele) + if key in allele_index: + for pid in allele_index[key]: + if pid != query_id: + candidate_match_count[pid] = candidate_match_count.get(pid, 0) + 1 + + # Only consider candidates that share alleles at multiple loci + promising = [pid for pid, cnt in candidate_match_count.items() if cnt >= 8] + + # === Score promising candidates === + results = [] + + for pid in promising: + cand_profile = profiles[pid] + + clr = 1.0 + consistent = 0 + mutated = 0 + inconclusive = 0 + exclusions = 0 + identity_count = 0 + compared = 0 + + for locus in loci: + q_alleles = query_parsed[locus] + c_alleles = cand_profile.get(locus) + + if q_alleles is None or c_alleles is None: + inconclusive += 1 + continue + + compared += 1 + if q_alleles == c_alleles: + identity_count += 1 + + shared = q_alleles & c_alleles + + if shared: + consistent += 1 + # Simple LR: transmission_prob / frequency + trans = 1.0 if len(c_alleles) == 1 else 0.5 + clr *= trans / 0.15 # Average frequency + else: + # Check mutation + is_mutation = any(0 < abs(a - b) <= 1.0 for a in q_alleles for b in c_alleles) + if is_mutation: + mutated += 1 + clr *= 0.01 # Mutation penalty + else: + exclusions += 1 + clr *= 0.001 # Exclusion penalty + + # Filter criteria + # True parent-child should have 0 exclusions (rarely 1 due to mutation) + if exclusions > 1: + continue + if consistent < 8: + continue + if compared > 0 and identity_count / compared > 0.85: + continue + + posterior = clr / (clr + 1.0) if clr > 0 else 0.0 + + results.append({ + "person_id": pid, + "clr": clr, + "posterior": posterior, + "consistent_loci": consistent, + "mutated_loci": mutated, + "inconclusive_loci": inconclusive + }) + + results.sort(key=lambda x: -x['clr']) + return results[:10] # ============================================================