Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 143 additions & 27 deletions src/codechallenge2025/participant_solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,35 +35,151 @@ def match_single(
...
]
"""
# TODO: Replace this dummy with your real matching logic!
# Example: return empty list (safe default)
return []
# === Helper: parse alleles safely ===
def parse_alleles(val):
if pd.isna(val):
return None
s = str(val).strip()
if s in ('-', '', 'nan', 'None'):
return None
try:
if ',' in s:
return frozenset(float(x.strip()) for x in s.split(','))
return frozenset([float(s)])
except (ValueError, TypeError):
return None

# === Build/retrieve cached index (using function attribute) ===
db_id = id(database_df)
if not hasattr(match_single, '_cache') or match_single._cache.get('db_id') != db_id:
# Build allele index and frequency table
loci = [c for c in database_df.columns if c != 'PersonID']
allele_index = {} # (locus, allele) -> set of person_ids
allele_counts = {} # (locus, allele) -> count
profiles = {} # person_id -> {locus: frozenset of alleles}

for _, row in database_df.iterrows():
pid = row['PersonID']
profile = {}
for locus in loci:
alleles = parse_alleles(row[locus])
profile[locus] = alleles
if alleles:
for a in alleles:
key = (locus, a)
if key not in allele_index:
allele_index[key] = set()
allele_counts[key] = 0
allele_index[key].add(pid)
allele_counts[key] += 1
profiles[pid] = profile

# Compute frequencies
total_profiles = len(database_df)
allele_freqs = {k: v / total_profiles for k, v in allele_counts.items()}

match_single._cache = {
'db_id': db_id,
'loci': loci,
'allele_index': allele_index,
'allele_freqs': allele_freqs,
'profiles': profiles
}

cache = match_single._cache
loci = cache['loci']
allele_index = cache['allele_index']
allele_freqs = cache['allele_freqs']
profiles = cache['profiles']

# Helpful tip: you can compute a simple score like number of shared alleles
# Example skeleton:
"""
candidates = []
query_id = query_profile['PersonID']

for _, candidate in database_df.iterrows():
if candidate['PersonID'] == query_id:
continue # skip self

score = your_scoring_function(query_profile, candidate)
if score > threshold:
candidates.append({
"person_id": candidate['PersonID'],
"clr": score,
"posterior": 0.99, # optional
"consistent_loci": 18,
"mutated_loci": 0,
"inconclusive_loci": 3
})

# Sort by CLR descending and take top 10
candidates.sort(key=lambda x: x['clr'], reverse=True)
return candidates[:10]
"""

# Parse query profile
query_parsed = {}
for locus in loci:
query_parsed[locus] = parse_alleles(query_profile.get(locus))

# === Pre-filter candidates using inverted index ===
candidate_match_count = {}
for locus in loci:
q_alleles = query_parsed[locus]
if not q_alleles:
continue
for allele in q_alleles:
key = (locus, allele)
if key in allele_index:
for pid in allele_index[key]:
if pid != query_id:
candidate_match_count[pid] = candidate_match_count.get(pid, 0) + 1

# Only consider candidates that share alleles at multiple loci
promising = [pid for pid, cnt in candidate_match_count.items() if cnt >= 8]

# === Score promising candidates ===
results = []

for pid in promising:
cand_profile = profiles[pid]

clr = 1.0
consistent = 0
mutated = 0
inconclusive = 0
exclusions = 0
identity_count = 0
compared = 0

for locus in loci:
q_alleles = query_parsed[locus]
c_alleles = cand_profile.get(locus)

if q_alleles is None or c_alleles is None:
inconclusive += 1
continue

compared += 1
if q_alleles == c_alleles:
identity_count += 1

shared = q_alleles & c_alleles

if shared:
consistent += 1
# Simple LR: transmission_prob / frequency
trans = 1.0 if len(c_alleles) == 1 else 0.5
clr *= trans / 0.15 # Average frequency
else:
# Check mutation
is_mutation = any(0 < abs(a - b) <= 1.0 for a in q_alleles for b in c_alleles)
if is_mutation:
mutated += 1
clr *= 0.01 # Mutation penalty
else:
exclusions += 1
clr *= 0.001 # Exclusion penalty

# Filter criteria
# True parent-child should have 0 exclusions (rarely 1 due to mutation)
if exclusions > 1:
continue
if consistent < 8:
continue
if compared > 0 and identity_count / compared > 0.85:
continue

posterior = clr / (clr + 1.0) if clr > 0 else 0.0

results.append({
"person_id": pid,
"clr": clr,
"posterior": posterior,
"consistent_loci": consistent,
"mutated_loci": mutated,
"inconclusive_loci": inconclusive
})

results.sort(key=lambda x: -x['clr'])
return results[:10]


# ============================================================
Expand Down
Loading