From 48c6a47ff20714aa2b0eb148f36ea0632458f36a Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 12:45:56 +0100
Subject: [PATCH 1/7] main function that writes new pdb files

---
 deeprank2/tools/pdbprep/__init__.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 deeprank2/tools/pdbprep/__init__.py

diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py
new file mode 100644
index 000000000..e5d63ef94
--- /dev/null
+++ b/deeprank2/tools/pdbprep/__init__.py
@@ -0,0 +1,28 @@
+# All code in this subpackage has been adapted from https://github.com/DeepRank/pdbprep,
+# which is published under an Apache 2.0 licence
+
+import sys
+
+
+def write_pdb(new_pdb, pdbfh):
+    try:
+        _buffer = []
+        _buffer_size = 5000  # write N lines at a time
+        for lineno, line in enumerate(new_pdb):
+            if not (lineno % _buffer_size):
+                sys.stdout.write("".join(_buffer))
+                _buffer = []
+            _buffer.append(line)
+
+        sys.stdout.write("".join(_buffer))
+        sys.stdout.flush()
+    except OSError:
+        # This is here to catch Broken Pipes
+        # for example to use 'head' or 'tail' without
+        # the error message showing up
+        pass
+
+    # last line of the script
+    # We can close it even if it is sys.stdin
+    pdbfh.close()
+    sys.exit(0)

From 78e07af5be7b299c0702229d29a41f1ac64736a8 Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 14:03:29 +0100
Subject: [PATCH 2/7] add pruning pdb file (steps 1-3 from pdbprep)

---
 deeprank2/tools/pdbprep/__init__.py | 45 ++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py
index e5d63ef94..c67fcfde1 100644
--- a/deeprank2/tools/pdbprep/__init__.py
+++ b/deeprank2/tools/pdbprep/__init__.py
@@ -2,9 +2,12 @@
 # which is published under an Apache 2.0 licence
 
 import sys
+from collections.abc import Generator
+from typing import TextIO
 
 
-def write_pdb(new_pdb, pdbfh):
+def write_pdb(new_pdb: list, pdbfh: TextIO) -> None:
+    """Writes new pdb files."""
     try:
         _buffer = []
         _buffer_size = 5000  # write N lines at a time
@@ -26,3 +29,43 @@ def write_pdb(new_pdb, pdbfh):
     # We can close it even if it is sys.stdin
     pdbfh.close()
     sys.exit(0)
+
+
+def _prune_records(fhandle: TextIO) -> Generator[str]:
+    """Prune records before processing.
+
+    Scraps non-atomic records and records from water molecules.
+    Replaces non-standard residue names by their standard counterparts.
+    """
+    atomic_record = ("ATOM", "HETATM")  # TODO: check if we need to keep ANISOU and TER records as well?
+    water = "HOH"
+    standard_resnames = {
+        "MSE": "MET",
+        "HIP": "HIS",
+        "HIE": "HIS",
+        "HID": "HIS",
+        "HSE": "HIS",
+        "HSD": "HIS",
+    }
+
+    for record in fhandle:
+        resname = record[17:20]
+        if record.startswith(atomic_record) and resname != water:
+            standardized_resname = standard_resnames.get(resname, resname)
+            yield record[:17] + standardized_resname + record[20:]
+
+
+def pdb_prep(fhandle: TextIO) -> None:
+    """Run all steps from pdb prep repo."""
+    # step 1 - keep coordinates: removes non coordinate lines for simplicity
+    # step 2 - delresname: remove waters
+    # step 3 - rplresname: convert residue names to standard names, ex: MSE to MET
+    new_pdb = _prune_records(fhandle)
+
+    # step 4 - selaltloc: select most probable alternative location
+
+    # step 5 - fixinsert: fix inserts
+    # step 6 - sort: sort chains and residues, necessary for OpenMM
+    # step 7 - reres: renumber residues from 1
+    # step 8 - reatom: renumber atoms from 1
+    # step 9 - tidy: tidy cleans the PDB, adds TER, etc.

From 845a892723fed6fead47a11dbaf9ddd83c7da7be Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 16:33:19 +0100
Subject: [PATCH 3/7] style: ruff settings regarding TODOs

---
 pyproject.toml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7978fdbfa..86793f8c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,7 @@ ignore = [
     "PLR0913", # Too many arguments in function definition
     "D102", # Missing docstring in public method
     # Unwanted
-    "FBT", # Using boolean arguments
+    "FBT", # Disallow using booleans as function arguments
     "ANN101", # Missing type annotation for `self` in method
     "ANN102", # Missing type annotation for `cls` in classmethod
     "ANN204", # Missing return type annotation for special (dunder) method
@@ -87,6 +87,10 @@ ignore = [
     "S311", # insecure random generators
     "PT011", # pytest-raises-too-broad
     "SIM108", # Use ternary operator
+    # TODO formatting
+    "TD002", # Missing TODO author
+    "TD003", # Missing TODO link
+    "FIX002", # Consider resolving the issue instead
     # Unwanted docstrings
     "D100", # Missing module docstring
     "D104", # Missing public package docstring

From 8c43bbfd1e414ec920874571a38422e5c040a52a Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 18:41:30 +0100
Subject: [PATCH 4/7] copy step 4 from pdbprep (selaltloc)

---
 deeprank2/tools/pdbprep/__init__.py | 137 ++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)

diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py
index c67fcfde1..a96a3f9b1 100644
--- a/deeprank2/tools/pdbprep/__init__.py
+++ b/deeprank2/tools/pdbprep/__init__.py
@@ -55,6 +55,143 @@ def _prune_records(fhandle: TextIO) -> Generator[str]:
             yield record[:17] + standardized_resname + record[20:]
 
 
+def run(fhandle):
+    records = ("ATOM", "HETATM")
+    # terminators = ('TER', 'END', 'CONECT', 'END', 'ENDMDL', 'MODEL')
+    # meaningful = records + terminators
+
+    # register atom information
+    register = dict()
+
+    # register comment lines
+    others = []
+
+    # register current chain
+    chain = None
+    prev_chain = None
+
+    # keep record of the line number. This will be used to sort lines
+    # after selecting the desired alternative location
+    nline = 0
+
+    # the loop will collect information on the different atoms
+    # throughout the PDB file until a new chain or any terminal line is
+    # found. At that point, the collected information is flushed because
+    # all altlocs for that block have been defined.
+    for line in fhandle:
+        nline += 1
+
+        if line.startswith(records):
+            # here resnum + insertion code are taken to identify
+            # different residues
+            resnum = line[22:27]
+            atomname = line[12:16]
+            altloc = line[16]
+            chain = line[21:22]
+
+            # flush lines because we enter a new chain
+            if chain != prev_chain:
+                # the "yield from" statement is avoided to keep
+                # compatibility with Python 2.7
+                for _line in _flush(register):
+                    yield _line
+
+                # Python 2.7 compatibility. Do not use .clear() method
+                # restart help variables
+                del register, others
+                register, others = dict(), []
+
+            # organizes information hierarchically
+            resnum_d = register.setdefault(resnum, {})
+            atomname_d = resnum_d.setdefault(atomname, {})
+            altloc_d = atomname_d.setdefault(altloc, [])
+
+            # adds info to dictionary
+            altloc_d.append((nline, line))
+
+        # flush information because we reached the end of a block
+        # elif line.startswith(terminators):
+        #     for _line in _flush(register):
+        #         yield _line
+
+        #     del register, others
+        #     register, others = dict(), []
+
+        #     yield line  # yield the current line after flush
+
+        prev_chain = chain
+
+    # at the end of the PDB, flush the remaining lines
+    for _line in _flush(register):
+        yield _line
+
+
+def _flush(register):
+    """Processes the collected atoms according to the selaltloc option."""
+    lines_to_yield = []
+
+    atom_lines = ("ATOM", "HETATM")
+
+    # anisou lines are treated specially
+    anisou_lines = ("ANISOU",)
+
+    for resnum, atomnames in register.items():
+        for atomname, altlocs in atomnames.items():
+            # gathers all alternative locations for the atom
+            all_lines = []
+            for altloc, lines in altlocs.items():
+                all_lines.extend(lines)
+
+            # identifies the highest occupancy combining dictionary
+            # and sorting
+            new = {}
+            for line_number, line in all_lines:
+                if line.startswith(atom_lines):
+                    occupancy_number = line[54:60]
+                    list_ = new.setdefault(occupancy_number, [])
+                    list_.append((line_number, line))
+
+                # assumes ANISOU succeed the respective ATOM line
+                elif line.startswith(anisou_lines):
+                    list_.append((line_number, line))
+
+            # sort keys by occupancy
+            keys_ = sorted(new.keys(), key=lambda x: float(x.strip()), reverse=True)
+
+            these_atom_lines = new[keys_[0]]
+            if len(keys_) == 1 and len(these_atom_lines) > 1:
+                # address "take first if occ is the same"
+                # see: https://github.com/haddocking/pdb-tools/issues/153#issuecomment-1488627668
+                lines_to_yield.extend(_remove_altloc(these_atom_lines[0:1]))
+
+                # if there's ANISOU, add it
+                if these_atom_lines[1][1].startswith(anisou_lines):
+                    lines_to_yield.extend(_remove_altloc(these_atom_lines[1:2]))
+
+            # this should run when there are more than one key or
+            # the key has only one atom line. Keys are the occ
+            # value.
+ else: + # when occs are different, select the highest one + lines_to_yield.extend(_remove_altloc(these_atom_lines)) + + del all_lines, new + + # lines are sorted to the line number so that the output is sorted + # the same way as in the input PDB + lines_to_yield.sort(key=lambda x: x[0]) + + # the line number is ignored, only the line is yield + for line_number, line in lines_to_yield: + yield line + + +def _remove_altloc(lines): + # the altloc ID is removed in processed altloc lines + for line_num, line in lines: + yield (line_num, line[:16] + " " + line[17:]) + + def pdb_prep(fhandle: TextIO) -> None: """Run all steps from pdb prep repo.""" # step 1 - keep coordinates: removes non coordinate lines for simplicity From 13c43bec498eb3387975ba1a8b4431bb6ba0ee0f Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 12 Mar 2024 18:41:59 +0100 Subject: [PATCH 5/7] start refactoring step 4: select one of alternate locations --- deeprank2/tools/pdbprep/__init__.py | 135 +++++++++------------------- 1 file changed, 41 insertions(+), 94 deletions(-) diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py index a96a3f9b1..acc671270 100644 --- a/deeprank2/tools/pdbprep/__init__.py +++ b/deeprank2/tools/pdbprep/__init__.py @@ -55,117 +55,64 @@ def _prune_records(fhandle: TextIO) -> Generator[str]: yield record[:17] + standardized_resname + record[20:] -def run(fhandle): - records = ("ATOM", "HETATM") - # terminators = ('TER', 'END', 'CONECT', 'END', 'ENDMDL', 'MODEL') - # meaningful = records + terminators - - # register atom information - register = dict() - - # register comment lines - others = [] - - # register current chain - chain = None - prev_chain = None - - # keep record of the line number. This will be used to sort lines - # after selecting the desired alternative location - nline = 0 - - # the loop will collect information on the different atoms - # throughout the PDB file until a new chain or any terminal line is - # found. At that point, the collected information is flushed because - # all altlocs for that block have been defined. - for line in fhandle: - nline += 1 - - if line.startswith(records): - # here resnum + insertion code are taken to identify - # different residues - resnum = line[22:27] - atomname = line[12:16] - altloc = line[16] - chain = line[21:22] - - # flush lines because we enter a new chain - if chain != prev_chain: - # the "yield from" statement is avoided to keep - # compatibility with Python 2.7 - for _line in _flush(register): - yield _line - - # Python 2.7 compatibility. Do not use .clear() method - # restart help variables - del register, others - register, others = dict(), [] - - # organizes information hierarchically - resnum_d = register.setdefault(resnum, {}) - atomname_d = resnum_d.setdefault(atomname, {}) - altloc_d = atomname_d.setdefault(altloc, []) - - # adds info to dictionary - altloc_d.append((nline, line)) - - # flush information because we reached the end of a block - # elif line.startswith(terminators): - # for _line in _flush(register): - # yield _line - - # del register, others - # register, others = dict(), [] - - # yield line # yield the current line after flush +def _select_alt_location(pdb: list[str]) -> Generator[str]: + """Select alternate location.""" + register = {} # register atom information + prev_chain = None # register previous chain + + # This loop will collect information on the different atoms throughout the PDB file until a new chain or any terminal line is + # found. 
At that point, the collected information is processed because all altlocs for that block have been defined. + for nline, record in enumerate(pdb): # line number will be used to sort lines after selecting the desired alternative location + atomname = record[12:16] + altloc = record[16] + chain = record[21:22] + resnum = record[22:27] # resnum (22-25) + insertion code (26) is taken to identify different residues + + # process lines because we enter a new chain + if chain != prev_chain: + yield from _process_altloc(register) + register = {} + + # add info to dictionary in a hierarchically organized manner + resnum_d: dict = register.setdefault(resnum, {}) + atomname_d: dict = resnum_d.setdefault(atomname, {}) + altloc_d: list = atomname_d.setdefault(altloc, []) + altloc_d.append((nline, record)) prev_chain = chain - # at the end of the PDB, flush the remaining lines - for _line in _flush(register): - yield _line + # at the end of the PDB, process the remaining lines + yield from _process_altloc(register) -def _flush(register): +def _process_altloc(register: dict[str, dict[str, dict[str, list[tuple[int, str]]]]]) -> Generator[str]: + # TODO: Reduce complexity of `register` if possible """Processes the collected atoms according to the selaltloc option.""" lines_to_yield = [] - atom_lines = ("ATOM", "HETATM") + anisou_record = ("ANISOU",) # anisou lines are treated specially and always follow atom records - # anisou lines are treated specially - anisou_lines = ("ANISOU",) + for atomnames in register.values(): + for altlocs in atomnames.values(): + all_lines: list[tuple[int, str]] = list(*altlocs.values()) # all alternative locations for the atom - for resnum, atomnames in register.items(): - for atomname, altlocs in atomnames.items(): - # gathers all alternative locations for the atom - all_lines = [] - for altloc, lines in altlocs.items(): - all_lines.extend(lines) - - # identifies the highest occupancy combining dictionary - # and sorting - new = {} + # identify the highest occupancy combining dictionary and sorting + occ_line_dict = {} # TODO: rename for line_number, line in all_lines: - if line.startswith(atom_lines): - occupancy_number = line[54:60] - list_ = new.setdefault(occupancy_number, []) - list_.append((line_number, line)) - - # assumes ANISOU succeed the respective ATOM line - elif line.startswith(anisou_lines): - list_.append((line_number, line)) + occupancy = line[54:60] + occ_line_dict[occupancy] = [(line_number, line)] # sort keys by occupancy - keys_ = sorted(new.keys(), key=lambda x: float(x.strip()), reverse=True) + keys_ = sorted(occ_line_dict.keys(), key=lambda x: float(x.strip()), reverse=True) # TODO: rename once I know what this is used for - these_atom_lines = new[keys_[0]] + these_atom_lines = occ_line_dict[keys_[0]] if len(keys_) == 1 and len(these_atom_lines) > 1: # address "take first if occ is the same" # see: https://github.com/haddocking/pdb-tools/issues/153#issuecomment-1488627668 lines_to_yield.extend(_remove_altloc(these_atom_lines[0:1])) # if there's ANISOU, add it - if these_atom_lines[1][1].startswith(anisou_lines): + if these_atom_lines[1][1].startswith(anisou_record): lines_to_yield.extend(_remove_altloc(these_atom_lines[1:2])) # this should run when there are more than one key or @@ -175,18 +122,18 @@ def _flush(register): # when occs are different, select the highest one lines_to_yield.extend(_remove_altloc(these_atom_lines)) - del all_lines, new + del all_lines, occ_line_dict # lines are sorted to the line number so that the output is sorted # the 
same way as in the input PDB lines_to_yield.sort(key=lambda x: x[0]) # the line number is ignored, only the line is yield - for line_number, line in lines_to_yield: + for _, line in lines_to_yield: yield line -def _remove_altloc(lines): +def _remove_altloc(lines: str) -> Generator[str]: # the altloc ID is removed in processed altloc lines for line_num, line in lines: yield (line_num, line[:16] + " " + line[17:]) From 0ad1c9cd3ca1a5015e7a4b02c732a4a53b56f071 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 12 Mar 2024 21:20:15 +0100 Subject: [PATCH 6/7] identify low occupancy records --- deeprank2/tools/pdbprep/__init__.py | 121 ++++++++++------------------ 1 file changed, 44 insertions(+), 77 deletions(-) diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py index acc671270..5f0d2fe32 100644 --- a/deeprank2/tools/pdbprep/__init__.py +++ b/deeprank2/tools/pdbprep/__init__.py @@ -2,9 +2,17 @@ # which is published under an Apache 2.0 licence import sys +from collections import defaultdict from collections.abc import Generator from typing import TextIO +# define record columns for each datum +_ATOMNAME_COLS = slice(12, 16) +_RESNAME_COLS = slice(17, 20) +_CHAIN_COLS = slice(21, 22) +_RESNUM_COLS = slice(22, 27) # this includes both the residue number and insertion code +_OCCUPANCY_COLS = slice(54, 60) + def write_pdb(new_pdb: list, pdbfh: TextIO) -> None: """Writes new pdb files.""" @@ -49,94 +57,53 @@ def _prune_records(fhandle: TextIO) -> Generator[str]: } for record in fhandle: - resname = record[17:20] + resname = record[_RESNAME_COLS] if record.startswith(atomic_record) and resname != water: standardized_resname = standard_resnames.get(resname, resname) yield record[:17] + standardized_resname + record[20:] -def _select_alt_location(pdb: list[str]) -> Generator[str]: - """Select alternate location.""" - register = {} # register atom information - prev_chain = None # register previous chain - - # This loop will collect information on the different atoms throughout the PDB file until a new chain or any terminal line is - # found. At that point, the collected information is processed because all altlocs for that block have been defined. - for nline, record in enumerate(pdb): # line number will be used to sort lines after selecting the desired alternative location - atomname = record[12:16] - altloc = record[16] - chain = record[21:22] - resnum = record[22:27] # resnum (22-25) + insertion code (26) is taken to identify different residues - - # process lines because we enter a new chain - if chain != prev_chain: - yield from _process_altloc(register) - register = {} - - # add info to dictionary in a hierarchically organized manner - resnum_d: dict = register.setdefault(resnum, {}) - atomname_d: dict = resnum_d.setdefault(atomname, {}) - altloc_d: list = atomname_d.setdefault(altloc, []) - altloc_d.append((nline, record)) - - prev_chain = chain - - # at the end of the PDB, process the remaining lines - yield from _process_altloc(register) - +def _find_low_occ_records(pdb: list[str]) -> list[int]: + """Helper function to identify records with lowest occupancy alternate locations. -def _process_altloc(register: dict[str, dict[str, dict[str, list[tuple[int, str]]]]]) -> Generator[str]: - # TODO: Reduce complexity of `register` if possible - """Processes the collected atoms according to the selaltloc option.""" - lines_to_yield = [] + In case an atom is detected at more than one position (e.g. 
due to alternate conformations), the structure will + contain the same atom multiple times with separate "alternate location indicators" (col 17 of the pdb record). + Each location will have a certain occupancy, i.e. proportion of structures where this particular location is found + (and thus all occupancies for a given atom sum to 1). - anisou_record = ("ANISOU",) # anisou lines are treated specially and always follow atom records + This function first identifies atoms that are listed more than once in a pdb file, based on their chain identifier + (col 22), residue sequence number (col 23-26), and atom name (col 13-16). It then identifies the record with the + highest occupancy for each atom (in case of equal occupancy, the first entry is considered higher). From this, a + list of indices is returned representing the records that do not contain the highest occupancy for the atom in that + record. - for atomnames in register.values(): - for altlocs in atomnames.values(): - all_lines: list[tuple[int, str]] = list(*altlocs.values()) # all alternative locations for the atom + Args: + pdb: list of records (lines) from a pdb file - # identify the highest occupancy combining dictionary and sorting - occ_line_dict = {} # TODO: rename - for line_number, line in all_lines: - occupancy = line[54:60] - occ_line_dict[occupancy] = [(line_number, line)] - - # sort keys by occupancy - keys_ = sorted(occ_line_dict.keys(), key=lambda x: float(x.strip()), reverse=True) # TODO: rename once I know what this is used for - - these_atom_lines = occ_line_dict[keys_[0]] - if len(keys_) == 1 and len(these_atom_lines) > 1: - # address "take first if occ is the same" - # see: https://github.com/haddocking/pdb-tools/issues/153#issuecomment-1488627668 - lines_to_yield.extend(_remove_altloc(these_atom_lines[0:1])) - - # if there's ANISOU, add it - if these_atom_lines[1][1].startswith(anisou_record): - lines_to_yield.extend(_remove_altloc(these_atom_lines[1:2])) - - # this should run when there are more than one key or - # the key has only one atom line. Keys are the occ - # value. 
- else: - # when occs are different, select the highest one - lines_to_yield.extend(_remove_altloc(these_atom_lines)) - - del all_lines, occ_line_dict - - # lines are sorted to the line number so that the output is sorted - # the same way as in the input PDB - lines_to_yield.sort(key=lambda x: x[0]) + Returns: + list of indices of records that do not contain the highest occupancy location + """ + # define record columns for each datum - # the line number is ignored, only the line is yield - for _, line in lines_to_yield: - yield line + atom_indentiers = [record[_CHAIN_COLS] + record[_RESNUM_COLS] + record[_ATOMNAME_COLS] for record in pdb] + # create a dictionary containing only duplicated atom_indentiers (keys) and their indices in pdb (values) + # from: https://stackoverflow.com/a/11236042/5170442 + duplicates = defaultdict(list) + for i, atom in enumerate(atom_indentiers): + duplicates[atom].append(i) + duplicates = {k: v for k, v in duplicates.items() if len(v) > 1} -def _remove_altloc(lines: str) -> Generator[str]: - # the altloc ID is removed in processed altloc lines - for line_num, line in lines: - yield (line_num, line[:16] + " " + line[17:]) + highest_occupancies = {} + for atom, record_indices in duplicates.items(): + highest_occ = 0 + for i in record_indices: + occupancy = pdb[i][_OCCUPANCY_COLS] + if occupancy > highest_occ: + # only keep the record with the highest occupancy; in case of tie keep the first + highest_occ = occupancy + highest_occupancies[atom] = i + return [x for xs in duplicates.values() for x in xs if x not in highest_occupancies.values()] def pdb_prep(fhandle: TextIO) -> None: @@ -144,7 +111,7 @@ def pdb_prep(fhandle: TextIO) -> None: # step 1 - keep coordinates: removes non coordinate lines for simplicity # step 2 - delresname: remove waters # step 3 - rplresname: convert residue names to standard names, ex: MSE to MET - new_pdb = _prune_records(fhandle) + _new_pdb = _prune_records(fhandle) # step 4 - selaltloc: select most probable alternative location From 65f85d27236d09ccd56ed3764f67288c01db38d9 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 12 Mar 2024 21:45:43 +0100 Subject: [PATCH 7/7] use `_find_low_occ_records` to limit yield --- deeprank2/tools/pdbprep/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py index 5f0d2fe32..fae37b524 100644 --- a/deeprank2/tools/pdbprep/__init__.py +++ b/deeprank2/tools/pdbprep/__init__.py @@ -56,11 +56,14 @@ def _prune_records(fhandle: TextIO) -> Generator[str]: "HSD": "HIS", } - for record in fhandle: + for i, record in enumerate(fhandle): resname = record[_RESNAME_COLS] - if record.startswith(atomic_record) and resname != water: + if record.startswith(atomic_record) and resname != water and i not in _find_low_occ_records(fhandle): + # TODO: if within a single file mixed residue nomenclature is used, it is not detected by _find_low_occ_records + # probably fix this by running these in separate functions rather than all at once. 
             standardized_resname = standard_resnames.get(resname, resname)
-            yield record[:17] + standardized_resname + record[20:]
+            record = record[: _RESNAME_COLS.start] + standardized_resname + record[_RESNAME_COLS.stop :]  # noqa: PLW2901
+            yield record
 
 
 def _find_low_occ_records(pdb: list[str]) -> list[int]:
@@ -111,9 +114,8 @@ def pdb_prep(fhandle: TextIO) -> None:
     # step 1 - keep coordinates: removes non coordinate lines for simplicity
     # step 2 - delresname: remove waters
     # step 3 - rplresname: convert residue names to standard names, ex: MSE to MET
-    _new_pdb = _prune_records(fhandle)
-
     # step 4 - selaltloc: select most probable alternative location
+    _new_pdb = _prune_records(fhandle)
 
     # step 5 - fixinsert: fix inserts
     # step 6 - sort: sort chains and residues, necessary for OpenMM
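
Taken together, these patches are meant to be chained into a single PDB preparation pipeline. The sketch below shows one way the pruning step (patches 1-2) could be driven end to end. It is an illustrative sketch only, not part of the patch series: the command-line handling is assumed, the private helper `_prune_records` is called directly for demonstration, and the later steps (4-9) are still stubs inside `pdb_prep` at this point in the series.

    # Illustrative usage sketch (not part of the patches).
    # Assumes deeprank2 with the pdbprep subpackage is importable; the CLI handling is hypothetical.
    import sys

    from deeprank2.tools.pdbprep import _prune_records, write_pdb

    if __name__ == "__main__":
        pdb_path = sys.argv[1]  # hypothetical: path to the input PDB file
        with open(pdb_path) as pdbfh:
            # steps 1-3: keep only ATOM/HETATM records, drop waters, standardize residue names
            pruned = list(_prune_records(pdbfh))
            # buffered write of the pruned records to stdout; also closes the handle and exits
            write_pdb(pruned, pdbfh)

Note that `write_pdb`, as adapted here, writes to sys.stdout and terminates the process, mirroring the command-line style of the upstream scripts these helpers were adapted from.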