From 48c6a47ff20714aa2b0eb148f36ea0632458f36a Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 12:45:56 +0100
Subject: [PATCH 1/7] main function that writes new pdb files

---
 deeprank2/tools/pdbprep/__init__.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 deeprank2/tools/pdbprep/__init__.py

diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py
new file mode 100644
index 000000000..e5d63ef94
--- /dev/null
+++ b/deeprank2/tools/pdbprep/__init__.py
@@ -0,0 +1,28 @@
+# All code in this subpackage has been adapted from https://github.com/DeepRank/pdbprep,
+# which is published under an Apache 2.0 licence
+
+import sys
+
+
+def write_pdb(new_pdb, pdbfh):
+    try:
+        _buffer = []
+        _buffer_size = 5000  # write N lines at a time
+        for lineno, line in enumerate(new_pdb):
+            if not (lineno % _buffer_size):
+                sys.stdout.write("".join(_buffer))
+                _buffer = []
+            _buffer.append(line)
+
+        sys.stdout.write("".join(_buffer))
+        sys.stdout.flush()
+    except OSError:
+        # This is here to catch Broken Pipes
+        # for example to use 'head' or 'tail' without
+        # the error message showing up
+        pass
+
+    # last line of the script
+    # We can close it even if it is sys.stdin
+    pdbfh.close()
+    sys.exit(0)

From 78e07af5be7b299c0702229d29a41f1ac64736a8 Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 14:03:29 +0100
Subject: [PATCH 2/7] add pruning pdb file (steps 1-3 from pdbprep)

---
 deeprank2/tools/pdbprep/__init__.py | 45 ++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py
index e5d63ef94..c67fcfde1 100644
--- a/deeprank2/tools/pdbprep/__init__.py
+++ b/deeprank2/tools/pdbprep/__init__.py
@@ -2,9 +2,12 @@
 # which is published under an Apache 2.0 licence
 
 import sys
+from collections.abc import Generator
+from typing import TextIO
 
 
-def write_pdb(new_pdb, pdbfh):
+def write_pdb(new_pdb: list, pdbfh: TextIO) -> None:
+    """Writes new pdb files."""
     try:
         _buffer = []
         _buffer_size = 5000  # write N lines at a time
@@ -26,3 +29,43 @@ def write_pdb(new_pdb, pdbfh):
     # We can close it even if it is sys.stdin
     pdbfh.close()
     sys.exit(0)
+
+
+def _prune_records(fhandle: TextIO) -> Generator[str]:
+    """Prune records before processing.
+
+    Scraps non-atomic records and records from water molecules.
+    Replaces non-standard residue names by their standard counterparts.
+    """
+    atomic_record = ("ATOM", "HETATM")  # TODO: check if we need to keep ANISOU and TER records as well?
+    water = "HOH"
+    standard_resnames = {
+        "MSE": "MET",
+        "HIP": "HIS",
+        "HIE": "HIS",
+        "HID": "HIS",
+        "HSE": "HIS",
+        "HSD": "HIS",
+    }
+
+    for record in fhandle:
+        resname = record[17:20]
+        if record.startswith(atomic_record) and resname != water:
+            standardized_resname = standard_resnames.get(resname, resname)
+            yield record[:17] + standardized_resname + record[20:]
+
+
+def pdb_prep(fhandle: TextIO) -> None:
+    """Run all steps from pdb prep repo."""
+    # step 1 - keep coordinates: removes non coordinate lines for simplicity
+    # step 2 - delresname: remove waters
+    # step 3 - rplresname: convert residue names to standard names, ex: MSE to MET
+    new_pdb = _prune_records(fhandle)
+
+    # step 4 - selaltloc: select most probable alternative location
+
+    # step 5 - fixinsert: fix inserts
+    # step 6 - sort: sort chains and residues, necessary for OpenMM
+    # step 7 - reres: renumber residues from 1
+    # step 8 - reatom: renumber atoms from 1
+    # step 9 - tidy: tidy cleans the PDB, adds TER, etc.

From 845a892723fed6fead47a11dbaf9ddd83c7da7be Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 16:33:19 +0100
Subject: [PATCH 3/7] style: ruff settings regarding TODOs

---
 pyproject.toml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7978fdbfa..86793f8c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,7 @@ ignore = [
     "PLR0913", # Too many arguments in function definition
     "D102", # Missing docstring in public method
     # Unwanted
-    "FBT", # Using boolean arguments
+    "FBT", # Disallow using booleans as function arguments
     "ANN101", # Missing type annotation for `self` in method
     "ANN102", # Missing type annotation for `cls` in classmethod
     "ANN204", # Missing return type annotation for special (dunder) method
@@ -87,6 +87,10 @@ ignore = [
     "S311", # insecure random generators
     "PT011", # pytest-raises-too-broad
     "SIM108", # Use ternary operator
+    # TODO formatting
+    "TD002", # Missing TODO author
+    "TD003", # Missing TODO link
+    "FIX002", # Consider resolving the issue instead
     # Unwanted docstrings
     "D100", # Missing module docstring
     "D104", # Missing public package docstring

From 8c43bbfd1e414ec920874571a38422e5c040a52a Mon Sep 17 00:00:00 2001
From: Dani Bodor
Date: Tue, 12 Mar 2024 18:41:30 +0100
Subject: [PATCH 4/7] copy step 4 from pdbprep (selaltloc)

---
 deeprank2/tools/pdbprep/__init__.py | 137 ++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)

diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py
index c67fcfde1..a96a3f9b1 100644
--- a/deeprank2/tools/pdbprep/__init__.py
+++ b/deeprank2/tools/pdbprep/__init__.py
@@ -55,6 +55,143 @@ def _prune_records(fhandle: TextIO) -> Generator[str]:
             yield record[:17] + standardized_resname + record[20:]
 
 
+def run(fhandle):
+    records = ("ATOM", "HETATM")
+    # terminators = ('TER', 'END', 'CONECT', 'END', 'ENDMDL', 'MODEL')
+    # meaningful = records + terminators
+
+    # register atom information
+    register = dict()
+
+    # register comment lines
+    others = []
+
+    # register current chain
+    chain = None
+    prev_chain = None
+
+    # keep record of the line number. This will be used to sort lines
+    # after selecting the desired alternative location
+    nline = 0
+
+    # the loop will collect information on the different atoms
+    # throughout the PDB file until a new chain or any terminal line is
+    # found. At that point, the collected information is flushed because
+    # all altlocs for that block have been defined.
+    for line in fhandle:
+        nline += 1
+
+        if line.startswith(records):
+            # here resnum + insertion code are taken to identify
+            # different residues
+            resnum = line[22:27]
+            atomname = line[12:16]
+            altloc = line[16]
+            chain = line[21:22]
+
+            # flush lines because we enter a new chain
+            if chain != prev_chain:
+                # the "yield from" statement is avoided to keep
+                # compatibility with Python 2.7
+                for _line in _flush(register):
+                    yield _line
+
+                # Python 2.7 compatibility. Do not use .clear() method
+                # restart help variables
+                del register, others
+                register, others = dict(), []
+
+            # organizes information hierarchically
+            resnum_d = register.setdefault(resnum, {})
+            atomname_d = resnum_d.setdefault(atomname, {})
+            altloc_d = atomname_d.setdefault(altloc, [])
+
+            # adds info to dictionary
+            altloc_d.append((nline, line))
+
+        # flush information because we reached the end of a block
+        # elif line.startswith(terminators):
+        #     for _line in _flush(register):
+        #         yield _line
+
+        #     del register, others
+        #     register, others = dict(), []
+
+        #     yield line  # yield the current line after flush
+
+        prev_chain = chain
+
+    # at the end of the PDB, flush the remaining lines
+    for _line in _flush(register):
+        yield _line
+
+
+def _flush(register):
+    """Processes the collected atoms according to the selaltloc option."""
+    lines_to_yield = []
+
+    atom_lines = ("ATOM", "HETATM")
+
+    # anisou lines are treated specially
+    anisou_lines = ("ANISOU",)
+
+    for resnum, atomnames in register.items():
+        for atomname, altlocs in atomnames.items():
+            # gathers all alternative locations for the atom
+            all_lines = []
+            for altloc, lines in altlocs.items():
+                all_lines.extend(lines)
+
+            # identifies the highest occupancy combining dictionary
+            # and sorting
+            new = {}
+            for line_number, line in all_lines:
+                if line.startswith(atom_lines):
+                    occupancy_number = line[54:60]
+                    list_ = new.setdefault(occupancy_number, [])
+                    list_.append((line_number, line))
+
+                # assumes ANISOU succeed the respective ATOM line
+                elif line.startswith(anisou_lines):
+                    list_.append((line_number, line))
+
+            # sort keys by occupancy
+            keys_ = sorted(new.keys(), key=lambda x: float(x.strip()), reverse=True)
+
+            these_atom_lines = new[keys_[0]]
+            if len(keys_) == 1 and len(these_atom_lines) > 1:
+                # address "take first if occ is the same"
+                # see: https://github.com/haddocking/pdb-tools/issues/153#issuecomment-1488627668
+                lines_to_yield.extend(_remove_altloc(these_atom_lines[0:1]))
+
+                # if there's ANISOU, add it
+                if these_atom_lines[1][1].startswith(anisou_lines):
+                    lines_to_yield.extend(_remove_altloc(these_atom_lines[1:2]))
+
+            # this should run when there are more than one key or
+            # the key has only one atom line. Keys are the occ
+            # value.
+ else: + # when occs are different, select the highest one + lines_to_yield.extend(_remove_altloc(these_atom_lines)) + + del all_lines, new + + # lines are sorted to the line number so that the output is sorted + # the same way as in the input PDB + lines_to_yield.sort(key=lambda x: x[0]) + + # the line number is ignored, only the line is yield + for line_number, line in lines_to_yield: + yield line + + +def _remove_altloc(lines): + # the altloc ID is removed in processed altloc lines + for line_num, line in lines: + yield (line_num, line[:16] + " " + line[17:]) + + def pdb_prep(fhandle: TextIO) -> None: """Run all steps from pdb prep repo.""" # step 1 - keep coordinates: removes non coordinate lines for simplicity From 13c43bec498eb3387975ba1a8b4431bb6ba0ee0f Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 12 Mar 2024 18:41:59 +0100 Subject: [PATCH 5/7] start refactoring step 4: select one of alternate locations --- deeprank2/tools/pdbprep/__init__.py | 135 +++++++++------------------- 1 file changed, 41 insertions(+), 94 deletions(-) diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py index a96a3f9b1..acc671270 100644 --- a/deeprank2/tools/pdbprep/__init__.py +++ b/deeprank2/tools/pdbprep/__init__.py @@ -55,117 +55,64 @@ def _prune_records(fhandle: TextIO) -> Generator[str]: yield record[:17] + standardized_resname + record[20:] -def run(fhandle): - records = ("ATOM", "HETATM") - # terminators = ('TER', 'END', 'CONECT', 'END', 'ENDMDL', 'MODEL') - # meaningful = records + terminators - - # register atom information - register = dict() - - # register comment lines - others = [] - - # register current chain - chain = None - prev_chain = None - - # keep record of the line number. This will be used to sort lines - # after selecting the desired alternative location - nline = 0 - - # the loop will collect information on the different atoms - # throughout the PDB file until a new chain or any terminal line is - # found. At that point, the collected information is flushed because - # all altlocs for that block have been defined. - for line in fhandle: - nline += 1 - - if line.startswith(records): - # here resnum + insertion code are taken to identify - # different residues - resnum = line[22:27] - atomname = line[12:16] - altloc = line[16] - chain = line[21:22] - - # flush lines because we enter a new chain - if chain != prev_chain: - # the "yield from" statement is avoided to keep - # compatibility with Python 2.7 - for _line in _flush(register): - yield _line - - # Python 2.7 compatibility. Do not use .clear() method - # restart help variables - del register, others - register, others = dict(), [] - - # organizes information hierarchically - resnum_d = register.setdefault(resnum, {}) - atomname_d = resnum_d.setdefault(atomname, {}) - altloc_d = atomname_d.setdefault(altloc, []) - - # adds info to dictionary - altloc_d.append((nline, line)) - - # flush information because we reached the end of a block - # elif line.startswith(terminators): - # for _line in _flush(register): - # yield _line - - # del register, others - # register, others = dict(), [] - - # yield line # yield the current line after flush +def _select_alt_location(pdb: list[str]) -> Generator[str]: + """Select alternate location.""" + register = {} # register atom information + prev_chain = None # register previous chain + + # This loop will collect information on the different atoms throughout the PDB file until a new chain or any terminal line is + # found. 
At that point, the collected information is processed because all altlocs for that block have been defined. + for nline, record in enumerate(pdb): # line number will be used to sort lines after selecting the desired alternative location + atomname = record[12:16] + altloc = record[16] + chain = record[21:22] + resnum = record[22:27] # resnum (22-25) + insertion code (26) is taken to identify different residues + + # process lines because we enter a new chain + if chain != prev_chain: + yield from _process_altloc(register) + register = {} + + # add info to dictionary in a hierarchically organized manner + resnum_d: dict = register.setdefault(resnum, {}) + atomname_d: dict = resnum_d.setdefault(atomname, {}) + altloc_d: list = atomname_d.setdefault(altloc, []) + altloc_d.append((nline, record)) prev_chain = chain - # at the end of the PDB, flush the remaining lines - for _line in _flush(register): - yield _line + # at the end of the PDB, process the remaining lines + yield from _process_altloc(register) -def _flush(register): +def _process_altloc(register: dict[str, dict[str, dict[str, list[tuple[int, str]]]]]) -> Generator[str]: + # TODO: Reduce complexity of `register` if possible """Processes the collected atoms according to the selaltloc option.""" lines_to_yield = [] - atom_lines = ("ATOM", "HETATM") + anisou_record = ("ANISOU",) # anisou lines are treated specially and always follow atom records - # anisou lines are treated specially - anisou_lines = ("ANISOU",) + for atomnames in register.values(): + for altlocs in atomnames.values(): + all_lines: list[tuple[int, str]] = list(*altlocs.values()) # all alternative locations for the atom - for resnum, atomnames in register.items(): - for atomname, altlocs in atomnames.items(): - # gathers all alternative locations for the atom - all_lines = [] - for altloc, lines in altlocs.items(): - all_lines.extend(lines) - - # identifies the highest occupancy combining dictionary - # and sorting - new = {} + # identify the highest occupancy combining dictionary and sorting + occ_line_dict = {} # TODO: rename for line_number, line in all_lines: - if line.startswith(atom_lines): - occupancy_number = line[54:60] - list_ = new.setdefault(occupancy_number, []) - list_.append((line_number, line)) - - # assumes ANISOU succeed the respective ATOM line - elif line.startswith(anisou_lines): - list_.append((line_number, line)) + occupancy = line[54:60] + occ_line_dict[occupancy] = [(line_number, line)] # sort keys by occupancy - keys_ = sorted(new.keys(), key=lambda x: float(x.strip()), reverse=True) + keys_ = sorted(occ_line_dict.keys(), key=lambda x: float(x.strip()), reverse=True) # TODO: rename once I know what this is used for - these_atom_lines = new[keys_[0]] + these_atom_lines = occ_line_dict[keys_[0]] if len(keys_) == 1 and len(these_atom_lines) > 1: # address "take first if occ is the same" # see: https://github.com/haddocking/pdb-tools/issues/153#issuecomment-1488627668 lines_to_yield.extend(_remove_altloc(these_atom_lines[0:1])) # if there's ANISOU, add it - if these_atom_lines[1][1].startswith(anisou_lines): + if these_atom_lines[1][1].startswith(anisou_record): lines_to_yield.extend(_remove_altloc(these_atom_lines[1:2])) # this should run when there are more than one key or @@ -175,18 +122,18 @@ def _flush(register): # when occs are different, select the highest one lines_to_yield.extend(_remove_altloc(these_atom_lines)) - del all_lines, new + del all_lines, occ_line_dict # lines are sorted to the line number so that the output is sorted # the 
same way as in the input PDB lines_to_yield.sort(key=lambda x: x[0]) # the line number is ignored, only the line is yield - for line_number, line in lines_to_yield: + for _, line in lines_to_yield: yield line -def _remove_altloc(lines): +def _remove_altloc(lines: str) -> Generator[str]: # the altloc ID is removed in processed altloc lines for line_num, line in lines: yield (line_num, line[:16] + " " + line[17:]) From 0ad1c9cd3ca1a5015e7a4b02c732a4a53b56f071 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 12 Mar 2024 21:20:15 +0100 Subject: [PATCH 6/7] identify low occupancy records --- deeprank2/tools/pdbprep/__init__.py | 121 ++++++++++------------------ 1 file changed, 44 insertions(+), 77 deletions(-) diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py index acc671270..5f0d2fe32 100644 --- a/deeprank2/tools/pdbprep/__init__.py +++ b/deeprank2/tools/pdbprep/__init__.py @@ -2,9 +2,17 @@ # which is published under an Apache 2.0 licence import sys +from collections import defaultdict from collections.abc import Generator from typing import TextIO +# define record columns for each datum +_ATOMNAME_COLS = slice(12, 16) +_RESNAME_COLS = slice(17, 20) +_CHAIN_COLS = slice(21, 22) +_RESNUM_COLS = slice(22, 27) # this includes both the residue number and insertion code +_OCCUPANCY_COLS = slice(54, 60) + def write_pdb(new_pdb: list, pdbfh: TextIO) -> None: """Writes new pdb files.""" @@ -49,94 +57,53 @@ def _prune_records(fhandle: TextIO) -> Generator[str]: } for record in fhandle: - resname = record[17:20] + resname = record[_RESNAME_COLS] if record.startswith(atomic_record) and resname != water: standardized_resname = standard_resnames.get(resname, resname) yield record[:17] + standardized_resname + record[20:] -def _select_alt_location(pdb: list[str]) -> Generator[str]: - """Select alternate location.""" - register = {} # register atom information - prev_chain = None # register previous chain - - # This loop will collect information on the different atoms throughout the PDB file until a new chain or any terminal line is - # found. At that point, the collected information is processed because all altlocs for that block have been defined. - for nline, record in enumerate(pdb): # line number will be used to sort lines after selecting the desired alternative location - atomname = record[12:16] - altloc = record[16] - chain = record[21:22] - resnum = record[22:27] # resnum (22-25) + insertion code (26) is taken to identify different residues - - # process lines because we enter a new chain - if chain != prev_chain: - yield from _process_altloc(register) - register = {} - - # add info to dictionary in a hierarchically organized manner - resnum_d: dict = register.setdefault(resnum, {}) - atomname_d: dict = resnum_d.setdefault(atomname, {}) - altloc_d: list = atomname_d.setdefault(altloc, []) - altloc_d.append((nline, record)) - - prev_chain = chain - - # at the end of the PDB, process the remaining lines - yield from _process_altloc(register) - +def _find_low_occ_records(pdb: list[str]) -> list[int]: + """Helper function to identify records with lowest occupancy alternate locations. -def _process_altloc(register: dict[str, dict[str, dict[str, list[tuple[int, str]]]]]) -> Generator[str]: - # TODO: Reduce complexity of `register` if possible - """Processes the collected atoms according to the selaltloc option.""" - lines_to_yield = [] + In case an atom is detected at more than one position (e.g. 
due to alternate conformations), the structure will + contain the same atom multiple times with separate "alternate location indicators" (col 17 of the pdb record). + Each location will have a certain occupancy, i.e. proportion of structures where this particular location is found + (and thus all occupancies for a given atom sum to 1). - anisou_record = ("ANISOU",) # anisou lines are treated specially and always follow atom records + This function first identifies atoms that are listed more than once in a pdb file, based on their chain identifier + (col 22), residue sequence number (col 23-26), and atom name (col 13-16). It then identifies the record with the + highest occupancy for each atom (in case of equal occupancy, the first entry is considered higher). From this, a + list of indices is returned representing the records that do not contain the highest occupancy for the atom in that + record. - for atomnames in register.values(): - for altlocs in atomnames.values(): - all_lines: list[tuple[int, str]] = list(*altlocs.values()) # all alternative locations for the atom + Args: + pdb: list of records (lines) from a pdb file - # identify the highest occupancy combining dictionary and sorting - occ_line_dict = {} # TODO: rename - for line_number, line in all_lines: - occupancy = line[54:60] - occ_line_dict[occupancy] = [(line_number, line)] - - # sort keys by occupancy - keys_ = sorted(occ_line_dict.keys(), key=lambda x: float(x.strip()), reverse=True) # TODO: rename once I know what this is used for - - these_atom_lines = occ_line_dict[keys_[0]] - if len(keys_) == 1 and len(these_atom_lines) > 1: - # address "take first if occ is the same" - # see: https://github.com/haddocking/pdb-tools/issues/153#issuecomment-1488627668 - lines_to_yield.extend(_remove_altloc(these_atom_lines[0:1])) - - # if there's ANISOU, add it - if these_atom_lines[1][1].startswith(anisou_record): - lines_to_yield.extend(_remove_altloc(these_atom_lines[1:2])) - - # this should run when there are more than one key or - # the key has only one atom line. Keys are the occ - # value. 
- else: - # when occs are different, select the highest one - lines_to_yield.extend(_remove_altloc(these_atom_lines)) - - del all_lines, occ_line_dict - - # lines are sorted to the line number so that the output is sorted - # the same way as in the input PDB - lines_to_yield.sort(key=lambda x: x[0]) + Returns: + list of indices of records that do not contain the highest occupancy location + """ + # define record columns for each datum - # the line number is ignored, only the line is yield - for _, line in lines_to_yield: - yield line + atom_indentiers = [record[_CHAIN_COLS] + record[_RESNUM_COLS] + record[_ATOMNAME_COLS] for record in pdb] + # create a dictionary containing only duplicated atom_indentiers (keys) and their indices in pdb (values) + # from: https://stackoverflow.com/a/11236042/5170442 + duplicates = defaultdict(list) + for i, atom in enumerate(atom_indentiers): + duplicates[atom].append(i) + duplicates = {k: v for k, v in duplicates.items() if len(v) > 1} -def _remove_altloc(lines: str) -> Generator[str]: - # the altloc ID is removed in processed altloc lines - for line_num, line in lines: - yield (line_num, line[:16] + " " + line[17:]) + highest_occupancies = {} + for atom, record_indices in duplicates.items(): + highest_occ = 0 + for i in record_indices: + occupancy = pdb[i][_OCCUPANCY_COLS] + if occupancy > highest_occ: + # only keep the record with the highest occupancy; in case of tie keep the first + highest_occ = occupancy + highest_occupancies[atom] = i + return [x for xs in duplicates.values() for x in xs if x not in highest_occupancies.values()] def pdb_prep(fhandle: TextIO) -> None: @@ -144,7 +111,7 @@ def pdb_prep(fhandle: TextIO) -> None: # step 1 - keep coordinates: removes non coordinate lines for simplicity # step 2 - delresname: remove waters # step 3 - rplresname: convert residue names to standard names, ex: MSE to MET - new_pdb = _prune_records(fhandle) + _new_pdb = _prune_records(fhandle) # step 4 - selaltloc: select most probable alternative location From 65f85d27236d09ccd56ed3764f67288c01db38d9 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 12 Mar 2024 21:45:43 +0100 Subject: [PATCH 7/7] use `_find_low_occ_records` to limit yield --- deeprank2/tools/pdbprep/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py index 5f0d2fe32..fae37b524 100644 --- a/deeprank2/tools/pdbprep/__init__.py +++ b/deeprank2/tools/pdbprep/__init__.py @@ -56,11 +56,14 @@ def _prune_records(fhandle: TextIO) -> Generator[str]: "HSD": "HIS", } - for record in fhandle: + for i, record in enumerate(fhandle): resname = record[_RESNAME_COLS] - if record.startswith(atomic_record) and resname != water: + if record.startswith(atomic_record) and resname != water and i not in _find_low_occ_records(fhandle): + # TODO: if within a single file mixed residue nomenclature is used, it is not detected by _find_low_occ_records + # probably fix this by running these in separate functions rather than all at once. 
             standardized_resname = standard_resnames.get(resname, resname)
-            yield record[:17] + standardized_resname + record[20:]
+            record = record[: _RESNAME_COLS.start] + standardized_resname + record[_RESNAME_COLS.stop :]  # noqa: PLW2901
+            yield record
 
 
 def _find_low_occ_records(pdb: list[str]) -> list[int]:
@@ -111,9 +114,8 @@ def pdb_prep(fhandle: TextIO) -> None:
     # step 1 - keep coordinates: removes non coordinate lines for simplicity
     # step 2 - delresname: remove waters
     # step 3 - rplresname: convert residue names to standard names, ex: MSE to MET
-    _new_pdb = _prune_records(fhandle)
-
     # step 4 - selaltloc: select most probable alternative location
+    _new_pdb = _prune_records(fhandle)
 
     # step 5 - fixinsert: fix inserts
     # step 6 - sort: sort chains and residues, necessary for OpenMM
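
Taken together, these patches are meant to be chained into a single PDB preparation pipeline. The sketch below shows one way the pruning step (patches 1-2) could be driven end to end. It is an illustrative sketch only, not part of the patch series: the command-line handling is assumed, the private helper `_prune_records` is called directly for demonstration, and the later steps (4-9) are still stubs inside `pdb_prep` at this point in the series.

    # Illustrative usage sketch (not part of the patches).
    # Assumes deeprank2 with the pdbprep subpackage is importable; the CLI handling is hypothetical.
    import sys

    from deeprank2.tools.pdbprep import _prune_records, write_pdb

    if __name__ == "__main__":
        pdb_path = sys.argv[1]  # hypothetical: path to the input PDB file
        with open(pdb_path) as pdbfh:
            # steps 1-3: keep only ATOM/HETATM records, drop waters, standardize residue names
            pruned = list(_prune_records(pdbfh))
            # buffered write of the pruned records to stdout; also closes the handle and exits
            write_pdb(pruned, pdbfh)

Note that `write_pdb`, as adapted here, writes to sys.stdout and terminates the process, mirroring the command-line style of the upstream scripts these helpers were adapted from.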