From 5cc2b93d3e41b57942e4184389e37c66d2a9cd53 Mon Sep 17 00:00:00 2001 From: Devang Thakkar Date: Sun, 14 May 2023 06:09:50 +0000 Subject: [PATCH 1/4] replace base split by regex split to fix ; inside quotes --- gffutils/parser.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gffutils/parser.py b/gffutils/parser.py index 0d82889..db8d963 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -216,7 +216,11 @@ def _unquote_quals(quals, dialect): if dialect["trailing semicolon"]: keyval_str = keyval_str.rstrip(";") - parts = keyval_str.split(dialect["field separator"]) + # adding regex to split by separator instead of base split + # adapted from https://stackoverflow.com/a/2787979/7182397 + + # parts = keyval_str.split(dialect["field separator"]) + parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) kvsep = dialect["keyval separator"] if dialect["leading semicolon"]: @@ -288,7 +292,11 @@ def _unquote_quals(quals, dialect): # GFF3 works with no spaces. # So split on the first one we can recognize... for sep in (" ; ", "; ", ";"): - parts = keyval_str.split(sep) + # adding regex to split by separator instead of base split + # adapted from https://stackoverflow.com/a/2787979/7182397 + + # parts = keyval_str.split(sep) + parts = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) if len(parts) > 1: dialect["field separator"] = sep break From 817bbe3a3526cc170fef8828c276f4fd10c15f4b Mon Sep 17 00:00:00 2001 From: Devang Thakkar Date: Mon, 10 Jul 2023 01:49:12 +0000 Subject: [PATCH 2/4] create a new dialect token to avoid slowing down everything --- gffutils/constants.py | 6 ++++++ gffutils/helpers.py | 2 +- gffutils/parser.py | 34 ++++++++++++++++++++++------------ 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/gffutils/constants.py b/gffutils/constants.py index 901e714..3cc2077 100644 --- a/gffutils/constants.py +++ b/gffutils/constants.py @@ -127,6 +127,12 @@ # vs # ID=001; Name=gene1 "field separator": ";", + # Sometimes there are semicolons inside quotes that break things, e.g., + # + # note "Evidence 1a: Function1, Function2" + # vs + # note "Evidence 1a: Function; PubMedId: 123, 456" + "semicolon_in_quotes": False, # Usually "=" for GFF3; " " for GTF, e.g., # # gene_id "GENE1" diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 0e2f430..b68414c 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -35,7 +35,7 @@ def infer_dialect(attributes): ------- Dictionary representing the inferred dialect """ - attributes, dialect = parser._split_keyvals(attributes) + attributes, dialect = parser._split_keyvals(attributes, infer_dialect_call=True) return dialect diff --git a/gffutils/parser.py b/gffutils/parser.py index db8d963..5dff08e 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -174,7 +174,7 @@ def sort_key(x): # TODO: # Cythonize -- profiling shows that the bulk of the time is spent on this # function... -def _split_keyvals(keyval_str, dialect=None): +def _split_keyvals(keyval_str, dialect=None, infer_dialect_call=False): """ Given the string attributes field of a GFF-like line, split it into an attributes dictionary and a "dialect" dictionary which contains information @@ -186,6 +186,11 @@ def _split_keyvals(keyval_str, dialect=None): attribute string. Otherwise, use the provided dialect (and return it at the end). + + The `infer_dialect_call` argument denotes whether the call to this function + has been made as part of the regular parsing or only to obtain the dialect + using helpers.infer_dialect(); this helps us to call the regex from PR #215 + only when absolutely required so as to avoid slowing down every other case. """ def _unquote_quals(quals, dialect): @@ -216,11 +221,10 @@ def _unquote_quals(quals, dialect): if dialect["trailing semicolon"]: keyval_str = keyval_str.rstrip(";") - # adding regex to split by separator instead of base split - # adapted from https://stackoverflow.com/a/2787979/7182397 - - # parts = keyval_str.split(dialect["field separator"]) - parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) + if dialect["semicolon_in_quotes"]: + parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) + else: + parts = keyval_str.split(dialect["field separator"]) kvsep = dialect["keyval separator"] if dialect["leading semicolon"]: @@ -292,13 +296,15 @@ def _unquote_quals(quals, dialect): # GFF3 works with no spaces. # So split on the first one we can recognize... for sep in (" ; ", "; ", ";"): - # adding regex to split by separator instead of base split - # adapted from https://stackoverflow.com/a/2787979/7182397 - - # parts = keyval_str.split(sep) - parts = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) + # We want to run regex only when calling helpers.infer_dialect() + parts = keyval_str.split(sep) + parts_regex = parts + if infer_dialect_call: + parts_regex = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str) if len(parts) > 1: dialect["field separator"] = sep + if parts != parts_regex: + dialect["semicolon_in_quotes"] = True break # Is it GFF3? They have key-vals separated by "=" @@ -354,7 +360,11 @@ def _unquote_quals(quals, dialect): # strings # quals[key].extend([v for v in val.split(',') if v]) - # See issue #198, where + # See issue #198, where commas within a description can incorrectly + # cause the dialect inference to conclude that there are not + # repeated keys. + # + # More description in PR #208. if dialect["repeated keys"]: quals[key].append(val) else: From 0eb9617fab0d92dddbcb5e6e274a8d53c1f3132e Mon Sep 17 00:00:00 2001 From: Devang Thakkar Date: Mon, 10 Jul 2023 01:53:20 +0000 Subject: [PATCH 3/4] undo code duplication --- gffutils/parser.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gffutils/parser.py b/gffutils/parser.py index 5dff08e..2332400 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -360,11 +360,7 @@ def _unquote_quals(quals, dialect): # strings # quals[key].extend([v for v in val.split(',') if v]) - # See issue #198, where commas within a description can incorrectly - # cause the dialect inference to conclude that there are not - # repeated keys. - # - # More description in PR #208. + # See issue #198, where if dialect["repeated keys"]: quals[key].append(val) else: From 8543575e03c93831203acac86988cc57565c6849 Mon Sep 17 00:00:00 2001 From: Devang Thakkar Date: Mon, 10 Jul 2023 01:54:19 +0000 Subject: [PATCH 4/4] undo code duplication --- gffutils/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffutils/parser.py b/gffutils/parser.py index 2332400..e64aa42 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -360,7 +360,7 @@ def _unquote_quals(quals, dialect): # strings # quals[key].extend([v for v in val.split(',') if v]) - # See issue #198, where + # See issue #198, where if dialect["repeated keys"]: quals[key].append(val) else: