From 5cc2b93d3e41b57942e4184389e37c66d2a9cd53 Mon Sep 17 00:00:00 2001
From: Devang Thakkar <devang01121995@gmail.com>
Date: Sun, 14 May 2023 06:09:50 +0000
Subject: [PATCH 1/4] replace base split by regex split to fix ; inside quotes

---
 gffutils/parser.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/gffutils/parser.py b/gffutils/parser.py
index 0d82889..db8d963 100644
--- a/gffutils/parser.py
+++ b/gffutils/parser.py
@@ -216,7 +216,11 @@ def _unquote_quals(quals, dialect):
         if dialect["trailing semicolon"]:
             keyval_str = keyval_str.rstrip(";")
 
-        parts = keyval_str.split(dialect["field separator"])
+        # adding regex to split by separator instead of base split
+        # adapted from https://stackoverflow.com/a/2787979/7182397
+
+        # parts = keyval_str.split(dialect["field separator"])
+        parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
 
         kvsep = dialect["keyval separator"]
         if dialect["leading semicolon"]:
@@ -288,7 +292,11 @@ def _unquote_quals(quals, dialect):
     # GFF3 works with no spaces.
     # So split on the first one we can recognize...
     for sep in (" ; ", "; ", ";"):
-        parts = keyval_str.split(sep)
+        # adding regex to split by separator instead of base split
+        # adapted from https://stackoverflow.com/a/2787979/7182397
+
+        # parts = keyval_str.split(sep)
+        parts = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
         if len(parts) > 1:
             dialect["field separator"] = sep
             break

From 817bbe3a3526cc170fef8828c276f4fd10c15f4b Mon Sep 17 00:00:00 2001
From: Devang Thakkar <devang01121995@gmail.com>
Date: Mon, 10 Jul 2023 01:49:12 +0000
Subject: [PATCH 2/4] create a new dialect token to avoid slowing down
 everything

---
 gffutils/constants.py |  6 ++++++
 gffutils/helpers.py   |  2 +-
 gffutils/parser.py    | 34 ++++++++++++++++++++++------------
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/gffutils/constants.py b/gffutils/constants.py
index 901e714..3cc2077 100644
--- a/gffutils/constants.py
+++ b/gffutils/constants.py
@@ -127,6 +127,12 @@
     # vs
     #   ID=001; Name=gene1
     "field separator": ";",
+    # Quoted values may contain semicolons that a plain split would break on, e.g.,
+    #
+    #   note "Evidence 1a: Function1, Function2"
+    # vs
+    #   note "Evidence 1a: Function; PubMedId: 123, 456"
+    "semicolon_in_quotes": False,
     # Usually "=" for GFF3; " " for GTF, e.g.,
     #
     #   gene_id "GENE1"
diff --git a/gffutils/helpers.py b/gffutils/helpers.py
index 0e2f430..b68414c 100644
--- a/gffutils/helpers.py
+++ b/gffutils/helpers.py
@@ -35,7 +35,7 @@ def infer_dialect(attributes):
     -------
     Dictionary representing the inferred dialect
     """
-    attributes, dialect = parser._split_keyvals(attributes)
+    attributes, dialect = parser._split_keyvals(attributes, infer_dialect_call=True)
     return dialect
 
 
diff --git a/gffutils/parser.py b/gffutils/parser.py
index db8d963..5dff08e 100644
--- a/gffutils/parser.py
+++ b/gffutils/parser.py
@@ -174,7 +174,7 @@ def sort_key(x):
 # TODO:
 # Cythonize -- profiling shows that the bulk of the time is spent on this
 # function...
-def _split_keyvals(keyval_str, dialect=None):
+def _split_keyvals(keyval_str, dialect=None, infer_dialect_call=False):
     """
     Given the string attributes field of a GFF-like line, split it into an
     attributes dictionary and a "dialect" dictionary which contains information
@@ -186,6 +186,11 @@ def _split_keyvals(keyval_str, dialect=None):
     attribute string.
 
     Otherwise, use the provided dialect (and return it at the end).
+
+    Set `infer_dialect_call` to True only when this function is called from
+    helpers.infer_dialect() purely to obtain the dialect. Only then is the
+    slower quote-aware regex split (PR #215) run to detect semicolons inside
+    quotes, so that regular parsing is not slowed down by the detection.
     """
 
     def _unquote_quals(quals, dialect):
@@ -216,11 +221,10 @@ def _unquote_quals(quals, dialect):
         if dialect["trailing semicolon"]:
             keyval_str = keyval_str.rstrip(";")
 
-        # adding regex to split by separator instead of base split
-        # adapted from https://stackoverflow.com/a/2787979/7182397
-
-        # parts = keyval_str.split(dialect["field separator"])
-        parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
+        if dialect["semicolon_in_quotes"]:
+            parts = re.split(f'''{dialect["field separator"]}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
+        else:
+            parts = keyval_str.split(dialect["field separator"])
 
         kvsep = dialect["keyval separator"]
         if dialect["leading semicolon"]:
@@ -292,13 +296,15 @@ def _unquote_quals(quals, dialect):
     # GFF3 works with no spaces.
     # So split on the first one we can recognize...
     for sep in (" ; ", "; ", ";"):
-        # adding regex to split by separator instead of base split
-        # adapted from https://stackoverflow.com/a/2787979/7182397
-
-        # parts = keyval_str.split(sep)
-        parts = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
+        # Run the slower quote-aware regex only for helpers.infer_dialect()
+        parts = keyval_str.split(sep)
+        parts_regex = parts
+        if infer_dialect_call:
+            parts_regex = re.split(f'''{sep}(?=(?:[^"]|"[^"]*")*$)''', keyval_str)
         if len(parts) > 1:
             dialect["field separator"] = sep
+            if parts != parts_regex:
+                dialect["semicolon_in_quotes"] = True
             break
 
     # Is it GFF3?  They have key-vals separated by "="
@@ -354,7 +360,11 @@ def _unquote_quals(quals, dialect):
             # strings
             # quals[key].extend([v for v in val.split(',') if v])
 
-            # See issue #198, where 
+            # See issue #198, where commas within a description can incorrectly
+            # cause the dialect inference to conclude that there are not
+            # repeated keys.
+            #
+            # More description in PR #208.
             if dialect["repeated keys"]:
                 quals[key].append(val)
             else:

From 0eb9617fab0d92dddbcb5e6e274a8d53c1f3132e Mon Sep 17 00:00:00 2001
From: Devang Thakkar <devang01121995@gmail.com>
Date: Mon, 10 Jul 2023 01:53:20 +0000
Subject: [PATCH 3/4] undo code duplication

---
 gffutils/parser.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/gffutils/parser.py b/gffutils/parser.py
index 5dff08e..2332400 100644
--- a/gffutils/parser.py
+++ b/gffutils/parser.py
@@ -360,11 +360,7 @@ def _unquote_quals(quals, dialect):
             # strings
             # quals[key].extend([v for v in val.split(',') if v])
 
-            # See issue #198, where commas within a description can incorrectly
-            # cause the dialect inference to conclude that there are not
-            # repeated keys.
-            #
-            # More description in PR #208.
+            # See issue #198, where
             if dialect["repeated keys"]:
                 quals[key].append(val)
             else:

From 8543575e03c93831203acac86988cc57565c6849 Mon Sep 17 00:00:00 2001
From: Devang Thakkar <devang01121995@gmail.com>
Date: Mon, 10 Jul 2023 01:54:19 +0000
Subject: [PATCH 4/4] restore original trailing space in issue #198 comment

---
 gffutils/parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffutils/parser.py b/gffutils/parser.py
index 2332400..e64aa42 100644
--- a/gffutils/parser.py
+++ b/gffutils/parser.py
@@ -360,7 +360,7 @@ def _unquote_quals(quals, dialect):
             # strings
             # quals[key].extend([v for v in val.split(',') if v])
 
-            # See issue #198, where
+            # See issue #198, where 
             if dialect["repeated keys"]:
                 quals[key].append(val)
             else: