def dialog_turns(self, text: str):
    '''Extract dialog turns from input text using regex.

    HTML tags are removed first. A turn is a speaker label followed by a
    colon and the speaker's text. A speaker label may contain ASCII
    letters, digits, underscores, hyphens and spaces. The text of a turn
    runs until the next line that starts with a speaker label and a
    colon, or until the end of the input, so it may span several lines.

    Args:
        text: Raw transcript, optionally containing HTML markup.

    Returns:
        A list of ``(speaker, text)`` tuples in order of appearance; an
        empty list when no ``speaker: text`` pattern is found. Speaker
        labels are returned without surrounding whitespace.
    '''
    # Remove any HTML tags (DOTALL so a tag broken across lines is removed too).
    text = re.sub('<.*?>', '', text, flags=re.DOTALL)
    # Capture the speaker label and the corresponding text. With DOTALL,
    # ``.*?`` also crosses newlines, which replaces the slower ``(?:.|\n)*?``
    # alternation without changing what is matched.
    turns = re.findall(
        r'([A-Za-z0-9_ -]+)\s*:\s*(.*?)(?=\n[A-Za-z0-9_ -]+\s*:\s*|\Z)',
        text,
        flags=re.DOTALL,
    )
    # The speaker character class includes the space character, so padding
    # around the label (e.g. "Alice : hi") ends up inside the captured group.
    # Trim it so the same speaker always yields the same label.
    return [(speaker.strip(), utterance) for speaker, utterance in turns]