From 71a5d5d8ff5355b7b6fa943941d0c41054b0426b Mon Sep 17 00:00:00 2001 From: debelatesfaye Date: Tue, 4 Mar 2025 15:31:05 +0000 Subject: [PATCH] Allow underscores and hyphens in speaker names matched by dialog_turns --- src/turninator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/turninator.py b/src/turninator.py index 76b56f2..ea8a6ae 100644 --- a/src/turninator.py +++ b/src/turninator.py @@ -28,11 +28,11 @@ def dialog_turns_old(self, text: str): return re.findall(r'([A-Za-z ]+):((?:.|\n)*?)(?=(?:^[A-Za-z ]+:)|\Z)', text) def dialog_turns(self, text: str): - '''Extract dialog turns from input text using regex.''' - # Remove any HTML tags - text = re.sub('<.*?>', '', text, flags=re.DOTALL) - # Regular expression to capture speaker and their corresponding text - return re.findall(r'([A-Za-z0-9 ]+)\s*:\s*((?:.|\n)*?)(?=\n[A-Za-z0-9 ]+\s*:\s*|\Z)', text) + '''Extract dialog turns from input text using regex.''' + # Remove any HTML tags + text = re.sub('<.*?>', '', text, flags=re.DOTALL) + # Regular expression to capture speaker and their corresponding text + return re.findall(r'([A-Za-z0-9_ -]+)\s*:\s*((?:.|\n)*?)(?=\n[A-Za-z0-9_ -]+\s*:\s*|\Z)', text) def monolog_text(self, text: str) -> str: '''Extract the entire text if monolog.'''