From 700988307099f7f8cfd3f4e82a7ba778bc9bff86 Mon Sep 17 00:00:00 2001
From: Ross Cutler <46252169+rosscutler@users.noreply.github.com>
Date: Wed, 4 Jun 2025 14:50:34 -0700
Subject: [PATCH] Add regex documentation

---
 docs/results.md      | 15 ++++++++++-----
 src/result_parser.py | 41 ++++++++++++++++++++++-------------------
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/docs/results.md b/docs/results.md
index 0bf0127..69f9716 100644
--- a/docs/results.md
+++ b/docs/results.md
@@ -11,11 +11,16 @@ created in the first step ([preparation](preparation.md)).
     **Note**: In case there is possible to have a condition level aggregation in your dataset, uncomment the 
     `condition_pattern` and `condition_keys`.
     
-    **Note**: The `condition_pattern` specifies which part of the clip URL refers to the condition name/number that they are
-    representing. Clips with the same value on that position, are considered to belong to the same condition and votes 
-    assigned to them will be aggregated to create the `per_condition` report. Example: Assuming `D501_C03_M2_S02.wav` is 
-    a file name,and `03` is the condition name. The pattern should be set to `.*_c(?P<condition_num>\d{1,2})_.*.wav` , 
-    and the `condition_keys` to `condition_num`.
+    **Note**: The `condition_pattern` specifies which part of the clip URL identifies the condition name/number that the
+    clip represents. Clips with the same value in that position are considered to belong to the same condition, and the
+    votes assigned to them are aggregated to create the `per_condition` report.
+
+    The pattern follows Python regular expression syntax and should use **named
+    capture groups**. The names of the groups become column names in the reports.
+    Example: Assuming `D501_C03_M2_S02.wav` is a file name and `03` is the
+    condition identifier, set
+    `condition_pattern: .*_c(?P<condition_num>\d{1,2})_.*\.wav` and
+    `condition_keys` to `condition_num`.
    
     **Note**: You can activate the automatic outlier detection method per condition. To do so
     open `YOUR_PROJECT_NAME_ccr_result_parser.cfg`, in section `[accept_and_use]` add `outlier_removal: true`. The [z-score
diff --git a/src/result_parser.py b/src/result_parser.py
index e9e0112..9fcd103 100644
--- a/src/result_parser.py
+++ b/src/result_parser.py
@@ -1,9 +1,15 @@
 """
-/*---------------------------------------------------------------------------------------------
+/*-------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
-*--------------------------------------------------------------------------------------------*/
+*-------------------------------------------------------------------------------*/
 @author: Babak Naderi
+
+This module parses MTurk result files and aggregates the votes per clip or per
+condition. Condition names can be automatically derived from the clip URL or file
+name using regular expressions. Set the desired pattern via the ``condition_pattern``
+option in the configuration file. The pattern must contain named capture groups
+whose values will appear in the generated reports.
 """
 
 import csv
@@ -1283,10 +1289,19 @@ def write_dict_as_csv(dic_to_write, file_name, *args, **kwargs):
 
 
 def conv_filename_to_condition(f_name):
-    """
-    extract the condition name from filename given the mask in the config
-    :param f_name:
-    :return:
+    """Return condition information extracted from ``f_name``.
+
+    The regular expression defined by ``condition_pattern`` in the configuration
+    file is applied to the file name. The pattern should contain named capture
+    groups so that their values can be used as columns in the final reports. If
+    the pattern does not match the given file name the dictionary
+    ``{"Unknown": "NoCondition"}`` is returned.
+
+    Example
+    -------
+    >>> config['general']['condition_pattern'] = r".*_c(?P<cond>\\d{2})_.*\\.wav"
+    >>> conv_filename_to_condition("D501_C03_M2_S02.wav")
+    {'cond': '03'}
     """
     if f_name in file_to_condition_map:
         return file_to_condition_map[f_name]
@@ -1577,7 +1592,7 @@ def calc_payment_stat(df):
         median_time_in_sec = df[word_duration_col].median()
     
     payment_text = df['Reward'].values[0]
-    paymnet = re.findall("\d+\.\d+", payment_text)
+    paymnet = re.findall(r"\d+\.\d+", payment_text)
 
     avg_pay = 3600*float(paymnet[0])/median_time_in_sec
     
@@ -1591,18 +1606,6 @@ def calc_stats(input_file):
     :param input_file:
     :return:
     """
-    """
-    df = pd.read_csv(input_file, low_memory=False)
-    median_time_in_sec = df["WorkTimeInSeconds"].median()
-    payment_text = df['Reward'].values[0]
-    paymnet = re.findall("\d+\.\d+", payment_text)
-
-    avg_pay = 3600*float(paymnet[0])/median_time_in_sec
-    formatted_time = time.strftime("%M:%S", time.gmtime(median_time_in_sec))
-    print(
-        f"Stats: work duration (median) {formatted_time} (MM:SS), payment per hour: ${avg_pay:.2f}"
-    )
-    """
     df = pd.read_csv(input_file, low_memory=False)
     df_full = df.copy()
     overall_time, overall_pay = calc_payment_stat(df)