From 700988307099f7f8cfd3f4e82a7ba778bc9bff86 Mon Sep 17 00:00:00 2001 From: Ross Cutler <46252169+rosscutler@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:50:34 -0700 Subject: [PATCH] Add regex documentation --- docs/results.md | 15 ++++++++++----- src/result_parser.py | 41 ++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/docs/results.md b/docs/results.md index 0bf0127..69f9716 100644 --- a/docs/results.md +++ b/docs/results.md @@ -11,11 +11,16 @@ created in the first step ([preparation](preparation.md)). **Note**: In case there is possible to have a condition level aggregation in your dataset, uncomment the `condition_pattern` and `condition_keys`. - **Note**: The `condition_pattern` specifies which part of the clip URL refers to the condition name/number that they are - representing. Clips with the same value on that position, are considered to belong to the same condition and votes - assigned to them will be aggregated to create the `per_condition` report. Example: Assuming `D501_C03_M2_S02.wav` is - a file name,and `03` is the condition name. The pattern should be set to `.*_c(?P<condition_num>\d{1,2})_.*.wav` , - and the `condition_keys` to `condition_num`. + **Note**: The `condition_pattern` specifies which part of the clip URL refers to the condition name/number that they are + representing. Clips with the same value on that position, are considered to belong to the same condition and votes + assigned to them will be aggregated to create the `per_condition` report. + + The pattern follows Python regular expression syntax and should use **named + capture groups**. The names of the groups become column names in the reports. + Example: Assuming `D501_C03_M2_S02.wav` is a file name and `03` is the + condition identifier, set + `condition_pattern: .*_c(?P<condition_num>\d{1,2})_.*\.wav` and + `condition_keys` to `condition_num`. **Note**: You can activate the automatic outlier detection method per condition. 
To do so open `YOUR_PROJECT_NAME_ccr_result_parser.cfg`, in section `[accept_and_use]` add `outlier_removal: true`. The [z-score diff --git a/src/result_parser.py b/src/result_parser.py index e9e0112..9fcd103 100644 --- a/src/result_parser.py +++ b/src/result_parser.py @@ -1,9 +1,15 @@ """ -/*--------------------------------------------------------------------------------------------- +/*------------------------------------------------------------------------------- * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See License.txt in the project root for license information. -*--------------------------------------------------------------------------------------------*/ +*-------------------------------------------------------------------------------*/ @author: Babak Naderi + +This module parses MTurk result files and aggregates the votes per clip or per +condition. Condition names can be automatically derived from the clip URL or file +name using regular expressions. Set the desired pattern via the ``condition_pattern`` +option in the configuration file. The pattern must contain named capture groups +whose values will appear in the generated reports. """ import csv @@ -1283,10 +1289,19 @@ def write_dict_as_csv(dic_to_write, file_name, *args, **kwargs): def conv_filename_to_condition(f_name): - """ - extract the condition name from filename given the mask in the config - :param f_name: - :return: + """Return condition information extracted from ``f_name``. + + The regular expression defined by ``condition_pattern`` in the configuration + file is applied to the file name. The pattern should contain named capture + groups so that their values can be used as columns in the final reports. If + the pattern does not match the given file name the dictionary + ``{"Unknown": "NoCondition"}`` is returned. 
+ + Example + ------- + >>> config['general']['condition_pattern'] = r".*_c(?P<cond>\\d{2})_.*\\.wav" + >>> conv_filename_to_condition("D501_C03_M2_S02.wav") + {'cond': '03'} """ if f_name in file_to_condition_map: return file_to_condition_map[f_name] @@ -1577,7 +1592,7 @@ def calc_payment_stat(df): median_time_in_sec = df[word_duration_col].median() payment_text = df['Reward'].values[0] - paymnet = re.findall("\d+\.\d+", payment_text) + paymnet = re.findall(r"\\d+\\.\\d+", payment_text) avg_pay = 3600*float(paymnet[0])/median_time_in_sec @@ -1591,18 +1606,6 @@ def calc_stats(input_file): :param input_file: :return: """ - """ - df = pd.read_csv(input_file, low_memory=False) - median_time_in_sec = df["WorkTimeInSeconds"].median() - payment_text = df['Reward'].values[0] - paymnet = re.findall("\d+\.\d+", payment_text) - - avg_pay = 3600*float(paymnet[0])/median_time_in_sec - formatted_time = time.strftime("%M:%S", time.gmtime(median_time_in_sec)) - print( - f"Stats: work duration (median) {formatted_time} (MM:SS), payment per hour: ${avg_pay:.2f}" - ) - """ df = pd.read_csv(input_file, low_memory=False) df_full = df.copy() overall_time, overall_pay = calc_payment_stat(df)