From f553711779ebbd9f7a2ffec4bac21a6dd313152c Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Tue, 11 Apr 2023 16:26:37 +0200 Subject: [PATCH 01/65] Initial Malania 2007 commit with preliminary threshold metric, the benchmark, packaging and a few tests. --- brainscore/benchmarks/malania2007.py | 152 +++++++++ brainscore/benchmarks/screen.py | 13 +- brainscore/metrics/threshold.py | 301 ++++++++++++++++++ packaging/malania2007/__init__.py | 0 .../malania2007/malania_data_assembly.py | 102 ++++++ packaging/malania2007/malania_stimulus_set.py | 71 +++++ tests/test_benchmarks/test_malania2007.py | 44 +++ 7 files changed, 679 insertions(+), 4 deletions(-) create mode 100644 brainscore/benchmarks/malania2007.py create mode 100644 brainscore/metrics/threshold.py create mode 100644 packaging/malania2007/__init__.py create mode 100644 packaging/malania2007/malania_data_assembly.py create mode 100644 packaging/malania2007/malania_stimulus_set.py create mode 100644 tests/test_benchmarks/test_malania2007.py diff --git a/brainscore/benchmarks/malania2007.py b/brainscore/benchmarks/malania2007.py new file mode 100644 index 000000000..0fc1c825a --- /dev/null +++ b/brainscore/benchmarks/malania2007.py @@ -0,0 +1,152 @@ +import numpy as np +from scipy.optimize import least_squares +from scipy.stats import t +import xarray as xr + +import brainscore +from brainio.assemblies import walk_coords +from brainscore.benchmarks import BenchmarkBase +from brainscore.benchmarks.screen import place_on_screen +from brainscore.metrics import Score +from brainscore.metrics.accuracy import Accuracy +from brainscore.metrics.distribution_similarity import BootstrapDistributionSimilarity +from brainscore.metrics.threshold import ThresholdElevation +from brainscore.model_interface import BrainModel +from brainscore.utils import LazyLoad + +BIBTEX = """@article{malania2007, + author = {Malania, Maka and Herzog, Michael H. and Westheimer, Gerald}, + title = "{Grouping of contextual elements that affect vernier thresholds}", + journal = {Journal of Vision}, + volume = {7}, + number = {2}, + pages = {1-1}, + year = {2007}, + issn = {1534-7362}, + doi = {10.1167/7.2.1}, + url = {https://doi.org/10.1167/7.2.1} + }""" + +BASELINE_CONDITION = 'vernier-only' +DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', 'long-2', 'equal-16', 'long-16'] +# Values in NUM_FLANKERS_PER_CONDITION denote the condition (i.e., in this case the number of flankers) to be selected +# This is kept track of simply because the benchmark uses threshold elevation - i.e., a comparison of 2 conditions +NUM_FLANKERS_PER_CONDITION = {'short-2': 2, 'short-4': 4, 'short-6': 6, 'short-8': 8, + 'short-16': 16, 'equal-2': 2, 'long-2': 2, 'equal-16': 16, + 'long-16': 16, 'vernier-only': 0} + + +for dataset in DATASETS: + # behavioral benchmark + identifier = f"Malania_{dataset.replace('-', '')}" + globals()[identifier] = lambda dataset=dataset: _Malania2007Base(dataset) + + +class _Malania2007Base(BenchmarkBase): + """ + INFORMATION: + + Benchmark DATASETS should be considered as independent. This means that participant-specific across-condition data + should only ever be compared using the 'subject_unique_id'. In some conditions (short-2, vernier_only, short-16) + an additional observer was added from the original paper's plots. This is because in these conditions, two + experiments were independently conducted, and 1 additional observer that was non-overlapping between the + experiments was added to the aggregate benchmark. 
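+
+    Each of the nine DATASETS above is scored as a threshold elevation relative to the shared 'vernier-only'
+    baseline condition (see BASELINE_CONDITION and NUM_FLANKERS_PER_CONDITION above).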
+
+    While humans and models perform the same testing task in this benchmark, the benchmark makes a number of
+    choices that deviate slightly from the human experiment. These deviations are listed below, alongside the
+    reason for each departure and what the 'precisely faithful' alternative would be.
+
+    Benchmark Choices:
+
+    1) The number and type of fitting stimuli are choices with no direct experimental counterpart. Currently, the
+    number of fitting stimuli is chosen to be relatively small, but sufficient for good decoding performance in the
+    baseline condition.
+        - Precisely faithful alternative: Present text instructions to models as they were presented to humans.
+            * Why not this alternative? Since the experiment is about early visual perception, and there are currently
+              few/no models capable of a task like this, it would not be interesting.
+        - Somewhat faithful alternative: Present a smaller number of training stimuli, motivated by work like
+          Lee & DiCarlo (2023), bioRxiv (https://doi.org/10.1101/2022.12.31.522402).
+            * Why not this alternative? Since the experiment is not about perceptual learning but about early visual
+              perception, and there are few/no models capable of a task like this, it would not be interesting.
+    2) In the human experiment, stimuli were presented at exactly the foveal position. In the model experiment,
+    testing stimuli are presented at exactly the foveal position ±72 arcsec = 0.02 deg.
+        * Why this alternative? Since most models evaluated are test-time deterministic, we want a more precise
+          estimate of the threshold than a point estimate. Since human microsaccades over small distances (e.g., up
+          to 360 arcsec = 6 arcmin = 0.1 deg) are generally uncontrolled and cannot be controlled for, we believe
+          the tiny jitter of 0.02 deg has no impact at all on the comparison under study, while improving the
+          precision of threshold estimates.
+
+    """
+    def __init__(self, condition: str):
+        self.baseline_condition = BASELINE_CONDITION
+        self.condition = condition
+
+        # since this benchmark compares threshold elevation against a baseline, we omit one subject
+        # in some conditions in which that subject did not perform both the baseline and the test
+        # condition
+        baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition))
+        condition_assembly = LazyLoad(lambda: load_assembly(self.condition))
+        self._assembly, self._baseline_assembly = remove_subjects_with_nans(condition_assembly,
+                                                                            baseline_assembly)
+
+        self._assemblies = {'baseline_assembly': self._baseline_assembly,
+                            'condition_assembly': self._assembly}
+        # the fitting stimuli are packaged as a StimulusSet rather than an assembly
+        self._fitting_stimuli = brainscore.get_stimulus_set(f'Malania2007_{self.condition}_fit')
+
+        self._metric = ThresholdElevation(independent_variable='vernier_offset',
+                                          baseline_condition=self.baseline_condition,
+                                          test_condition=self.condition,
+                                          threshold_accuracy=0.75)
+        self._ceiling = self._metric.ceiling(self._assemblies)
+
+        self._visual_degrees = 3 # the precise number is 2.66667 but place_on_screen expects Optional[int]?
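+        # models are assumed to be deterministic at test time, so a single presentation per stimulus suffices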
+        self._number_of_trials = 1
+
+        super(_Malania2007Base, self).__init__(
+            identifier=f'Malania2007_{condition}', version=1,
+            ceiling_func=lambda: self._metric.ceiling(self._assemblies),
+            parent='Malania2007',
+            bibtex=BIBTEX)
+
+    def __call__(self, candidate: BrainModel):
+        model_response = {}
+        candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli)
+        for condition in (self.baseline_condition, self.condition):
+            stimulus_set = place_on_screen(
+                self._assembly.stimulus_set.sel(num_flankers=NUM_FLANKERS_PER_CONDITION[condition]),
+                target_visual_degrees=candidate.visual_degrees(),
+                source_visual_degrees=self._visual_degrees
+            )
+            # record the model's choice probabilities for each condition
+            model_response[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
+
+        raw_score = self._metric(model_response, self._assemblies)
+
+        # Adjust score to ceiling
+        ceiling = self._ceiling
+        score = raw_score / ceiling.sel(aggregation='center')
+
+        # cap score at 1 if ceiled score > 1
+        if score[(score['aggregation'] == 'center')] > 1:
+            score.__setitem__({'aggregation': score['aggregation'] == 'center'}, 1)
+
+        score.attrs['raw'] = raw_score
+        score.attrs['ceiling'] = ceiling
+        return score
+
+
+def load_assembly(dataset):
+    assembly = brainscore.get_assembly(f'Malania2007_{dataset}')
+    return assembly
+
+
+def remove_subjects_with_nans(condition_assembly, baseline_assembly):
+    # Find the indices of the subjects with NaN values in the first PropertyAssembly
+    nan_subjects = np.isnan(condition_assembly.values)
+
+    # Convert the boolean array to a DataArray with the same coordinates as the input assemblies
+    nan_subjects_da = xr.DataArray(nan_subjects, coords=condition_assembly.coords, dims=condition_assembly.dims)
+
+    # Filter out the subjects with NaN values from both PropertyAssemblies
+    filtered_condition_assembly = condition_assembly.where(~nan_subjects_da, drop=True)
+    filtered_baseline_assembly = baseline_assembly.where(~nan_subjects_da, drop=True)
+
+    return filtered_condition_assembly, filtered_baseline_assembly
\ No newline at end of file
diff --git a/brainscore/benchmarks/screen.py b/brainscore/benchmarks/screen.py
index 9b9b992d1..20fb8158c 100644
--- a/brainscore/benchmarks/screen.py
+++ b/brainscore/benchmarks/screen.py
@@ -3,6 +3,7 @@
 """
 import copy
 import logging
+from typing import Union
 import os
 
 import numpy as np
@@ -18,7 +19,9 @@
 _logger = logging.getLogger(__name__)
 
 
-def place_on_screen(stimulus_set: StimulusSet, target_visual_degrees: int, source_visual_degrees: int = None):
+def place_on_screen(stimulus_set: StimulusSet,
+                    target_visual_degrees: Union[int, float],
+                    source_visual_degrees: Union[int, float, None] = None):
     _logger.debug(f"Converting {stimulus_set.identifier} to {target_visual_degrees} degrees")
 
     assert source_visual_degrees or 'degrees' in stimulus_set, \
@@ -42,8 +45,10 @@ def _determine_visual_degrees(visual_degrees, stimulus_set):
 
 @store(identifier_ignore=['stimulus_set'])
 def _place_on_screen(stimuli_identifier: str, stimulus_set: StimulusSet,
-                     target_visual_degrees: int, source_visual_degrees: int = None):
-    converted_stimuli_id = f"{stimuli_identifier}--target{target_visual_degrees}--source{source_visual_degrees}"
+                     target_visual_degrees: Union[int, float], source_visual_degrees: Union[int, float, None] = None):
+    source_degrees_formatted = f"{source_visual_degrees}" if source_visual_degrees is None \
+        else f"{source_visual_degrees:.2f}"  # make sure we do not try to print a None with 2 decimal places
+    converted_stimuli_id = f"{stimuli_identifier}--target{target_visual_degrees:.2f}--source{source_degrees_formatted}"
     source_visual_degrees = _determine_visual_degrees(source_visual_degrees, stimulus_set)
     target_dir = root_path / converted_stimuli_id
@@ -69,7 +74,7 @@ class ImageConverter:
     def __init__(self, target_dir):
         self._target_dir = Path(target_dir)
 
-    def convert_image(self, image_path, source_degrees, target_degrees):
+    def convert_image(self, image_path, source_degrees: Union[int, float], target_degrees: Union[int, float]):
         if source_degrees == target_degrees:
             return image_path
         ratio = target_degrees / source_degrees
diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py
new file mode 100644
index 000000000..e947f1945
--- /dev/null
+++ b/brainscore/metrics/threshold.py
@@ -0,0 +1,301 @@
+import itertools
+from typing import Dict, Union, Literal
+
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.stats import norm
+from scipy.special import erf, erfinv
+
+from brainscore.metrics import Metric, Score
+from brainscore.metrics.ceiling import SplitHalfConsistency
+from brainio.assemblies import PropertyAssembly, BehavioralAssembly, DataAssembly
+from brainio.stimuli import StimulusSet
+
+
+def cumulative_gaussian(x, mu, sigma):
+    return 0.5 * (1 + erf((x - mu) / (sigma * np.sqrt(2))))
+
+
+def inverse_cumulative_gaussian(x_point, mu, sigma):
+    return np.sqrt(2) * sigma * erfinv(2 * x_point - 1) + mu
+
+
+class Threshold(Metric):
+    """
+    Computes a psychometric threshold from model responses and compares it against thresholds computed from
+    human responses.
+
+    The model comparison to human data is currently individual-subject based, i.e., models (and the ceiling) are
+    scored by the mean distance of the model threshold to the individual human thresholds.
+    """
+    def __init__(self,
+                 independent_variable,
+                 fit_function=cumulative_gaussian,
+                 fit_inverse_function=inverse_cumulative_gaussian,
+                 threshold_accuracy: Union[Literal['inflection'], float] = 'inflection',
+                 scoring: Union[Literal['individual'], Literal['pool']] = 'individual'
+                 ):
+        self.fit_function = fit_function
+        self.fit_inverse_function = fit_inverse_function
+        self._independent_variable = independent_variable
+        self.threshold_accuracy = threshold_accuracy
+        self.scoring = scoring
+
+    def __call__(self, source: Union[np.array, float], target: Union[list, PropertyAssembly]) -> Score:
+        """
+        :param source: Either a np.array containing model responses to individual stimuli, or a pre-computed threshold
+            as a float.
+        :param target: Either a list containing human thresholds (for the ceiling function & ThresholdElevation),
+            or a PropertyAssembly.
+        :return: A Score containing the evaluated model's distance to target thresholds in units of multiples of
+            the human score.
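+            A score of 1 means the model threshold exactly matches the human threshold(s); the score decreases
+            linearly with relative distance and is floored at 0.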
+ """ + # compute threshold from measurements if the input is not a threshold already + if not isinstance(source, float): + source_threshold = self.compute_threshold(source, self._independent_variable) + else: + source_threshold = source + + if source_threshold == 'fit_fail': + return Score([0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) + + # compare threshold to target thresholds + if self.scoring == 'pool': + return self.pool_score(source_threshold, target) + elif self.scoring == 'individual': + return self.individual_score(source_threshold, target) + else: + raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') + + def ceiling(self, assembly: PropertyAssembly): + """ + :param assembly: the human PropertyAssembly containing human responses + :return: Score object with coords center (ceiling) and error (STD) + """ + # compare threshold to target thresholds + if self.scoring == 'pool': + return self.pool_ceiling(assembly) + elif self.scoring == 'individual': + return self.individual_ceiling(assembly) + else: + raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') + + def pool_ceiling(self, assembly: PropertyAssembly): + # Still not super sure what a logical pooled ceiling here is - some split-half procedure like in + # 'https://github.com/brain-score/brain-score/blob/ + # 9fbf4eda24d081c0ec7bc4d7b5572d8c13dc92d2/brainscore/metrics/image_level_behavior.py#L92' + # likely makes sense, but is quite problematic with the small amount of target data available in most + # thresholding studies. + raise NotImplementedError + + def individual_ceiling(self, assembly: PropertyAssembly): + """ + Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against + a pool of the other values. + + Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ + jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. 
+ + :param assembly: + :return: + """ + human_thresholds: list = assembly.values.tolist() + scores = [] + for i in range(len(human_thresholds)): + random_state = np.random.RandomState(i) + random_human_score = random_state.choice(human_thresholds, replace=False) + metric = Threshold(self._independent_variable, self.fit_function, self.fit_inverse_function, + self.threshold_accuracy) + human_thresholds.remove(random_human_score) + score = metric(random_human_score, human_thresholds) + score = float(score[(score['aggregation'] == 'center')].values) + human_thresholds.append(random_human_score) + scores.append(score) + + ceiling, ceiling_error = np.mean(scores), np.std(scores) + ceiling = Score([ceiling, ceiling_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) + return ceiling + + def compute_threshold(self, source, independent_variable): + assert len(source.values) == len(source[independent_variable].values) + + x_points = source[independent_variable].values + accuracies = self.convert_proba_to_correct(source) + fit_params = self.fit_threshold_function(x_points, accuracies) + if (type(fit_params) == str) and (fit_params == 'fit_fail'): + return fit_params + + if self.threshold_accuracy == 'inflection': + self.threshold_accuracy = self.inflection_accuracy(x_points, fit_params) + + threshold = self.find_threshold(self.threshold_accuracy, fit_params) + #plot_psychometric_curve(fit_params[0], fit_params[1], scatter=(x_points, accuracies)) + return threshold + + def fit_threshold_function(self, x_points, y_points): + initial_guess = [np.mean(x_points), np.mean(x_points)] + try: + fit = curve_fit(self.fit_function, x_points, y_points, p0=initial_guess) + # curve_fit returns a ndarray of which the 0th element are the optimized parameters + params = fit[0].flatten() + return params + except RuntimeError: + print('Model threshold fit unsuccessful. This is likely because of the model outputting the same value ' + 'for every input.') + return 'fit_fail' + + def find_threshold(self, threshold_accuracy, fit_params): + threshold = self.fit_inverse_function(threshold_accuracy, *fit_params) + return threshold + + def inflection_accuracy(self, x_points, fit_params): + """ + A function that finds the accuracy at the inflection point of the fit function. Useful if you do not care + about the specific threshold accuracy, but rather about e.g. the elevation at the inflection point. 
+ """ + max_fit_accuracy = self.fit_function(np.max(x_points), *fit_params) + min_fit_accuracy = self.fit_function(np.min(x_points), *fit_params) + threshold_accuracy = min_fit_accuracy + (max_fit_accuracy - min_fit_accuracy) / 2 + return threshold_accuracy + + @staticmethod + def individual_score(source_threshold, target): + raw_scores = [] + for human_threshold in target: + raw_score = max((1 - ((np.abs(human_threshold - source_threshold)) / human_threshold)), 0) + raw_scores.append(raw_score) + + raw_score, model_error = np.mean(raw_scores), np.std(raw_scores) + raw_score = Score([raw_score, model_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) + return raw_score + + @staticmethod + def pool_score(source_threshold, target): + if not isinstance(target, list): + human_mean = np.mean(target.values) + else: + human_mean = np.mean(target) + raw_score = max((1 - ((np.abs(human_mean - source_threshold)) / human_mean)), 0) + raw_score = Score([raw_score], coords={'aggregation': ['center', ]}, dims=['aggregation']) + return raw_score + + @staticmethod + def convert_proba_to_correct(source): + decisions = np.argmax(source.values, axis=1) + correct = [] + for presentation, decision in enumerate(decisions): + if source['choice'].values[decision] == source['image_label'].values[presentation]: + correct.append(1) + else: + correct.append(0) + return np.array(correct) + + @staticmethod + def remove_data_after_asymptote(x): + """NOTE: CURRENTLY NOT IN USE""" + # reverse array to get the last occurrence of the max in case of duplicate maxes + last_max_index = np.argmax(x[::-1]) + return x[:last_max_index] + + +class ThresholdElevation(Threshold): + def __init__(self, + independent_variable: str, + baseline_condition: str, + test_condition: str, + threshold_accuracy: Union[Literal['inflection'], float] = 'inflection', + scoring: Union[Literal['individual'], Literal['pool']] = 'individual' + ): + super(ThresholdElevation, self).__init__(independent_variable) + self.baseline_threshold_metric = Threshold(self._independent_variable, + threshold_accuracy=threshold_accuracy) + self.test_threshold_metric = Threshold(self._independent_variable, + threshold_accuracy=threshold_accuracy) + self.baseline_condition = baseline_condition + self.test_condition = test_condition + self.threshold_accuracy = threshold_accuracy + self.scoring = scoring + + def __call__(self, + source: Union[float, Dict[str, np.array]], + target: Union[list, Dict[str, PropertyAssembly]] + ) -> Score: + if isinstance(source, Dict): + source_baseline_threshold = self.baseline_threshold_metric.compute_threshold(source[self.baseline_condition], + self._independent_variable) + if self.threshold_accuracy == 'inflection': + self.test_threshold_metric.threshold_accuracy = self.baseline_threshold_metric.threshold_accuracy + source_test_threshold = self.test_threshold_metric.compute_threshold(source[self.test_condition], + self._independent_variable) + raw_source_threshold_elevation = source_test_threshold / source_baseline_threshold + else: + raw_source_threshold_elevation = source + + if isinstance(target, Dict): + target_threshold_elevations = self.compute_threshold_elevations(target) + else: + target_threshold_elevations = target + + # compare threshold to target thresholds + if self.scoring == 'pool': + return self.pool_score(raw_source_threshold_elevation, target_threshold_elevations) + elif self.scoring == 'individual': + return self.individual_score(raw_source_threshold_elevation, target_threshold_elevations) + else: 
+ raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') + + def ceiling(self, assemblies: Dict[str, PropertyAssembly]): + if self.scoring == 'pool': + return self.pool_ceiling(assemblies) + elif self.scoring == 'individual': + return self.individual_ceiling(assemblies) + else: + raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') + + def pool_ceiling(self, assemblies: Dict[str, PropertyAssembly]): + # Still not super sure what a logical pooled ceiling here is - some split-half procedure like in + # 'https://github.com/brain-score/brain-score/blob/ + # 9fbf4eda24d081c0ec7bc4d7b5572d8c13dc92d2/brainscore/metrics/image_level_behavior.py#L92' + # likely makes sense, but is quite problematic with the small amount of target data available in most + # thresholding studies. + raise NotImplementedError + + def individual_ceiling(self, assemblies: Dict[str, PropertyAssembly]): + """ + Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against + a pool of the other values. + + Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ + jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. + + :param assemblies: + :return: + """ + baseline_assembly = assemblies['baseline_assembly'] + condition_assembly = assemblies['condition_assembly'] + human_threshold_elevations = list(condition_assembly.values / baseline_assembly.values) + scores = [] + for i in range(len(human_threshold_elevations)): + random_state = np.random.RandomState(i) + random_human_score = random_state.choice(human_threshold_elevations, replace=False) + metric = ThresholdElevation(self._independent_variable, self.baseline_condition, self.test_condition, + self.threshold_accuracy, self.scoring) + human_threshold_elevations.remove(random_human_score) + score = metric(random_human_score, human_threshold_elevations) + score = float(score[(score['aggregation'] == 'center')].values) + human_threshold_elevations.append(random_human_score) + scores.append(score) + + ceiling, ceiling_error = np.mean(scores), np.std(scores) + ceiling = Score([ceiling, ceiling_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) + return ceiling + + def compute_threshold_elevations(self, assemblies: Dict[str, PropertyAssembly]) -> list: + baseline_assembly = assemblies['baseline_assembly'] + condition_assembly = assemblies['condition_assembly'] + threshold_elevations = [] + for i, baseline_threshold in baseline_assembly.values: + condition_threshold = condition_assembly[i] + threshold_elevations.append(condition_threshold / baseline_threshold) + return threshold_elevations + diff --git a/packaging/malania2007/__init__.py b/packaging/malania2007/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packaging/malania2007/malania_data_assembly.py b/packaging/malania2007/malania_data_assembly.py new file mode 100644 index 000000000..756702cbf --- /dev/null +++ b/packaging/malania2007/malania_data_assembly.py @@ -0,0 +1,102 @@ +from pathlib import Path +import numpy as np +import xarray as xr + +from brainio.assemblies import PropertyAssembly +from brainio.packaging import package_data_assembly +import pandas as pd + + +DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only'] +NUM_SUBJECTS = {'short-2': 6, + 'short-4': 5, + 'short-6': 5, + 'short-8': 5, + 'short-16': 6, + 'equal-2': 5, + 
'long-2': 5, + 'equal-16': 5, + 'long-16': 5, + 'vernier-only': 6} + + +def collect_malania_data_assembly(root_directory, dataset): + """ + Experiment Information: + ... todo + """ + # construct the assembly + metadata_directory = Path(f'{root_directory}/{dataset}/metadata_human.xlsx') + metadata = pd.read_excel(metadata_directory) + # Since subjects are uniquely held using 'unique_subject_id', drop the rows with a subject + # without measurement + assembly = PropertyAssembly(metadata['threshold'], + coords={ + 'subject_unique_id': ('subject', metadata['subject_unique_id']) + }, + dims=['subject'] + ) + + # give the assembly an identifier name + assembly.name = f'Malania2007_{dataset}' + + # test subject numbers after removing the NaN subject + metadata = metadata.dropna(subset=['threshold'], axis=0) + assert len(metadata) == NUM_SUBJECTS[dataset] + + return assembly + + +def return_local_data_assembly(dataset): + root_directory = Path(r'./malania2007_data_assembly') + assembly = collect_malania_data_assembly(root_directory, dataset) + return assembly + + +def remove_subjects_with_nans(assembly1, assembly2): + # Find the indices of the subjects with NaN values in the first PropertyAssembly + nan_subjects = np.isnan(assembly1.values) + + # Convert the boolean array to a DataArray with the same coordinates as the input assemblies + nan_subjects_da = xr.DataArray(nan_subjects, coords=assembly1.coords, dims=assembly1.dims) + + # Filter out the subjects with NaN values from both PropertyAssemblies + filtered_assembly1 = assembly1.where(~nan_subjects_da, drop=True) + filtered_assembly2 = assembly2.where(~nan_subjects_da, drop=True) + + return filtered_assembly1, filtered_assembly2 + + +# def get_local_ceilings(): +# from brainscore.metrics.threshold import ThresholdElevation +# ceilings = {} +# for dataset in DATASETS: +# baseline_assembly = return_local_data_assembly('vernier-only') +# condition_assembly = return_local_data_assembly(dataset) +# +# condition_assembly, baseline_assembly = remove_subjects_with_nans(condition_assembly, baseline_assembly) +# +# assemblies = {'baseline_assembly': baseline_assembly, +# 'condition_assembly': condition_assembly} +# metric = ThresholdElevation(independent_variable='vernier_offset', +# baseline_condition='vernier-only', +# test_condition=dataset, +# threshold_accuracy=0.75) +# ceiling = metric.individual_ceiling(assemblies) +# ceilings[dataset] = ceiling +# print(ceilings) +# # compute the average ceiling for every condition except the baseline-baseline condition +# mean = np.mean([xarray.values[0] for xarray in ceilings.values()][:-1]) +# print(mean) + + +if __name__ == '__main__': + # get_local_ceilings() + root_directory = Path(r'./malania2007_data_assembly') + for dataset in DATASETS: + assembly = collect_malania_data_assembly(root_directory, dataset) + # upload to S3 + # package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, + # stimulus_set_identifier=f"Malania2007_{dataset}", + # assembly_class="BehavioralAssembly", bucket_name="brainio-brainscore") \ No newline at end of file diff --git a/packaging/malania2007/malania_stimulus_set.py b/packaging/malania2007/malania_stimulus_set.py new file mode 100644 index 000000000..70d0e451f --- /dev/null +++ b/packaging/malania2007/malania_stimulus_set.py @@ -0,0 +1,71 @@ +import csv +from pathlib import Path +from brainio.stimuli import StimulusSet +from brainio.packaging import package_stimulus_set + + +# every stimulus set is separate, incl. 
baseline condition +STIMULUS_SETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only', 'short-2_fit', + 'short-4_fit', 'short-6_fit', 'short-8_fit', 'short-16_fit', + 'equal-2_fit', 'long-2_fit', 'equal-16_fit', 'long-16_fit'] +DATASET_LENGTHS = {'test': 588, 'fit': 432} + + +def collect_malania_stimulus_set(root_directory, dataset): + """ + Dataset Meta Info + ... todo + """ + stimuli = [] + image_paths = {} + + dataset_type = 'fit' if dataset[-3:] == 'fit' else 'test' + metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') + image_directory = Path(f'{root_directory}/{dataset}/images') + with open(metadata_directory, 'r') as metadata: + reader = csv.DictReader(metadata) + for row in reader: + stimuli.append({ + 'image_size_x': int(row['image_size_x']), + 'image_size_y': int(row['image_size_y']), + 'image_size_c': int(row['image_size_c']), + 'image_size_degrees': float(row['image_size_degrees']), + 'vernier_height': float(row['vernier_height']), + 'vernier_offset': float(row['vernier_offset']), + 'image_label': row['image_label'], + 'flanker_height': float(row['flanker_height']), + 'flanker_spacing': float(row['flanker_spacing']), + 'line_width': float(row['line_width']), + 'flanker_distance': float(row['flanker_distance']), + 'num_flankers': int(row['num_flankers']), + 'vernier_position_x': int(row['vernier_position_x']), + 'vernier_position_y': int(row['vernier_position_y']), + 'filename': row['filename'], + 'stimulus_id': int(row['stimulus_id']) + }) + image_paths[int(row['stimulus_id'])] = Path(f'{image_directory}/{row["filename"]}') + + stimuli = StimulusSet(stimuli) + stimuli.image_paths = image_paths + stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name + + # Ensure expected number of stimuli in datasets + assert len(stimuli) == DATASET_LENGTHS[dataset_type] + return stimuli + + +def return_local_stimulus_set(dataset): + root_directory = Path(r'./malania2007_stimulus_set') + stimuli = collect_malania_stimulus_set(root_directory, dataset) + return stimuli + + +if __name__ == '__main__': + root_directory = Path(r'./malania2007_stimulus_set') + for stimulus_set in STIMULUS_SETS: + stimuli = collect_malania_stimulus_set(root_directory, stimulus_set) + + # upload to S3 + # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, + # bucket_name="brainio-brainscore") diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py new file mode 100644 index 000000000..237ecaea8 --- /dev/null +++ b/tests/test_benchmarks/test_malania2007.py @@ -0,0 +1,44 @@ +from pathlib import Path + +import numpy as np +import pytest +from pytest import approx + +from brainio.assemblies import BehavioralAssembly +from brainscore import benchmark_pool +from brainscore.benchmarks.malania2007 import DATASETS +from tests.test_benchmarks import PrecomputedFeatures + + +class TestBehavioral: + def test_count(self): + assert len(DATASETS) == 5 + 2 + 2 + + @pytest.mark.parametrize('dataset', DATASETS) + def test_in_pool(self, dataset): + identifier = f"Malania2007_{dataset.replace('-', '')}" + assert identifier in benchmark_pool + + def test_mean_ceiling(self): + benchmarks = [f"Malania2007_{dataset.replace('-', '')}" for dataset in DATASETS] + benchmarks = [benchmark_pool[benchmark] for benchmark in benchmarks] + ceilings = [benchmark.ceiling.sel(aggregation='center') for benchmark in benchmarks] + mean_ceiling = np.mean(ceilings) + 
assert mean_ceiling == approx(0.5618048355142616, abs=0.001)
+
+    @pytest.mark.parametrize('dataset, expected_ceiling', [
+        ('short-2', approx(0.78719345, abs=0.001)),
+        ('short-4', approx(0.49998989, abs=0.001)),
+        ('short-6', approx(0.50590051, abs=0.001)),
+        ('short-8', approx(0.4426336, abs=0.001)),
+        ('short-16', approx(0.8383443, abs=0.001)),
+        ('equal-2', approx(0.56664015, abs=0.001)),
+        ('long-2', approx(0.46470421, abs=0.001)),
+        ('equal-16', approx(0.44087153, abs=0.001)),
+        ('long-16', approx(0.50996587, abs=0.001))
+    ])
+    def test_dataset_ceiling(self, dataset, expected_ceiling):
+        benchmark = f"Malania2007_{dataset.replace('-', '')}"
+        benchmark = benchmark_pool[benchmark]
+        ceiling = benchmark.ceiling
+        assert ceiling.sel(aggregation='center').values.item() == expected_ceiling

From 661a3aaa784ad9615162706292988e041c39b191 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 12 Apr 2023 09:37:18 +0200
Subject: [PATCH 02/65] Add correct visual angle

---
 brainscore/benchmarks/malania2007.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brainscore/benchmarks/malania2007.py b/brainscore/benchmarks/malania2007.py
index 0fc1c825a..674b660aa 100644
--- a/brainscore/benchmarks/malania2007.py
+++ b/brainscore/benchmarks/malania2007.py
@@ -99,7 +99,7 @@ def __init__(self, condition: str):
                                           threshold_accuracy=0.75)
         self._ceiling = self._metric.ceiling(self._assemblies)
 
-        self._visual_degrees = 3 # the precise number is 2.66667 but place_on_screen expects Optional[int]?
+        self._visual_degrees = 2.66667
         # models are assumed to be deterministic at test time, so a single presentation per stimulus suffices
         self._number_of_trials = 1
 
         super(_Malania2007Base, self).__init__(

From d2aaf5268f558196bf0fb9fefff78a8a690e2b9d Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 12 Apr 2023 11:57:31 +0200
Subject: [PATCH 03/65] clean up threshold metric file

---
 brainscore/metrics/threshold.py | 152 +++++++++++++++++++------------
 1 file changed, 92 insertions(+), 60 deletions(-)

diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py
index e947f1945..eb2fe164e 100644
--- a/brainscore/metrics/threshold.py
+++ b/brainscore/metrics/threshold.py
@@ -12,11 +12,13 @@
 from brainio.stimuli import StimulusSet
 
 
-def cumulative_gaussian(x, mu, sigma):
+def cumulative_gaussian(x: np.array, mu: float, sigma: float) -> float:
+    """The cumulative gaussian function."""
     return 0.5 * (1 + erf((x - mu) / (sigma * np.sqrt(2))))
 
 
-def inverse_cumulative_gaussian(x_point, mu, sigma):
+def inverse_cumulative_gaussian(x_point: float, mu: float, sigma: float) -> float:
+    """Inverts the cumulative_gaussian function."""
     return np.sqrt(2) * sigma * erfinv(2 * x_point - 1) + mu
 
 
@@ -41,23 +43,24 @@ def __init__(self,
         self.threshold_accuracy = threshold_accuracy
         self.scoring = scoring
 
-    def __call__(self, source: Union[np.array, float], target: Union[list, PropertyAssembly]) -> Score:
+    def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, PropertyAssembly]) -> Score:
         """
-        :param source: Either a np.array containing model responses to individual stimuli, or a pre-computed threshold
-            as a float.
+        :param source: Either a BehavioralAssembly containing model responses to individual stimuli, or a pre-computed
+            threshold as a float.
         :param target: Either a list containing human thresholds (for the ceiling function & ThresholdElevation),
             or a PropertyAssembly.
-        :return: A Score containing the evaluated model's distance to target thresholds in units of multiples of
-            the human score.
-            A score of 1 means the model threshold exactly matches the human threshold(s); the score decreases
-            linearly with relative distance and is floored at 0.
+ :return: A Score containing the evaluated model's ceiling-adjusted distance to target thresholds. """ # compute threshold from measurements if the input is not a threshold already - if not isinstance(source, float): + if isinstance(source, float): + source_threshold = source + elif isinstance(source, BehavioralAssembly): source_threshold = self.compute_threshold(source, self._independent_variable) + # check whether the psychometric function fit was successful - if not, return a score of 0 + if source_threshold == 'fit_fail': + return Score([0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) else: - source_threshold = source - - if source_threshold == 'fit_fail': - return Score([0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) + raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') # compare threshold to target thresholds if self.scoring == 'pool': @@ -67,12 +70,14 @@ def __call__(self, source: Union[np.array, float], target: Union[list, PropertyA else: raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') - def ceiling(self, assembly: PropertyAssembly): + def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]]) -> Score: """ - :param assembly: the human PropertyAssembly containing human responses + Selects the appropriate ceiling to be computed from target assembly data. + + :param assembly: the human PropertyAssembly containing human responses, or a dict containing the + PropertyAssemblies of the ThresholdElevation metric. :return: Score object with coords center (ceiling) and error (STD) """ - # compare threshold to target thresholds if self.scoring == 'pool': return self.pool_ceiling(assembly) elif self.scoring == 'individual': @@ -95,9 +100,6 @@ def individual_ceiling(self, assembly: PropertyAssembly): Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. - - :param assembly: - :return: """ human_thresholds: list = assembly.values.tolist() scores = [] @@ -116,7 +118,8 @@ def individual_ceiling(self, assembly: PropertyAssembly): ceiling = Score([ceiling, ceiling_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) return ceiling - def compute_threshold(self, source, independent_variable): + def compute_threshold(self, source: BehavioralAssembly, independent_variable: str) -> float: + """Converts the source BehavioralAssembly to a threshold float value.""" assert len(source.values) == len(source[independent_variable].values) x_points = source[independent_variable].values @@ -129,10 +132,16 @@ def compute_threshold(self, source, independent_variable): self.threshold_accuracy = self.inflection_accuracy(x_points, fit_params) threshold = self.find_threshold(self.threshold_accuracy, fit_params) - #plot_psychometric_curve(fit_params[0], fit_params[1], scatter=(x_points, accuracies)) return threshold - def fit_threshold_function(self, x_points, y_points): + def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Union[np.array, str]: + """ + A function that takes the x and y-points of the measured variable and handles the fitting of the + psychometric threshold function. + + Returns either the fit parameters for self.fit_function or a string tag that indicates the failure + of the psychometric curve fit. 
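+
+        The fit is a non-linear least-squares fit of self.fit_function via scipy.optimize.curve_fit, started from
+        an initial guess of mu = sigma = mean(x_points).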
+ """ initial_guess = [np.mean(x_points), np.mean(x_points)] try: fit = curve_fit(self.fit_function, x_points, y_points, p0=initial_guess) @@ -144,11 +153,15 @@ def fit_threshold_function(self, x_points, y_points): 'for every input.') return 'fit_fail' - def find_threshold(self, threshold_accuracy, fit_params): + def find_threshold(self, threshold_accuracy: float, fit_params: Tuple[float, ...]) -> float: + """ + A function that uses the inverse fit function to find the value of the threshold in terms of + the independent variable (self._independent_variable). + """ threshold = self.fit_inverse_function(threshold_accuracy, *fit_params) return threshold - def inflection_accuracy(self, x_points, fit_params): + def inflection_accuracy(self, x_points: np.array, fit_params: np.array) -> float: """ A function that finds the accuracy at the inflection point of the fit function. Useful if you do not care about the specific threshold accuracy, but rather about e.g. the elevation at the inflection point. @@ -159,10 +172,15 @@ def inflection_accuracy(self, x_points, fit_params): return threshold_accuracy @staticmethod - def individual_score(source_threshold, target): + def individual_score(source: float, target: Union[list, PropertyAssembly]) -> Score: + """ + Computes the average distance of the source from each of the individual targets in units of the + individual targets. This is generally a more stringent scoring method than pool_score, aimed + to measure the average of the individual target effects. + """ raw_scores = [] - for human_threshold in target: - raw_score = max((1 - ((np.abs(human_threshold - source_threshold)) / human_threshold)), 0) + for target_value in target: + raw_score = max((1 - ((np.abs(target_value - source)) / target_value)), 0) raw_scores.append(raw_score) raw_score, model_error = np.mean(raw_scores), np.std(raw_scores) @@ -170,17 +188,23 @@ def individual_score(source_threshold, target): return raw_score @staticmethod - def pool_score(source_threshold, target): + def pool_score(source: float, target: Union[list, PropertyAssembly]) -> Score: + """ + Computes the distance of the source from the average of the target in units of the target average. + This is generally a less stringent scoring method than individual_score, aimed to measure the average + target effect. 
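+
+        The raw score is max(0, 1 - |mean(target) - source| / mean(target)).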
+        """
         if not isinstance(target, list):
-            human_mean = np.mean(target.values)
+            target_mean = np.mean(target.values)
         else:
-            human_mean = np.mean(target)
-        raw_score = max((1 - ((np.abs(human_mean - source_threshold)) / human_mean)), 0)
+            target_mean = np.mean(target)
+        raw_score = max((1 - ((np.abs(target_mean - source)) / target_mean)), 0)
         raw_score = Score([raw_score], coords={'aggregation': ['center', ]}, dims=['aggregation'])
         return raw_score
 
     @staticmethod
-    def convert_proba_to_correct(source):
+    def convert_proba_to_correct(source: BehavioralAssembly) -> np.array:
+        """Converts the probability values returned by models doing probability tasks to behavioral choices."""
         decisions = np.argmax(source.values, axis=1)
         correct = []
         for presentation, decision in enumerate(decisions):
@@ -190,15 +214,14 @@ def convert_proba_to_correct(source):
                 correct.append(0)
         return np.array(correct)
 
-    @staticmethod
-    def remove_data_after_asymptote(x):
-        """NOTE: CURRENTLY NOT IN USE"""
-        # reverse array to get the last occurrence of the max in case of duplicate maxes
-        last_max_index = np.argmax(x[::-1])
-        return x[:last_max_index]
-
 
 class ThresholdElevation(Threshold):
+    """
+    Computes a threshold elevation from two conditions, a baseline condition and a test condition, by dividing
+    the threshold of the test condition by the threshold of the baseline condition. In other words,
+
+    `threshold_elevation = test_condition_threshold / baseline_condition_threshold`.
+    """
     def __init__(self,
                  independent_variable: str,
                  baseline_condition: str,
                  test_condition: str,
@@ -217,26 +240,43 @@ def __init__(self,
         self.baseline_condition = baseline_condition
         self.test_condition = test_condition
         self.threshold_accuracy = threshold_accuracy
         self.scoring = scoring
 
     def __call__(self,
-                 source: Union[float, Dict[str, np.array]],
+                 source: Union[float, Dict[str, BehavioralAssembly]],
                  target: Union[list, Dict[str, PropertyAssembly]]
                  ) -> Score:
+        """
+        :param source: Either a dictionary containing the BehavioralAssemblies for the test condition and the
+                       baseline condition, or a pre-computed float threshold elevation. If a Dict, its keys
+                       should be the names of the baseline and test conditions (self.baseline_condition and
+                       self.test_condition).
+        :param target: Either a dictionary containing the PropertyAssemblies for the test condition and the
+                       baseline condition, or a list of pre-computed threshold elevations. If Dict, Dict
+                       keys should be 'condition_assembly' and 'baseline_assembly' respectively.
+        :return: A score containing the evaluated model's ceiling-adjusted distance to target threshold elevations.
+        """
+        # check whether source is a threshold elevation already - if not, compute it.
+        if isinstance(source, float):
+            raw_source_threshold_elevation = source
+        elif isinstance(source, Dict):
             source_baseline_threshold = self.baseline_threshold_metric.compute_threshold(source[self.baseline_condition],
                                                                                           self._independent_variable)
+            # if using the inflection accuracy, get the inflection point from the baseline condition, and use that
+            # for the test condition.
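+            # this keeps the two conditions comparable: both thresholds are read out at the same accuracy level.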
if self.threshold_accuracy == 'inflection': self.test_threshold_metric.threshold_accuracy = self.baseline_threshold_metric.threshold_accuracy source_test_threshold = self.test_threshold_metric.compute_threshold(source[self.test_condition], self._independent_variable) raw_source_threshold_elevation = source_test_threshold / source_baseline_threshold else: - raw_source_threshold_elevation = source + raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') - if isinstance(target, Dict): + # check whether the targets are threshold elevations already - if not, compute them + if isinstance(target, list): + target_threshold_elevations = target + elif isinstance(target, Dict): target_threshold_elevations = self.compute_threshold_elevations(target) else: - target_threshold_elevations = target + raise TypeError(f'target is type {type(target)}, but type PropertyAssembly or list is required.') - # compare threshold to target thresholds + # compare threshold elevation to target threshold elevations if self.scoring == 'pool': return self.pool_score(raw_source_threshold_elevation, target_threshold_elevations) elif self.scoring == 'individual': @@ -244,14 +284,6 @@ def __call__(self, else: raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') - def ceiling(self, assemblies: Dict[str, PropertyAssembly]): - if self.scoring == 'pool': - return self.pool_ceiling(assemblies) - elif self.scoring == 'individual': - return self.individual_ceiling(assemblies) - else: - raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') - def pool_ceiling(self, assemblies: Dict[str, PropertyAssembly]): # Still not super sure what a logical pooled ceiling here is - some split-half procedure like in # 'https://github.com/brain-score/brain-score/blob/ @@ -267,13 +299,8 @@ def individual_ceiling(self, assemblies: Dict[str, PropertyAssembly]): Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. - - :param assemblies: - :return: """ - baseline_assembly = assemblies['baseline_assembly'] - condition_assembly = assemblies['condition_assembly'] - human_threshold_elevations = list(condition_assembly.values / baseline_assembly.values) + human_threshold_elevations = self.compute_threshold_elevations(assemblies) scores = [] for i in range(len(human_threshold_elevations)): random_state = np.random.RandomState(i) @@ -290,7 +317,13 @@ def individual_ceiling(self, assemblies: Dict[str, PropertyAssembly]): ceiling = Score([ceiling, ceiling_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) return ceiling - def compute_threshold_elevations(self, assemblies: Dict[str, PropertyAssembly]) -> list: + @staticmethod + def compute_threshold_elevations(assemblies: Dict[str, PropertyAssembly]) -> list: + """ + Computes the threshold elevations of a baseline condition and a test condition: + + `threshold_elevation = test_condition_threshold / baseline_condition_threshold`. 
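+
+        Elevations are computed subject-wise: the i-th threshold in the condition assembly is divided by the i-th
+        threshold in the baseline assembly.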
+ """ baseline_assembly = assemblies['baseline_assembly'] condition_assembly = assemblies['condition_assembly'] threshold_elevations = [] @@ -298,4 +331,3 @@ def compute_threshold_elevations(self, assemblies: Dict[str, PropertyAssembly]) condition_threshold = condition_assembly[i] threshold_elevations.append(condition_threshold / baseline_threshold) return threshold_elevations - From 0598170b828ce76a70c2799c67c5e30f1b9e0bc7 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 12 Apr 2023 12:22:57 +0200 Subject: [PATCH 04/65] Further threshold cleanup and ceiling compute verification --- brainscore/metrics/threshold.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py index eb2fe164e..183fe3166 100644 --- a/brainscore/metrics/threshold.py +++ b/brainscore/metrics/threshold.py @@ -1,15 +1,11 @@ -import itertools -from typing import Dict, Union, Literal +from typing import Dict, Union, Literal, Tuple import numpy as np from scipy.optimize import curve_fit -from scipy.stats import norm from scipy.special import erf, erfinv from brainscore.metrics import Metric, Score -from brainscore.metrics.ceiling import SplitHalfConsistency -from brainio.assemblies import PropertyAssembly, BehavioralAssembly, DataAssembly -from brainio.stimuli import StimulusSet +from brainio.assemblies import PropertyAssembly, BehavioralAssembly def cumulative_gaussian(x: np.array, mu: float, sigma: float) -> float: @@ -327,7 +323,7 @@ def compute_threshold_elevations(assemblies: Dict[str, PropertyAssembly]) -> lis baseline_assembly = assemblies['baseline_assembly'] condition_assembly = assemblies['condition_assembly'] threshold_elevations = [] - for i, baseline_threshold in baseline_assembly.values: - condition_threshold = condition_assembly[i] + for i, baseline_threshold in enumerate(baseline_assembly.values): + condition_threshold = condition_assembly.values[i] threshold_elevations.append(condition_threshold / baseline_threshold) return threshold_elevations From 9a66485c2f3bb09a4baf40112f87f15509237f15 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 12 Apr 2023 12:32:16 +0200 Subject: [PATCH 05/65] Remove python 3.8 requirement that came from typing.Literal --- brainscore/metrics/threshold.py | 36 ++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py index 183fe3166..07ac7eb1f 100644 --- a/brainscore/metrics/threshold.py +++ b/brainscore/metrics/threshold.py @@ -1,4 +1,4 @@ -from typing import Dict, Union, Literal, Tuple +from typing import Dict, Union, Tuple import numpy as np from scipy.optimize import curve_fit @@ -30,9 +30,22 @@ def __init__(self, independent_variable, fit_function=cumulative_gaussian, fit_inverse_function=inverse_cumulative_gaussian, - threshold_accuracy: Union[Literal['inflection'], float] = 'inflection', - scoring: Union[Literal['individual'], Literal['pool']] = 'individual' + threshold_accuracy: Union[str, float] = 'inflection', + scoring: str = 'individual' ): + """ + :param independent_variable: The independent variable in the benchmark that the threshold is computed + over. + :param fit_function: The function used to fit the threshold. + :param fit_inverse_function: The inverse of fit_function used to find the threshold from the fit. + :param threshold_accuracy: The accuracy at which the threshold should be evaluated at. 
This can be
+                                   either a string Literal['inflection'] or a float. When Literal['inflection']
+                                   is used, the function finds the inflection point of the curve and evaluates
+                                   the threshold at that level. When a float is used, the function evaluates
+                                   the threshold at that level.
+        :param scoring: The scoring function used to evaluate performance. Either Literal['individual'] or
+                        Literal['pool']. See the individual_score and pool_score methods for more information.
+        """
         self.fit_function = fit_function
         self.fit_inverse_function = fit_inverse_function
         self._independent_variable = independent_variable
         self.threshold_accuracy = threshold_accuracy
         self.scoring = scoring
@@ -222,9 +235,22 @@ def __init__(self,
                  independent_variable: str,
                  baseline_condition: str,
                  test_condition: str,
-                 threshold_accuracy: Union[Literal['inflection'], float] = 'inflection',
-                 scoring: Union[Literal['individual'], Literal['pool']] = 'individual'
+                 threshold_accuracy: Union[str, float] = 'inflection',
+                 scoring: str = 'individual'
                  ):
+        """
+        :param independent_variable: The independent variable in the benchmark that the threshold is computed
+                                     over.
+        :param baseline_condition: The baseline condition against which threshold elevation is measured.
+        :param test_condition: The test condition that is used to measure threshold elevation.
+        :param threshold_accuracy: The accuracy at which the threshold should be evaluated. This can be
+                                   either a string Literal['inflection'] or a float. When Literal['inflection']
+                                   is used, the function finds the inflection point of the curve and evaluates
+                                   the threshold at that level. When a float is used, the function evaluates
+                                   the threshold at that level.
+        :param scoring: The scoring function used to evaluate performance. Either Literal['individual'] or
+                        Literal['pool']. See the individual_score and pool_score methods for more information.
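+
+        For example, the Malania2007 benchmarks instantiate this metric with threshold_accuracy=0.75, so that
+        both the baseline and the test thresholds are read out at 75% accuracy.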
+ """ super(ThresholdElevation, self).__init__(independent_variable) self.baseline_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy) From d77642ae3e90d23901b885a5e0fe045977a87607 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 12 Apr 2023 13:36:11 +0200 Subject: [PATCH 06/65] clean up benchmark file --- brainscore/benchmarks/malania2007.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/brainscore/benchmarks/malania2007.py b/brainscore/benchmarks/malania2007.py index 674b660aa..f1c75d5d1 100644 --- a/brainscore/benchmarks/malania2007.py +++ b/brainscore/benchmarks/malania2007.py @@ -1,15 +1,11 @@ +from typing import Tuple import numpy as np -from scipy.optimize import least_squares -from scipy.stats import t import xarray as xr import brainscore -from brainio.assemblies import walk_coords +from brainio.assemblies import PropertyAssembly from brainscore.benchmarks import BenchmarkBase from brainscore.benchmarks.screen import place_on_screen -from brainscore.metrics import Score -from brainscore.metrics.accuracy import Accuracy -from brainscore.metrics.distribution_similarity import BootstrapDistributionSimilarity from brainscore.metrics.threshold import ThresholdElevation from brainscore.model_interface import BrainModel from brainscore.utils import LazyLoad @@ -32,8 +28,8 @@ # Values in NUM_FLANKERS_PER_CONDITION denote the condition (i.e., in this case the number of flankers) to be selected # This is kept track of simply because the benchmark uses threshold elevation - i.e., a comparison of 2 conditions NUM_FLANKERS_PER_CONDITION = {'short-2': 2, 'short-4': 4, 'short-6': 6, 'short-8': 8, - 'short-16': 16, 'equal-2': 2, 'long-2': 2, 'equal-16': 16, - 'long-16': 16, 'vernier-only': 0} + 'short-16': 16, 'equal-2': 2, 'long-2': 2, 'equal-16': 16, + 'long-16': 16, 'vernier-only': 0} for dataset in DATASETS: @@ -133,12 +129,14 @@ def __call__(self, candidate: BrainModel): return score -def load_assembly(dataset): +def load_assembly(dataset: str) -> PropertyAssembly: assembly = brainscore.get_assembly(f'Malania2007_{dataset}') return assembly -def remove_subjects_with_nans(condition_assembly, baseline_assembly): +def remove_subjects_with_nans(condition_assembly: PropertyAssembly, + baseline_assembly: PropertyAssembly + ) -> Tuple[PropertyAssembly, PropertyAssembly]: # Find the indices of the subjects with NaN values in the first PropertyAssembly nan_subjects = np.isnan(condition_assembly.values) @@ -149,4 +147,4 @@ def remove_subjects_with_nans(condition_assembly, baseline_assembly): filtered_condition_assembly = condition_assembly.where(~nan_subjects_da, drop=True) filtered_baseline_assembly = baseline_assembly.where(~nan_subjects_da, drop=True) - return filtered_condition_assembly, filtered_baseline_assembly \ No newline at end of file + return filtered_condition_assembly, filtered_baseline_assembly From 2113d2e8c199f17975b4240e5fbef7ea12a2ea1f Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 12 Apr 2023 14:16:37 +0200 Subject: [PATCH 07/65] add assembly, stimulus, and benchmark general tests --- tests/test_assemblies.py | 63 +++++++++++++- tests/test_benchmarks/test___init__.py | 13 ++- tests/test_stimuli.py | 113 +++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 2 deletions(-) diff --git a/tests/test_assemblies.py b/tests/test_assemblies.py index 6f1c222ef..021c850f4 100644 --- a/tests/test_assemblies.py +++ b/tests/test_assemblies.py @@ -64,6 +64,16 @@ 
     'brendel.Geirhos2021_stylized',
     'brendel.Geirhos2021_sketch',
     'brendel.Geirhos2021_uniform-noise',
+    'Malania2007_short-2',
+    'Malania2007_short-4',
+    'Malania2007_short-6',
+    'Malania2007_short-8',
+    'Malania2007_short-16',
+    'Malania2007_equal-2',
+    'Malania2007_long-2',
+    'Malania2007_equal-16',
+    'Malania2007_long-16',
+    'Malania2007_vernier-only',
 ))
 def test_list_assembly(assembly):
     l = brainio.list_assemblies()
@@ -120,6 +130,16 @@ def test_list_assembly(assembly):
     pytest.param('brendel.Geirhos2021_stylized', marks=[pytest.mark.private_access]),
     pytest.param('brendel.Geirhos2021_sketch', marks=[pytest.mark.private_access]),
     pytest.param('brendel.Geirhos2021_uniform-noise', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_short-2', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_short-4', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_short-6', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_short-8', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_short-16', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_equal-2', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_long-2', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_equal-16', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_long-16', marks=[pytest.mark.private_access]),
+    pytest.param('Malania2007_vernier-only', marks=[pytest.mark.private_access]),
 ])
 def test_existence(assembly_identifier):
     assert brainio.get_assembly(assembly_identifier) is not None
@@ -481,4 +501,45 @@ def test_fields_present_abnormal_sets(self, identifier, field):
     ])
     def test_fields_present_cue_conflict(self, identifier, field):
         assembly = brainscore.get_assembly(f"brendel.Geirhos2021_{identifier}")
-        assert hasattr(assembly, field)
\ No newline at end of file
+        assert hasattr(assembly, field)
+
+
+class TestMalania2007:
+
+    # test the number of subjects:
+    @pytest.mark.parametrize('identifier, num_subjects', [
+        ('short-2', 6),
+        ('short-4', 5),
+        ('short-6', 5),
+        ('short-8', 5),
+        ('short-16', 6),
+        ('equal-2', 5),
+        ('long-2', 5),
+        ('equal-16', 5),
+        ('long-16', 5),
+        ('vernier-only', 6)
+    ])
+    def test_num_subjects(self, identifier, num_subjects):
+        assembly = brainscore.get_assembly(f"Malania2007_{identifier}")
+        assembly = assembly.where(~np.isnan(assembly.values), drop=True)
+        assert len(np.unique(assembly['subject'].values)) == num_subjects
+
+    # test assembly coords present in all 10 sets:
+    @pytest.mark.parametrize('identifier', [
+        'short-2',
+        'short-4',
+        'short-6',
+        'short-8',
+        'short-16',
+        'equal-2',
+        'long-2',
+        'equal-16',
+        'long-16',
+        'vernier-only',
+    ])
+    @pytest.mark.parametrize('field', [
+        'subject'
+    ])
+    def test_fields_present(self, identifier, field):
+        assembly = brainscore.get_assembly(f"Malania2007_{identifier}")
+        assert hasattr(assembly, field)
diff --git a/tests/test_benchmarks/test___init__.py b/tests/test_benchmarks/test___init__.py
index 5f41c0f70..276d6433c 100644
--- a/tests/test_benchmarks/test___init__.py
+++ b/tests/test_benchmarks/test___init__.py
@@ -96,6 +96,15 @@ def test_exact_evaluation_pool(self):
         'brendel.Geirhos2021stylized-error_consistency',
         'brendel.Geirhos2021sketch-error_consistency',
         'brendel.Geirhos2021uniformnoise-error_consistency',
+        'Malania2007_short-2',
+        'Malania2007_short-4',
+        'Malania2007_short-6',
+        'Malania2007_short-8',
+        'Malania2007_short-16',
+        'Malania2007_equal-2',
+        'Malania2007_long-2',
+        'Malania2007_equal-16',
'Malania2007_long-16', } def test_engineering_pool(self): @@ -548,7 +557,9 @@ class TestNumberOfTrials: 'dicarlo.SanghaviMurty2020.IT-pls', 'dicarlo.Kar2019-ost', # behavior - 'dicarlo.Rajalingham2018-i2n', # Geirhos2021 are single-trial, i.e. not included here + 'dicarlo.Rajalingham2018-i2n', + # Geirhos2021 are single-trial, i.e. not included here + # Malania2007 are single-trial, i.e. not included here ]) def test_repetitions(self, benchmark_identifier): """ Tests that benchmarks have repetitions in the stimulus_set """ diff --git a/tests/test_stimuli.py b/tests/test_stimuli.py index e362a91d0..d75dffec8 100644 --- a/tests/test_stimuli.py +++ b/tests/test_stimuli.py @@ -58,6 +58,24 @@ 'brendel.Geirhos2021_stylized', 'brendel.Geirhos2021_sketch', 'brendel.Geirhos2021_uniform-noise', + 'Malania2007_short-2', + 'Malania2007_short-4', + 'Malania2007_short-6', + 'Malania2007_short-8', + 'Malania2007_short-16', + 'Malania2007_equal-2', + 'Malania2007_long-2', + 'Malania2007_equal-16', + 'Malania2007_long-16', + 'Malania2007_short-2_fit', + 'Malania2007_short-4_fit', + 'Malania2007_short-6_fit', + 'Malania2007_short-8_fit', + 'Malania2007_short-16_fit', + 'Malania2007_equal-2_fit', + 'Malania2007_long-2_fit', + 'Malania2007_equal-16_fit', + 'Malania2007_long-16_fit', )) def test_list_stimulus_set(stimulus_set): l = brainio.list_stimulus_sets() @@ -248,4 +266,99 @@ def test_fields_present3(self, identifier, field): assert hasattr(stimulus_set, field) +@pytest.mark.slow +class TestMalania2007: + # test stimulus_set data: + @pytest.mark.parametrize('identifier', [ + 'short-2', + 'short-4', + 'short-6', + 'short-8', + 'short-16', + 'equal-2', + 'long-2', + 'equal-16', + 'long-16', + 'short-2_fit', + 'short-4_fit', + 'short-6_fit', + 'short-8_fit', + 'short-16_fit', + 'equal-2_fit', + 'long-2_fit', + 'equal-16_fit', + 'long-16_fit', + ]) + def test_stimulus_set_exist(self, identifier): + full_name = f"Malania2007_{identifier}" + stimulus_set = brainio.get_stimulus_set(full_name) + assert stimulus_set is not None + assert stimulus_set.identifier == full_name + + # test the number of images + @pytest.mark.parametrize('identifier, num_images', [ + ('short-2', 588), + ('short-4', 588), + ('short-6', 588), + ('short-8', 588), + ('short-16', 588), + ('equal-2', 588), + ('long-2', 588), + ('equal-16', 588), + ('long-16', 588), + ('short-2_fit', 432), + ('short-4_fit', 432), + ('short-6_fit', 432), + ('short-8_fit', 432), + ('short-16_fit', 432), + ('equal-2_fit', 432), + ('long-2_fit', 432), + ('equal-16_fit', 432), + ('long-16_fit', 432), + ]) + def test_num_images(self, identifier, num_images): + stimulus_set = brainscore.get_stimulus_set(f"Malania2007_{identifier}") + assert len(np.unique(stimulus_set['stimulus_id'].values)) == num_images + # tests stimulus_set coords for the 14 "normal" sets: + @pytest.mark.parametrize('identifier', [ + 'short-2', + 'short-4', + 'short-6', + 'short-8', + 'short-16', + 'equal-2', + 'long-2', + 'equal-16', + 'long-16', + 'short-2_fit', + 'short-4_fit', + 'short-6_fit', + 'short-8_fit', + 'short-16_fit', + 'equal-2_fit', + 'long-2_fit', + 'equal-16_fit', + 'long-16_fit', + ]) + @pytest.mark.parametrize('field', [ + 'image_size_x', + 'image_size_y', + 'image_size_c', + 'image_size_degrees', + 'vernier_height', + 'vernier_offset', + 'image_label', + 'flanker_height', + 'flanker_spacing', + 'line_width', + 'flanker_distance', + 'num_flankers', + 'vernier_position_x', + 'vernier_position_y', + 'filename', + 'stimulus_id', + ]) + def test_fields_present(self, identifier, 
field): + stimulus_set = brainscore.get_stimulus_set(f"Malania2007_{identifier}") + assert hasattr(stimulus_set, field) From 63b4b164e37804b115ee7f3cff9d9fc0d57fbbb2 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 20 Apr 2023 16:18:06 +0200 Subject: [PATCH 08/65] resolve Islam2021 conflict --- tests/test_stimuli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stimuli.py b/tests/test_stimuli.py index d75dffec8..a0ac0da8e 100644 --- a/tests/test_stimuli.py +++ b/tests/test_stimuli.py @@ -76,6 +76,7 @@ 'Malania2007_long-2_fit', 'Malania2007_equal-16_fit', 'Malania2007_long-16_fit', + 'Islam2021', )) def test_list_stimulus_set(stimulus_set): l = brainio.list_stimulus_sets() From 15a897bc240dcda9deb3f5c9b6cb7ba918739dd7 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 31 May 2023 14:24:02 +0200 Subject: [PATCH 09/65] Add grid search, correct psychometric function, and pooled score --- brainscore/metrics/threshold.py | 285 ++++++++++++++++++++++++-------- 1 file changed, 219 insertions(+), 66 deletions(-) diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py index 07ac7eb1f..542a71151 100644 --- a/brainscore/metrics/threshold.py +++ b/brainscore/metrics/threshold.py @@ -1,21 +1,99 @@ -from typing import Dict, Union, Tuple +from typing import Dict, Union, Tuple, Optional, Callable import numpy as np -from scipy.optimize import curve_fit -from scipy.special import erf, erfinv +from scipy.optimize import minimize +from scipy.stats import norm from brainscore.metrics import Metric, Score from brainio.assemblies import PropertyAssembly, BehavioralAssembly +import matplotlib.pyplot as plt -def cumulative_gaussian(x: np.array, mu: float, sigma: float) -> float: - """The cumulative gaussian function.""" - return 0.5 * (1 + erf((x - mu) / (sigma * np.sqrt(2)))) +def wichmann_cum_gauss(x: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> float: + """ + The classic psychometric function as implemented in Wichmann & Hill (2001). The psychometric function: I. + Fitting, sampling, and goodness of fit, eq. 1. + + Parameters + ---------- + x: the independent variables of the data + alpha: the slope parameter + beta: the mean of the cdf parameter + lambda_: the lapse rate + gamma: the upper bound of the fit + """ + return gamma + (1 - gamma - lambda_) * norm.cdf(alpha * (x - beta)) + + +def inverse_wichmann_cum_gauss(y: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> float: + """The inverse of wichmann_cum_gauss.""" + return beta + (norm.ppf((y - gamma) / (1 - gamma - lambda_)) / alpha) + + +def wichmann_neg_log_likelihood(params: Tuple[float, ...], x: np.array, y: np.array) -> float: + """The negative log likelihood function for wichmann_cum_gauss.""" + alpha, beta, lambda_ = params + p = wichmann_cum_gauss(x, alpha, beta, lambda_) + logL = y * np.log(p) + (1 - y) * np.log(1 - p) + return -np.sum(logL) + + +def grid_search(x: np.array, + y: np.array, + alpha_values: np.array = np.logspace(-3, 1, 50), + beta_values: np.array = None, + fit_log_likelihood_fn: Callable = wichmann_neg_log_likelihood, + fit_bounds: Tuple = ((None, None), (None, None), (0.03, 0.5))) -> Tuple[float, ...]: + """ + A classic simplified procedure for running sparse grid search over the slope and mean parameters of the + psychometric function. 
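+
+    For example, a minimal illustrative sketch of the intended use (the offsets and accuracies below are
+    hypothetical):
+
+        >>> x = np.array([5., 10., 20., 40., 80.])  # e.g., vernier offsets in arcsec
+        >>> y = np.array([0.52, 0.61, 0.74, 0.88, 0.95])  # measured accuracy at each offset
+        >>> alpha, beta, lambda_ = grid_search(x, y)
+        >>> threshold = inverse_wichmann_cum_gauss(0.75, alpha, beta, lambda_)  # offset at 75% accuracy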
+
+    Parameters
+    ----------
+    x: the independent variables of the data
+    y: the measured accuracy rates for the given x-values
+    alpha_values: the alpha values for the chosen fit function to grid search over
+    beta_values: the beta values for the chosen fit function to grid search over
+    fit_log_likelihood_fn: the log likelihood function that computes the log likelihood of its corresponding
+        fit function
+    fit_bounds: the bounds assigned to the fit function called by fit_log_likelihood_fn.
+        The default fit_bounds are assigned as:
+        alpha: (None, None), to allow any slope
+        beta: (None, None), any inflection point is allowed, as that is controlled for in the Threshold class
+        lambda_: (0.03, 0.5), to require at least a small lapse rate, as is regularly done in human fitting
+
+    Returns
+    -------
+    the parameters of the best fit in the grid search
+    """
+    assert len(x) == len(y)
+    # Default the beta_values grid search to the measured x-points.
+    if beta_values is None:
+        beta_values = x
+
+    # initialize best values for a fit
+    best_alpha, best_beta, best_lambda = None, None, None
+    min_neg_log_likelihood = np.inf
+
+    for alpha_guess in alpha_values:
+        for beta_guess in beta_values:
+            initial_guess = np.array([alpha_guess, beta_guess, 1 - np.max(y)])  # lapse rate guess set to 1 minus the maximum accuracy
+
+            # wrap inside a RuntimeError block to catch the RuntimeError thrown by scipy.minimize if a fit
+            # entirely fails.
+            try:
+                result = minimize(fit_log_likelihood_fn, initial_guess, args=(x, y), method='L-BFGS-B', bounds=fit_bounds)
+                alpha_hat, beta_hat, lambda_hat = result.x
+                neg_log_likelihood_hat = fit_log_likelihood_fn([alpha_hat, beta_hat, lambda_hat], x, y)
+
+                if neg_log_likelihood_hat < min_neg_log_likelihood:
+                    min_neg_log_likelihood = neg_log_likelihood_hat
+                    best_alpha, best_beta, best_lambda = alpha_hat, beta_hat, lambda_hat
+            except RuntimeError:
+                pass

-def inverse_cumulative_gaussian(x_point: float, mu: float, sigma: float) -> float:
-    """Inverts the cumulative_gaussian function."""
-    return np.sqrt(2) * sigma * erfinv(2 * x_point - 1) + mu
+    return best_alpha, best_beta, best_lambda


 class Threshold(Metric):
@@ -27,11 +105,14 @@
     against the mean of the distance of the model threshold to human thresholds.
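
    A minimal usage sketch (illustrative only; `model_assembly` stands for a BehavioralAssembly of model
    responses and `human_thresholds` for a list of human threshold floats):

        metric = Threshold(independent_variable='vernier_offset', threshold_accuracy=0.75)
        score = metric(model_assembly, human_thresholds)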
""" def __init__(self, - independent_variable, - fit_function=cumulative_gaussian, - fit_inverse_function=inverse_cumulative_gaussian, + independent_variable: str, + fit_function=wichmann_cum_gauss, + fit_inverse_function=inverse_wichmann_cum_gauss, threshold_accuracy: Union[str, float] = 'inflection', - scoring: str = 'individual' + scoring: str = 'pool', + max_fit_params: Optional[Tuple[float, ...]] = None, + required_accuracy: Optional[float] = 0.6, + plot_fit: bool = False ): """ :param independent_variable: The independent variable in the benchmark that the threshold is computed @@ -51,6 +132,9 @@ def __init__(self, self._independent_variable = independent_variable self.threshold_accuracy = threshold_accuracy self.scoring = scoring + self.max_fit_params = max_fit_params + self.required_accuracy = required_accuracy + self.plot_fit = plot_fit def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, PropertyAssembly]) -> Score: """ @@ -67,7 +151,7 @@ def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, source_threshold = self.compute_threshold(source, self._independent_variable) # check whether the psychometric function fit was successful - if not, return a score of 0 if source_threshold == 'fit_fail': - return Score([0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) + return Score([0., 0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) else: raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') @@ -80,35 +164,16 @@ def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]]) -> Score: - """ - Selects the appropriate ceiling to be computed from target assembly data. - - :param assembly: the human PropertyAssembly containing human responses, or a dict containing the - PropertyAssemblies of the ThresholdElevation metric. - :return: Score object with coords center (ceiling) and error (STD) - """ - if self.scoring == 'pool': - return self.pool_ceiling(assembly) - elif self.scoring == 'individual': - return self.individual_ceiling(assembly) - else: - raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') - - def pool_ceiling(self, assembly: PropertyAssembly): - # Still not super sure what a logical pooled ceiling here is - some split-half procedure like in - # 'https://github.com/brain-score/brain-score/blob/ - # 9fbf4eda24d081c0ec7bc4d7b5572d8c13dc92d2/brainscore/metrics/image_level_behavior.py#L92' - # likely makes sense, but is quite problematic with the small amount of target data available in most - # thresholding studies. - raise NotImplementedError - - def individual_ceiling(self, assembly: PropertyAssembly): """ Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against a pool of the other values. Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. + + :param assembly: the human PropertyAssembly containing human responses, or a dict containing the + PropertyAssemblies of the ThresholdElevation metric. 
+ :return: Score object with coords center (ceiling) and error (STD) """ human_thresholds: list = assembly.values.tolist() scores = [] @@ -116,7 +181,7 @@ def individual_ceiling(self, assembly: PropertyAssembly): random_state = np.random.RandomState(i) random_human_score = random_state.choice(human_thresholds, replace=False) metric = Threshold(self._independent_variable, self.fit_function, self.fit_inverse_function, - self.threshold_accuracy) + self.threshold_accuracy, scoring=self.scoring) human_thresholds.remove(random_human_score) score = metric(random_human_score, human_thresholds) score = float(score[(score['aggregation'] == 'center')].values) @@ -127,13 +192,17 @@ def individual_ceiling(self, assembly: PropertyAssembly): ceiling = Score([ceiling, ceiling_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) return ceiling - def compute_threshold(self, source: BehavioralAssembly, independent_variable: str) -> float: + def compute_threshold(self, source: BehavioralAssembly, independent_variable: str) -> Union[float, str]: """Converts the source BehavioralAssembly to a threshold float value.""" assert len(source.values) == len(source[independent_variable].values) x_points = source[independent_variable].values accuracies = self.convert_proba_to_correct(source) - fit_params = self.fit_threshold_function(x_points, accuracies) + if np.mean(accuracies) < self.required_accuracy: + print('Psychometric threshold fit failure due to low accuracy.') + fit_params = 'fit_fail' + else: + fit_params, measurement_max = self.fit_threshold_function(x_points, accuracies) if (type(fit_params) == str) and (fit_params == 'fit_fail'): return fit_params @@ -141,6 +210,11 @@ def compute_threshold(self, source: BehavioralAssembly, independent_variable: st self.threshold_accuracy = self.inflection_accuracy(x_points, fit_params) threshold = self.find_threshold(self.threshold_accuracy, fit_params) + + # check whether the fit is outside the measured model responses to discard spurious thresholds + if (threshold > measurement_max) or np.isnan(threshold): + print('Fit fail because threshold is outside of the measured range of responses.') + return 'fit_fail' return threshold def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Union[np.array, str]: @@ -151,16 +225,29 @@ def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Unio Returns either the fit parameters for self.fit_function or a string tag that indicates the failure of the psychometric curve fit. """ - initial_guess = [np.mean(x_points), np.mean(x_points)] - try: - fit = curve_fit(self.fit_function, x_points, y_points, p0=initial_guess) - # curve_fit returns a ndarray of which the 0th element are the optimized parameters - params = fit[0].flatten() - return params - except RuntimeError: - print('Model threshold fit unsuccessful. This is likely because of the model outputting the same value ' - 'for every input.') - return 'fit_fail' + x_points, y_points = self.aggregate_psychometric_fit_data(x_points, y_points) + aggregated_x_points, aggregated_y_points, at_least_third_remaining = self.remove_data_after_asymptote(x_points, + y_points) + measurement_max = np.max(aggregated_x_points) + if not at_least_third_remaining: + # This failure indicates that there is too little data to accurately fit the psychometric function. 
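+            # (illustrative case with hypothetical accuracies [0.95, 0.60, 0.55, 0.50]: performance peaks at the
+            # very first x-value and every later point lies more than one standard deviation below that peak, so
+            # remove_data_after_asymptote keeps only 1 of the 4 points - less than the required third)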
+ print('Psychometric curve fit fail because performance is decreasing with the independent variable.') + return 'fit_fail', measurement_max + + params = grid_search(aggregated_x_points, aggregated_y_points) + # if all the fits in the grid search failed, there will be a None value in params. In this case, we reject + # the fit. This typically only ever happens when a model outputs one value for all test images. + if None in params: + params = 'fit_fail' + + if self.plot_fit: + self.plot_fit_(x_points, + aggregated_x_points, + y_points, + aggregated_y_points, + params, + fit_function=self.fit_function) + return params, measurement_max def find_threshold(self, threshold_accuracy: float, fit_params: Tuple[float, ...]) -> float: """ @@ -180,6 +267,39 @@ def inflection_accuracy(self, x_points: np.array, fit_params: np.array) -> float threshold_accuracy = min_fit_accuracy + (max_fit_accuracy - min_fit_accuracy) / 2 return threshold_accuracy + def plot_fit_(self, x_points, x_points_removed, y_points, y_points_removed, fit_params, fit_function): + # Create a dense set of x values for plotting the fitted curve + x_dense = np.linspace(min(x_points), max(x_points), 1000) + # Calculate the corresponding y values using the fit function and parameters + y_dense = fit_function(x_dense, *fit_params) + + # Plot the original data points + plt.scatter(x_points, y_points, label='Before asymptote removal', + marker='o', color='blue', alpha=0.5) + plt.scatter(x_points_removed, y_points_removed, label='After asymptote removal', + marker='o', color='red', alpha=0.5) + + # Plot the fitted curve + plt.plot(x_dense, y_dense, label='Fitted curve', color='red', linewidth=2) + + # Add labels and a legend + plt.xlabel(self._independent_variable) + plt.ylabel('Accuracy') + plt.legend() + plt.show() + + @staticmethod + def aggregate_psychometric_fit_data(x_points, y_points): + unique_x = np.unique(x_points) + correct_rate = np.zeros(len(unique_x)) + + for i, x in enumerate(unique_x): + trials = np.sum(x_points == x) + correct_trials = np.sum((x_points == x) & (y_points == 1)) + correct_rate[i] = correct_trials / trials + + return unique_x, correct_rate + @staticmethod def individual_score(source: float, target: Union[list, PropertyAssembly]) -> Score: """ @@ -189,7 +309,7 @@ def individual_score(source: float, target: Union[list, PropertyAssembly]) -> Sc """ raw_scores = [] for target_value in target: - raw_score = max((1 - ((np.abs(target_value - source)) / target_value)), 0) + raw_score = max((1 - ((np.abs(target_value - source)) / (target_value + source))), 0) raw_scores.append(raw_score) raw_score, model_error = np.mean(raw_scores), np.std(raw_scores) @@ -207,8 +327,8 @@ def pool_score(source: float, target: Union[list, PropertyAssembly]) -> Score: target_mean = np.mean(target.values) else: target_mean = np.mean(target) - raw_score = max((1 - ((np.abs(target_mean - source)) / target_mean)), 0) - raw_score = Score([raw_score], coords={'aggregation': ['center', ]}, dims=['aggregation']) + raw_score = max((1 - ((np.abs(target_mean - source)) / (target_mean + source))), 0) + raw_score = Score([raw_score], coords={'aggregation': ['center']}, dims=['aggregation']) return raw_score @staticmethod @@ -223,6 +343,37 @@ def convert_proba_to_correct(source: BehavioralAssembly) -> np.array: correct.append(0) return np.array(correct) + @staticmethod + def remove_data_after_asymptote(x_values, y_values): + # Compute the standard deviation of y_values + std_dev = np.std(y_values) + + # Find the index of the maximum y_value + 
max_y_idx = np.argmax(y_values) + + # Initialize the index for the first data point after the maximum y_value + # that deviates from the maximum by at least 1 standard deviation + index_to_remove = None + + # Iterate through the y_values after the maximum y_value + for idx, y in enumerate(y_values[max_y_idx + 1:], start=max_y_idx + 1): + # Check if all the remaining y_values deviate by at least 1 standard deviation + if all([abs(val - y_values[max_y_idx]) >= std_dev for val in y_values[idx:]]): + index_to_remove = idx + break + + pre_remove_length = len(y_values) + # If we found an index to remove, remove the data after that index + if index_to_remove is not None: + x_values = x_values[:index_to_remove] + y_values = y_values[:index_to_remove] + + # Check if at least a third of the elements remain + remaining_fraction = len(y_values) / pre_remove_length + is_at_least_third_remaining = remaining_fraction >= 1 / 3 + + return x_values, y_values, is_at_least_third_remaining + class ThresholdElevation(Threshold): """ @@ -236,7 +387,10 @@ def __init__(self, baseline_condition: str, test_condition: str, threshold_accuracy: Union[str, float] = 'inflection', - scoring: str = 'individual' + scoring: str = 'pool', + max_fit_params: Optional[Tuple[float, ...]] = None, + required_baseline_accuracy: Optional[float] = 0.6, + required_test_accuracy: Optional[float] = 0.0 ): """ :param independent_variable: The independent variable in the benchmark that the threshold is computed @@ -253,9 +407,13 @@ def __init__(self, """ super(ThresholdElevation, self).__init__(independent_variable) self.baseline_threshold_metric = Threshold(self._independent_variable, - threshold_accuracy=threshold_accuracy) + threshold_accuracy=threshold_accuracy, + max_fit_params=max_fit_params, + required_accuracy=required_baseline_accuracy) self.test_threshold_metric = Threshold(self._independent_variable, - threshold_accuracy=threshold_accuracy) + threshold_accuracy=threshold_accuracy, + max_fit_params=max_fit_params, + required_accuracy=required_test_accuracy) self.baseline_condition = baseline_condition self.test_condition = test_condition self.threshold_accuracy = threshold_accuracy @@ -280,12 +438,15 @@ def __call__(self, elif isinstance(source, Dict): source_baseline_threshold = self.baseline_threshold_metric.compute_threshold(source[self.baseline_condition], self._independent_variable) + # if using the inflection accuracy, get the inflection point from the baseline condition, and use that # for the test condition. 
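+            # (for illustration, with hypothetical values: a baseline fit spanning accuracies 0.50-0.97 gives an
+            # inflection accuracy of 0.50 + (0.97 - 0.50) / 2 = 0.735, and both thresholds are then read out at
+            # that same accuracy so that the threshold elevation ratio is well-defined)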
if self.threshold_accuracy == 'inflection': self.test_threshold_metric.threshold_accuracy = self.baseline_threshold_metric.threshold_accuracy source_test_threshold = self.test_threshold_metric.compute_threshold(source[self.test_condition], self._independent_variable) + if source_baseline_threshold == 'fit_fail' or source_test_threshold == 'fit_fail': + return Score([0., 0.], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) raw_source_threshold_elevation = source_test_threshold / source_baseline_threshold else: raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') @@ -306,15 +467,7 @@ def __call__(self, else: raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') - def pool_ceiling(self, assemblies: Dict[str, PropertyAssembly]): - # Still not super sure what a logical pooled ceiling here is - some split-half procedure like in - # 'https://github.com/brain-score/brain-score/blob/ - # 9fbf4eda24d081c0ec7bc4d7b5572d8c13dc92d2/brainscore/metrics/image_level_behavior.py#L92' - # likely makes sense, but is quite problematic with the small amount of target data available in most - # thresholding studies. - raise NotImplementedError - - def individual_ceiling(self, assemblies: Dict[str, PropertyAssembly]): + def ceiling(self, assemblies: Dict[str, PropertyAssembly]) -> Score: """ Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against a pool of the other values. @@ -328,7 +481,7 @@ def individual_ceiling(self, assemblies: Dict[str, PropertyAssembly]): random_state = np.random.RandomState(i) random_human_score = random_state.choice(human_threshold_elevations, replace=False) metric = ThresholdElevation(self._independent_variable, self.baseline_condition, self.test_condition, - self.threshold_accuracy, self.scoring) + self.threshold_accuracy, scoring=self.scoring) human_threshold_elevations.remove(random_human_score) score = metric(random_human_score, human_threshold_elevations) score = float(score[(score['aggregation'] == 'center')].values) From 161eb67a0a9ba795edd256e456066f036adfecf5 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 31 May 2023 14:31:06 +0200 Subject: [PATCH 10/65] Standardized vernier train and test set sizes --- tests/test_stimuli.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/test_stimuli.py b/tests/test_stimuli.py index a0ac0da8e..c3c4cd06b 100644 --- a/tests/test_stimuli.py +++ b/tests/test_stimuli.py @@ -298,24 +298,24 @@ def test_stimulus_set_exist(self, identifier): # test the number of images @pytest.mark.parametrize('identifier, num_images', [ - ('short-2', 588), - ('short-4', 588), - ('short-6', 588), - ('short-8', 588), - ('short-16', 588), - ('equal-2', 588), - ('long-2', 588), - ('equal-16', 588), - ('long-16', 588), - ('short-2_fit', 432), - ('short-4_fit', 432), - ('short-6_fit', 432), - ('short-8_fit', 432), - ('short-16_fit', 432), - ('equal-2_fit', 432), - ('long-2_fit', 432), - ('equal-16_fit', 432), - ('long-16_fit', 432), + ('short-2', 1225), + ('short-4', 1225), + ('short-6', 1225), + ('short-8', 1225), + ('short-16', 1225), + ('equal-2', 1225), + ('long-2', 1225), + ('equal-16', 1225), + ('long-16', 1225), + ('short-2_fit', 1225), + ('short-4_fit', 1225), + ('short-6_fit', 1225), + ('short-8_fit', 1225), + ('short-16_fit', 1225), + ('equal-2_fit', 1225), + ('long-2_fit', 1225), + ('equal-16_fit', 1225), + ('long-16_fit', 1225), ]) def 
test_num_images(self, identifier, num_images):
         stimulus_set = brainscore.get_stimulus_set(f"Malania2007_{identifier}")

From 2e0fc3c3a1cf2d9a39a9fef1cc7e7b091f09959a Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Thu, 8 Jun 2023 12:13:44 +0200
Subject: [PATCH 11/65] add goodness of fit check to discard fits to random responses

---
 brainscore/metrics/threshold.py | 44 +++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py
index 542a71151..1c831e724 100644
--- a/brainscore/metrics/threshold.py
+++ b/brainscore/metrics/threshold.py
@@ -3,6 +3,7 @@
 import numpy as np
 from scipy.optimize import minimize
 from scipy.stats import norm
+from sklearn.metrics import mean_squared_error, r2_score

 from brainscore.metrics import Metric, Score
 from brainio.assemblies import PropertyAssembly, BehavioralAssembly
@@ -35,19 +36,29 @@ def wichmann_neg_log_likelihood(params: Tuple[float, ...], x: np.array, y: np.array) -> float:
     """The negative log likelihood function for wichmann_cum_gauss."""
     alpha, beta, lambda_ = params
     p = wichmann_cum_gauss(x, alpha, beta, lambda_)
-    logL = y * np.log(p) + (1 - y) * np.log(1 - p)
-    return -np.sum(logL)
+    log_likelihood = y * np.log(p) + (1 - y) * np.log(1 - p)
+    return -np.sum(log_likelihood)
+
+
+def get_predicted(params: Tuple[float, ...], x: np.array, fit_fn: Callable) -> np.array:
+    """Returns the predicted values based on the model parameters."""
+    return fit_fn(x, *params)


 def grid_search(x: np.array,
                 y: np.array,
                 alpha_values: np.array = np.logspace(-3, 1, 50),
                 beta_values: np.array = None,
+                fit_fn: Callable = wichmann_cum_gauss,
                 fit_log_likelihood_fn: Callable = wichmann_neg_log_likelihood,
-                fit_bounds: Tuple = ((None, None), (None, None), (0.03, 0.5))) -> Tuple[float, ...]:
+                fit_bounds: Tuple = ((None, None), (None, None), (0.03, 0.5))
+                ) -> Tuple[Tuple[float, ...], float]:
     """
     A classic simplified procedure for running sparse grid search over the slope and mean parameters of the
     psychometric function.
+    This function is implemented here instead of using sklearn.GridSearchCV since we would have to make a custom
+    sklearn estimator class to use GridSearchCV with psychometric functions, likely increasing code bloat
+    substantially.

     Parameters
     ----------
     x: the independent variables of the data
     y: the measured accuracy rates for the given x-values
     alpha_values: the alpha values for the chosen fit function to grid search over
     beta_values: the beta values for the chosen fit function to grid search over
+    fit_fn: the psychometric function that is fit
     fit_log_likelihood_fn: the log likelihood function that computes the log likelihood of its corresponding
         fit function
@@ -81,9 +93,10 @@ def grid_search(x: np.array,
             initial_guess = np.array([alpha_guess, beta_guess, 1 - np.max(y)])  # lapse rate guess set to 1 minus the maximum accuracy

             # wrap inside a RuntimeError block to catch the RuntimeError thrown by scipy.minimize if a fit
-            # entirely fails.
+            # entirely fails. The case where all fits fail here is handled by the Threshold metric.
try: - result = minimize(fit_log_likelihood_fn, initial_guess, args=(x, y), method='L-BFGS-B', bounds=fit_bounds) + result = minimize(fit_log_likelihood_fn, initial_guess, args=(x, y), + method='L-BFGS-B', bounds=fit_bounds) alpha_hat, beta_hat, lambda_hat = result.x neg_log_likelihood_hat = fit_log_likelihood_fn([alpha_hat, beta_hat, lambda_hat], x, y) @@ -93,7 +106,10 @@ def grid_search(x: np.array, except RuntimeError: pass - return best_alpha, best_beta, best_lambda + y_pred = fit_fn(x, best_alpha, best_beta, best_lambda) + mse = mean_squared_error(y, y_pred) + r2 = r2_score(y, y_pred) + return (best_alpha, best_beta, best_lambda), r2 class Threshold(Metric): @@ -234,7 +250,12 @@ def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Unio print('Psychometric curve fit fail because performance is decreasing with the independent variable.') return 'fit_fail', measurement_max - params = grid_search(aggregated_x_points, aggregated_y_points) + params, r2 = grid_search(aggregated_x_points, aggregated_y_points) + + # remove fits to random data + if r2 < 0.4: + params = 'fit_fail' + # if all the fits in the grid search failed, there will be a None value in params. In this case, we reject # the fit. This typically only ever happens when a model outputs one value for all test images. if None in params: @@ -390,7 +411,8 @@ def __init__(self, scoring: str = 'pool', max_fit_params: Optional[Tuple[float, ...]] = None, required_baseline_accuracy: Optional[float] = 0.6, - required_test_accuracy: Optional[float] = 0.0 + required_test_accuracy: Optional[float] = 0.6, + plot_fit: bool = False ): """ :param independent_variable: The independent variable in the benchmark that the threshold is computed @@ -409,11 +431,13 @@ def __init__(self, self.baseline_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy, max_fit_params=max_fit_params, - required_accuracy=required_baseline_accuracy) + required_accuracy=required_baseline_accuracy, + plot_fit=plot_fit) self.test_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy, max_fit_params=max_fit_params, - required_accuracy=required_test_accuracy) + required_accuracy=required_test_accuracy, + plot_fit=plot_fit) self.baseline_condition = baseline_condition self.test_condition = test_condition self.threshold_accuracy = threshold_accuracy From ebfc04ad1d6c8eeb7f76c29cc46c167747f6acd5 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 8 Jun 2023 15:11:26 +0200 Subject: [PATCH 12/65] update ceilings to pooled expectations --- tests/test_assemblies.py | 2 +- tests/test_benchmarks/test_malania2007.py | 67 +++++++++++++++++++---- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/tests/test_assemblies.py b/tests/test_assemblies.py index 021c850f4..dbf7e65fb 100644 --- a/tests/test_assemblies.py +++ b/tests/test_assemblies.py @@ -541,5 +541,5 @@ def test_num_subjects(self, identifier, num_subjects): 'subject' ]) def test_fields_present(self, identifier, field): - assembly = brainscore.get_assembly(f"brendel.Geirhos2021_{identifier}") + assembly = brainscore.get_assembly(f"Malania2007_{identifier}") assert hasattr(assembly, field) diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py index 237ecaea8..6aac00721 100644 --- a/tests/test_benchmarks/test_malania2007.py +++ b/tests/test_benchmarks/test_malania2007.py @@ -24,21 +24,68 @@ def test_mean_ceiling(self): benchmarks = [benchmark_pool[benchmark] for 
benchmark in benchmarks]
         ceilings = [benchmark.ceiling.sel(aggregation='center') for benchmark in benchmarks]
         mean_ceiling = np.mean(ceilings)
-        assert mean_ceiling == approx(0.5618048355142616, abs=0.001)
+        assert mean_ceiling == approx(0.7724487108297781, abs=0.001)  # TODO: check that this is correct

+    # these test values are for the pooled score ceiling
     @pytest.mark.parametrize('dataset, expected_ceiling', [
-        ('short-2', approx(0.78719345, abs=0.001)),
-        ('short-4', approx(0.49998989, abs=0.001)),
-        ('short-6', approx(0.50590051, abs=0.001)),
-        ('short-8', approx(0.4426336, abs=0.001)),
-        ('short-16', approx(0.8383443, abs=0.001)),
-        ('equal-2', approx(0.56664015, abs=0.001)),
-        ('long-2', approx(0.46470421, abs=0.001)),
-        ('equal-16', approx(0.44087153, abs=0.001)),
-        ('long-16', approx(0.50996587, abs=0.001))
+        ('short-2', approx(0.82203635, abs=0.001)),
+        ('short-4', approx(0.78841608, abs=0.001)),
+        ('short-6', approx(0.80555853, abs=0.001)),
+        ('short-8', approx(0.7866628, abs=0.001)),
+        ('short-16', approx(0.90941085, abs=0.001)),
+        ('equal-2', approx(0.77990816, abs=0.001)),
+        ('long-2', approx(0.72215817, abs=0.001)),
+        ('equal-16', approx(0.62778544, abs=0.001)),
+        ('long-16', approx(0.71010202, abs=0.001))
     ])
     def test_dataset_ceiling(self, dataset, expected_ceiling):
         benchmark = f"Malania2007_{dataset.replace('-', '')}"
         benchmark = benchmark_pool[benchmark]
         ceiling = benchmark.ceiling
         assert ceiling.sel(aggregation='center').values.item() == expected_ceiling
+
+    @pytest.mark.parametrize('dataset, model, expected_raw_score', [
+        ('short-2', 'resnet-18', approx(0., abs=0.001)),
+        ('short-4', 'resnet-18', approx(0., abs=0.001)),
+        ('short-6', 'resnet-18', approx(0., abs=0.001)),
+        ('short-8', 'resnet-18', approx(0., abs=0.001)),
+        ('short-16', 'resnet-18', approx(0., abs=0.001)),
+        ('equal-2', 'resnet-18', approx(0., abs=0.001)),
+        ('long-2', 'resnet-18', approx(0., abs=0.001)),
+        ('equal-16', 'resnet-18', approx(0., abs=0.001)),
+        ('long-16', 'resnet-18', approx(0., abs=0.001)),
+    ])
+    def test_model_8degrees(self, dataset, model, expected_raw_score):
+        benchmark = benchmark_pool[f"Malania_{dataset.replace('-', '')}"]
+        # load features
+        precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc'
+        precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features)
+        precomputed_features = PrecomputedFeatures(precomputed_features,
+                                                   visual_degrees=8,  # doesn't matter, features are already computed
+                                                   )
+        # score
+        score = benchmark(precomputed_features).raw
+        assert score == expected_raw_score
+
+    @pytest.mark.parametrize('dataset, model, expected_raw_score', [
+        ('short-2', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('short-4', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('short-6', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('short-8', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('short-16', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('equal-2', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('long-2', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('equal-16', 'resnet-18-3deg', approx(0., abs=0.001)),
+        ('long-16', 'resnet-18-3deg', approx(0., abs=0.001)),
+    ])
+    def test_model_3degrees(self, dataset, model, expected_raw_score):
+        benchmark = benchmark_pool[f"Malania_{dataset.replace('-', '')}"]
+        # load features
+        precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc'
+        precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features)
+        precomputed_features = 
PrecomputedFeatures(precomputed_features, + visual_degrees=3, # doesn't matter, features are already computed + ) + # score + score = benchmark(precomputed_features).raw + assert score == expected_raw_score From 52b55e52fc97340f9f9f2d45077116744220542b Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 9 Jun 2023 09:03:31 +0200 Subject: [PATCH 13/65] packaged commit --- brainscore/benchmarks/malania2007.py | 23 +++++++----- brainscore/metrics/threshold.py | 11 ++---- .../malania2007/malania_data_assembly.py | 35 ++++--------------- packaging/malania2007/malania_stimulus_set.py | 28 ++++++++++----- tests/test_stimuli.py | 6 +++- 5 files changed, 47 insertions(+), 56 deletions(-) diff --git a/brainscore/benchmarks/malania2007.py b/brainscore/benchmarks/malania2007.py index f1c75d5d1..5657d28b0 100644 --- a/brainscore/benchmarks/malania2007.py +++ b/brainscore/benchmarks/malania2007.py @@ -10,6 +10,7 @@ from brainscore.model_interface import BrainModel from brainscore.utils import LazyLoad + BIBTEX = """@article{malania2007, author = {Malania, Maka and Herzog, Michael H. and Westheimer, Gerald}, title = "{Grouping of contextual elements that affect vernier thresholds}", @@ -31,7 +32,6 @@ 'short-16': 16, 'equal-2': 2, 'long-2': 2, 'equal-16': 16, 'long-16': 16, 'vernier-only': 0} - for dataset in DATASETS: # behavioral benchmark identifier = f"Malania_{dataset.replace('-', '')}" @@ -56,7 +56,7 @@ class _Malania2007Base(BenchmarkBase): Benchmark Choices: 1) The number and type of fitting stimuli are unfounded choices. Currently, the number of fitting stimuli is chosen - to be relatively small, but sufficient for good decoding performance in the baseline condition. + to be relatively small, but sufficient for good decoding performance in the baseline condition in general. - Precisely faithful alternative: Present text instructions to models as they were presented to humans * Why not this alternative? Since the experiment is about early visual perception, and there are currently few/no models capable of a task like this, it would not be interesting. 
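+
+    For intuition, the pooled comparison downstream of this benchmark reduces to a symmetric relative distance
+    between the model's and the humans' threshold elevations; a minimal sketch with hypothetical numbers
+    (mirroring the pool_score used by the ThresholdElevation metric):
+
+        import numpy as np
+        human_elevations = [2.1, 2.8, 3.0, 2.5, 2.6]  # test threshold / baseline threshold, per subject
+        model_elevation = 4.0
+        target_mean = np.mean(human_elevations)  # pooled human threshold elevation
+        score = max(1 - abs(target_mean - model_elevation) / (target_mean + model_elevation), 0)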
@@ -87,7 +87,11 @@ def __init__(self, condition: str): self._assemblies = {'baseline_assembly': self._baseline_assembly, 'condition_assembly': self._assembly} - self._fitting_stimuli = load_assembly(f'{self.condition}_fit') + self._stimulus_set = brainscore.get_stimulus_set(f'{self.condition}') + self._baseline_stimulus_set = brainscore.get_stimulus_set(f'{self.baseline_condition}') + self._stimulus_sets = {self.condition: self._stimulus_set, + self.baseline_condition: self._baseline_stimulus_set} + self._fitting_stimuli = brainscore.get_stimulus_set(f'{self.condition}_fit') self._metric = ThresholdElevation(independent_variable='vernier_offset', baseline_condition=self.baseline_condition, @@ -95,26 +99,27 @@ def __init__(self, condition: str): threshold_accuracy=0.75) self._ceiling = self._metric.ceiling(self._assemblies) - self._visual_degrees = 2.66667 + self._visual_degrees = 2.986667 self._number_of_trials = 1 super(_Malania2007Base, self).__init__( identifier=f'Malania2007_{condition}', version=1, - ceiling_func=lambda: self._metric.ceiling(self._assembly), + ceiling_func=lambda: self._ceiling, parent='Malania2007', bibtex=BIBTEX) def __call__(self, candidate: BrainModel): - model_response = {} + model_responses = {} candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli) for condition in (self.baseline_condition, self.condition): - model_response[condition] = place_on_screen( - self._assembly.stimulus_set.sel(num_flankers=NUM_FLANKERS_PER_CONDITION[condition]), + stimulus_set = place_on_screen( + self._stimulus_sets[condition], target_visual_degrees=candidate.visual_degrees(), source_visual_degrees=self._visual_degrees ) + model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials) - raw_score = self._metric(model_response, self._assemblies) + raw_score = self._metric(model_responses, self._assemblies) # Adjust score to ceiling ceiling = self._ceiling diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py index 1c831e724..ee9cd07f3 100644 --- a/brainscore/metrics/threshold.py +++ b/brainscore/metrics/threshold.py @@ -1,15 +1,13 @@ from typing import Dict, Union, Tuple, Optional, Callable - import numpy as np from scipy.optimize import minimize from scipy.stats import norm from sklearn.metrics import mean_squared_error, r2_score +import matplotlib.pyplot as plt from brainscore.metrics import Metric, Score from brainio.assemblies import PropertyAssembly, BehavioralAssembly -import matplotlib.pyplot as plt - def wichmann_cum_gauss(x: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> float: """ @@ -107,7 +105,6 @@ def grid_search(x: np.array, pass y_pred = fit_fn(x, best_alpha, best_beta, best_lambda) - mse = mean_squared_error(y, y_pred) r2 = r2_score(y, y_pred) return (best_alpha, best_beta, best_lambda), r2 @@ -126,7 +123,6 @@ def __init__(self, fit_inverse_function=inverse_wichmann_cum_gauss, threshold_accuracy: Union[str, float] = 'inflection', scoring: str = 'pool', - max_fit_params: Optional[Tuple[float, ...]] = None, required_accuracy: Optional[float] = 0.6, plot_fit: bool = False ): @@ -148,7 +144,6 @@ def __init__(self, self._independent_variable = independent_variable self.threshold_accuracy = threshold_accuracy self.scoring = scoring - self.max_fit_params = max_fit_params self.required_accuracy = required_accuracy self.plot_fit = plot_fit @@ -254,6 +249,7 @@ def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Unio # remove 
fits to random data if r2 < 0.4: + print('Fit fail due to low fit R^2.') params = 'fit_fail' # if all the fits in the grid search failed, there will be a None value in params. In this case, we reject @@ -409,7 +405,6 @@ def __init__(self, test_condition: str, threshold_accuracy: Union[str, float] = 'inflection', scoring: str = 'pool', - max_fit_params: Optional[Tuple[float, ...]] = None, required_baseline_accuracy: Optional[float] = 0.6, required_test_accuracy: Optional[float] = 0.6, plot_fit: bool = False @@ -430,12 +425,10 @@ def __init__(self, super(ThresholdElevation, self).__init__(independent_variable) self.baseline_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy, - max_fit_params=max_fit_params, required_accuracy=required_baseline_accuracy, plot_fit=plot_fit) self.test_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy, - max_fit_params=max_fit_params, required_accuracy=required_test_accuracy, plot_fit=plot_fit) self.baseline_condition = baseline_condition diff --git a/packaging/malania2007/malania_data_assembly.py b/packaging/malania2007/malania_data_assembly.py index 756702cbf..cfe9f9a8c 100644 --- a/packaging/malania2007/malania_data_assembly.py +++ b/packaging/malania2007/malania_data_assembly.py @@ -24,7 +24,10 @@ def collect_malania_data_assembly(root_directory, dataset): """ Experiment Information: - ... todo + - 5-6 observers per condition (for exact value, see NUM_SUBJECTS) + - 2AFC left/right offset discrimination task + - PEST staircase to 75% correct responses + - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit """ # construct the assembly metadata_directory = Path(f'{root_directory}/{dataset}/metadata_human.xlsx') @@ -68,35 +71,11 @@ def remove_subjects_with_nans(assembly1, assembly2): return filtered_assembly1, filtered_assembly2 -# def get_local_ceilings(): -# from brainscore.metrics.threshold import ThresholdElevation -# ceilings = {} -# for dataset in DATASETS: -# baseline_assembly = return_local_data_assembly('vernier-only') -# condition_assembly = return_local_data_assembly(dataset) -# -# condition_assembly, baseline_assembly = remove_subjects_with_nans(condition_assembly, baseline_assembly) -# -# assemblies = {'baseline_assembly': baseline_assembly, -# 'condition_assembly': condition_assembly} -# metric = ThresholdElevation(independent_variable='vernier_offset', -# baseline_condition='vernier-only', -# test_condition=dataset, -# threshold_accuracy=0.75) -# ceiling = metric.individual_ceiling(assemblies) -# ceilings[dataset] = ceiling -# print(ceilings) -# # compute the average ceiling for every condition except the baseline-baseline condition -# mean = np.mean([xarray.values[0] for xarray in ceilings.values()][:-1]) -# print(mean) - - if __name__ == '__main__': - # get_local_ceilings() root_directory = Path(r'./malania2007_data_assembly') for dataset in DATASETS: assembly = collect_malania_data_assembly(root_directory, dataset) # upload to S3 - # package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, - # stimulus_set_identifier=f"Malania2007_{dataset}", - # assembly_class="BehavioralAssembly", bucket_name="brainio-brainscore") \ No newline at end of file + #package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, + # stimulus_set_identifier=f"Malania2007_{dataset}", + # assembly_class_name="BehavioralAssembly", bucket_name="brainio-brainscore") \ No newline at end of 
file
diff --git a/packaging/malania2007/malania_stimulus_set.py b/packaging/malania2007/malania_stimulus_set.py
index 70d0e451f..72726fd12 100644
--- a/packaging/malania2007/malania_stimulus_set.py
+++ b/packaging/malania2007/malania_stimulus_set.py
@@ -9,16 +9,27 @@
             'long-2', 'equal-16', 'long-16', 'vernier-only',
             'short-2_fit', 'short-4_fit', 'short-6_fit', 'short-8_fit',
             'short-16_fit', 'equal-2_fit', 'long-2_fit', 'equal-16_fit', 'long-16_fit']
-DATASET_LENGTHS = {'test': 588, 'fit': 432}
+DATASET_LENGTHS = {'test': 1225, 'fit': 1225}


 def collect_malania_stimulus_set(root_directory, dataset):
     """
     Dataset Meta Info
-    ... todo
+
+    Reported in pixels:
+        - image_size_x; image_size_y
+        - vernier_position_x; vernier_position_y
+
+    Reported in arcsec:
+        - vernier_height (combined height of the two vernier elements, excluding the middle gap)
+        - vernier_offset (horizontal offset between the two vernier elements)
+        - flanker_height (height of the flanker elements)
+        - flanker_spacing (distance between a flanker element and another flanker element)
+        - line_width (width of all the lines in all elements)
+        - flanker_distance (distance between a flanker and a vernier)
     """
     stimuli = []
-    image_paths = {}
+    stimulus_paths = {}
     dataset_type = 'fit' if dataset[-3:] == 'fit' else 'test'

     metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv')
@@ -41,13 +52,12 @@
             'num_flankers': int(row['num_flankers']),
             'vernier_position_x': int(row['vernier_position_x']),
             'vernier_position_y': int(row['vernier_position_y']),
-            'filename': row['filename'],
-            'stimulus_id': int(row['stimulus_id'])
+            'stimulus_id': str(row['stimulus_id']),
         })
-        image_paths[int(row['stimulus_id'])] = Path(f'{image_directory}/{row["filename"]}')
+        stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}')

     stimuli = StimulusSet(stimuli)
-    stimuli.image_paths = image_paths
+    stimuli.stimulus_paths = stimulus_paths
     stimuli.name = f'Malania2007_{dataset}'  # give the StimulusSet an identifier name

     # Ensure expected number of stimuli in datasets
@@ -67,5 +77,5 @@ def return_local_stimulus_set(dataset):
         stimuli = collect_malania_stimulus_set(root_directory, stimulus_set)

         # upload to S3
-        # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name,
-        #                      bucket_name="brainio-brainscore")
+        #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name,
+        #                     bucket_name="brainio-brainscore")
diff --git a/tests/test_stimuli.py b/tests/test_stimuli.py
index e3def80dc..a458882f2 100644
--- a/tests/test_stimuli.py
+++ b/tests/test_stimuli.py
@@ -266,6 +266,7 @@ def test_fields_present3(self, identifier, field):
     stimulus_set = brainscore.get_assembly(f"brendel.Geirhos2021_{identifier}")
     assert hasattr(stimulus_set, field)

+
 @pytest.mark.slow
 class TestMalania2007:
     # test stimulus_set data:
@@ -288,6 +289,7 @@ class TestMalania2007:
         'long-2_fit',
         'equal-16_fit',
         'long-16_fit',
+        'vernier-only'
     ])
     def test_stimulus_set_exist(self, identifier):
         full_name = f"Malania2007_{identifier}"
@@ -315,6 +317,7 @@ def test_stimulus_set_exist(self, identifier):
         ('long-2_fit', 1225),
         ('equal-16_fit', 1225),
         ('long-16_fit', 1225),
+        ('vernier-only', 1225)
     ])
     def test_num_images(self, identifier, num_images):
         stimulus_set = brainscore.get_stimulus_set(f"Malania2007_{identifier}")
@@ -340,6 +343,7 @@ def test_num_images(self, identifier, num_images):
         'long-2_fit',
         'equal-16_fit',
         'long-16_fit',
+        'vernier-only'
     ])
     @pytest.mark.parametrize('field', [
'image_size_x', @@ -356,13 +360,13 @@ def test_num_images(self, identifier, num_images): 'num_flankers', 'vernier_position_x', 'vernier_position_y', - 'filename', 'stimulus_id', ]) def test_fields_present(self, identifier, field): stimulus_set = brainscore.get_stimulus_set(f"Malania2007_{identifier}") assert hasattr(stimulus_set, field) + @pytest.mark.private_access def test_Islam2021(): stimulus_set = brainio.get_stimulus_set('Islam2021') From 2dff67d38f0cc99cedd9b8afb8e9f04020c61b3a Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 9 Jun 2023 10:18:59 +0200 Subject: [PATCH 14/65] add Malania2007 entries to lookup.csv --- brainscore/lookup.csv | 48 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/brainscore/lookup.csv b/brainscore/lookup.csv index 4a51f919c..f88c5c736 100644 --- a/brainscore/lookup.csv +++ b/brainscore/lookup.csv @@ -170,3 +170,51 @@ katz.BarbuMayo2019,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.ama katz.BarbuMayo2019,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/image_katz_BarbuMayo2019.zip,1365eb2a7231516806127a7d2a908343a7ac9464, Islam2021,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_neil_Islam2021.csv,93ab0f6386d8d5fb56640da45980a819c7dd6efc, Islam2021,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_neil_Islam2021.zip,e55b673117a472f463e0705ac3e330ef8dfd938b, +Malania2007_short-2,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-2.csv,612a97945ad127b2902f2aaf1be28ead7d77f21c, +Malania2007_short-2,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-2.zip,536ef580c95f6a0398dcc50de67ffd888a15c45a, +Malania2007_short-4,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-4.csv,461600b6f6c5b2508c1def241dc36a266df661b6, +Malania2007_short-4,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-4.zip,1eea5469f41850ebd2e479463dba3ba7fcd1afb3, +Malania2007_short-6,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-6.csv,efbf2c4b99e4dfcebe967ba873f8bca51e3310fa, +Malania2007_short-6,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-6.zip,82fe54cf1e7c7fa551e77dc67cc97f380a8aea17, +Malania2007_short-8,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-8.csv,34046fa4954ea226ce66c182ac1f8cc68a8a5368, +Malania2007_short-8,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-8.zip,02ed3043bda3d5037695e304d941e59fc01276e8, +Malania2007_short-16,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-16.csv,a0a3873aee38a899b6771a7aa22c5cf20d881e29, +Malania2007_short-16,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-16.zip,2962555520379d273f319acc4bb8ce935625c4d7, +Malania2007_equal-2,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-2.csv,7e98862fd0a20d55cf9e41176cf2d6cc69a03610, +Malania2007_equal-2,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-2.zip,451a4a5427eb80f0f5e3c48bf62d078f4b42114f, 
+Malania2007_long-2,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-2.csv,4963b971310ae3a4a5e8dd1db6e539b14a45f8c2, +Malania2007_long-2,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-2.zip,b1716da4fbb29759e9282e6ef1baf09d2e87b523, +Malania2007_equal-16,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-16.csv,587b5fe584366e634f5d250dee1c4a5601c225f5, +Malania2007_equal-16,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-16.zip,35eaa6ab4e31504308668b7874268b61368e5228, +Malania2007_long-16,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-16.csv,e20661c973363995bc05826de7ec5bd9b6da5f4c, +Malania2007_long-16,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-16.zip,7f09ff0993eb11570db5d92b76ba918b811ccb9c, +Malania2007_vernier-only,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_vernier-only.csv,1a99f79af9cb98a4b60dac3aca5c4c0b5d521a45, +Malania2007_vernier-only,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_vernier-only.zip,e78cd01e2a077e2dd4cda92107f3a9cfbf9247e3, +Malania2007_short-2_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-2_fit.csv,e9a58ec2417aa75a1c2821d271c403f0be2e0a54, +Malania2007_short-2_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-2_fit.zip,686c6404ef5ab02167bb0566d4306da1607a7b93, +Malania2007_short-4_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-4_fit.csv,070c7fbc95d7652f6c16fcb4597f71c483a6e672, +Malania2007_short-4_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-4_fit.zip,32bc78522ee5d562b396e79e98625bc56d312b3f, +Malania2007_short-6_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-6_fit.csv,4795183af995b71a51e02081dfb5e83d96d5c3e5, +Malania2007_short-6_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-6_fit.zip,e0f2a0cec3d6a5cc5a2b73d7a0cc9570b7c0c27a, +Malania2007_short-8_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-8_fit.csv,af78f02db2d5298bd6c194db40ab0c4ba96662e0, +Malania2007_short-8_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-8_fit.zip,b49a1842c47bf6b9a7719dfeeb6a11c69a92d821, +Malania2007_short-16_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-16_fit.csv,3b45d1329694f50241b23393bd6d4ca665a4f1ce, +Malania2007_short-16_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_short-16_fit.zip,dfb769033b53ce45ae301f54018db7895ca0e50e, +Malania2007_equal-2_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-2_fit.csv,371029f3889b6c995e38c9207d411afc1c67f489, +Malania2007_equal-2_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-2_fit.zip,1bfb14afffe7c767a2b582edf3a8a85423b8974c, +Malania2007_long-2_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-2_fit.csv,fadb7d0accd7607858104e30f3a29845101c12a0, 
+Malania2007_long-2_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-2_fit.zip,0107c7d65f62c2deabf8a219a30549d1685ff0b3, +Malania2007_equal-16_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-16_fit.csv,7bd8ef29191c198b15f24d711ba8dcd6dd3f17f3, +Malania2007_equal-16_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_equal-16_fit.zip,18818d083082edf4f7b139094f3d43844c8d5897, +Malania2007_long-16_fit,stimulus_set,StimulusSet,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-16_fit.csv,d9e2fe513fc677c04df36833e170a88c6a3463b5, +Malania2007_long-16_fit,stimulus_set,,S3,https://brainio-brainscore.s3.amazonaws.com/stimulus_Malania2007_long-16_fit.zip,0b23161ee6842acf8ad94ff84028b025c9ae5b3d, +Malania2007_short-2,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_short-2.nc,85fb65ad76de48033c704b9c5689771e1ea0457d,Malania2007_short-2 +Malania2007_short-4,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_short-4.nc,75506be9a26ec38a223e41510f1a8cb32d5b0bc9,Malania2007_short-4 +Malania2007_short-6,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_short-6.nc,2901be6b352e67550da040d79d744819365b8626,Malania2007_short-6 +Malania2007_short-8,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_short-8.nc,6daf47b086cb969d75222e320f49453ed8437885,Malania2007_short-8 +Malania2007_short-16,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_short-16.nc,8ae0898caad718b747f85fce5888416affc3a569,Malania2007_short-16 +Malania2007_equal-2,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_equal-2.nc,277b2fbffed00e16b6a69b488f73eeda5abaaf10,Malania2007_equal-2 +Malania2007_long-2,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_long-2.nc,9076a5b693948c4992b6c8e753f04a7acd2014a1,Malania2007_long-2 +Malania2007_equal-16,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_equal-16.nc,ef49506238e8d2554918b113fbc60c133077186e,Malania2007_equal-16 +Malania2007_long-16,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_long-16.nc,3106cf1f2fa9e66617ebf231df05d29077fc478f,Malania2007_long-16 +Malania2007_vernier-only,assembly,BehavioralAssembly,S3,https://brainio-brainscore.s3.amazonaws.com/assy_Malania2007_vernier-only.nc,1cf83e8b6141f8b0d67ea46994f342325f62001f,Malania2007_vernier-only From f62017c6ab33bc47bcf5179790b1ffc2a13e3d2d Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 9 Jun 2023 12:18:46 +0200 Subject: [PATCH 15/65] add Malania2007 to benchmark init --- brainscore/benchmarks/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/brainscore/benchmarks/__init__.py b/brainscore/benchmarks/__init__.py index a005c05a6..1fbad58a8 100644 --- a/brainscore/benchmarks/__init__.py +++ b/brainscore/benchmarks/__init__.py @@ -216,6 +216,14 @@ def _evaluation_benchmark_pool(): pool[f"brendel.{assembly_identifier}-error_consistency"] = LazyLoad( # use lambda parameter-binding to avoid `benchmark_ctr` being re-assigned in the next loop iteration lambda benchmark_ctr=benchmark_ctr: benchmark_ctr()) + # Malania2007 + from . 
import malania2007 + for dataset in malania2007.DATASETS: + assembly_identifier = f"Malania_{dataset.replace('-', '')}" + benchmark_ctr = getattr(malania2007, f"{assembly_identifier}") + pool[f"{assembly_identifier}"] = LazyLoad( + # use lambda parameter-binding to avoid `benchmark_ctr` being re-assigned in the next loop iteration + lambda benchmark_ctr=benchmark_ctr: benchmark_ctr()) return pool From 87b0b714c067d69a8c83dbb8b0217311b542e903 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 9 Jun 2023 13:19:53 +0200 Subject: [PATCH 16/65] fix indexing error in test___init__ --- tests/test_benchmarks/test___init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_benchmarks/test___init__.py b/tests/test_benchmarks/test___init__.py index 276d6433c..5786cae85 100644 --- a/tests/test_benchmarks/test___init__.py +++ b/tests/test_benchmarks/test___init__.py @@ -96,15 +96,15 @@ def test_exact_evaluation_pool(self): 'brendel.Geirhos2021stylized-error_consistency', 'brendel.Geirhos2021sketch-error_consistency', 'brendel.Geirhos2021uniformnoise-error_consistency', - 'Malania2007_short-2', - 'Malania2007_short-4', - 'Malania2007_short-6', - 'Malania2007_short-8', - 'Malania2007_short-16', - 'Malania2007_equal-2', - 'Malania2007_long-2', - 'Malania2007_equal-16', - 'Malania2007_long-16', + 'Malania2007_short2', + 'Malania2007_short4', + 'Malania2007_short6', + 'Malania2007_short8', + 'Malania2007_short16', + 'Malania2007_equal2', + 'Malania2007_long2', + 'Malania2007_equal16', + 'Malania2007_long16', } def test_engineering_pool(self): From d938c3a29f1ac08b885ead6b83c578478d3e8bac Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 9 Jun 2023 13:56:54 +0200 Subject: [PATCH 17/65] add missing year(s) in identifiers where they were missing --- brainscore/benchmarks/__init__.py | 2 +- brainscore/benchmarks/malania2007.py | 2 +- tests/test_benchmarks/test_malania2007.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/brainscore/benchmarks/__init__.py b/brainscore/benchmarks/__init__.py index 1fbad58a8..7dc972f93 100644 --- a/brainscore/benchmarks/__init__.py +++ b/brainscore/benchmarks/__init__.py @@ -219,7 +219,7 @@ def _evaluation_benchmark_pool(): # Malania2007 from . 
import malania2007 for dataset in malania2007.DATASETS: - assembly_identifier = f"Malania_{dataset.replace('-', '')}" + assembly_identifier = f"Malania2007_{dataset.replace('-', '')}" benchmark_ctr = getattr(malania2007, f"{assembly_identifier}") pool[f"{assembly_identifier}"] = LazyLoad( # use lambda parameter-binding to avoid `benchmark_ctr` being re-assigned in the next loop iteration diff --git a/brainscore/benchmarks/malania2007.py b/brainscore/benchmarks/malania2007.py index 5657d28b0..b6b4f481f 100644 --- a/brainscore/benchmarks/malania2007.py +++ b/brainscore/benchmarks/malania2007.py @@ -34,7 +34,7 @@ for dataset in DATASETS: # behavioral benchmark - identifier = f"Malania_{dataset.replace('-', '')}" + identifier = f"Malania2007_{dataset.replace('-', '')}" globals()[identifier] = lambda dataset=dataset: _Malania2007Base(dataset) diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py index 6aac00721..0a6556560 100644 --- a/tests/test_benchmarks/test_malania2007.py +++ b/tests/test_benchmarks/test_malania2007.py @@ -24,7 +24,7 @@ def test_mean_ceiling(self): benchmarks = [benchmark_pool[benchmark] for benchmark in benchmarks] ceilings = [benchmark.ceiling.sel(aggregation='center') for benchmark in benchmarks] mean_ceiling = np.mean(ceilings) - assert mean_ceiling == approx(0.7724487108297781, abs=0.001) # TODO: check that this is correct + assert mean_ceiling == approx(0.7724487108297781, abs=0.001) # these test values are for the pooled score ceiling @pytest.mark.parametrize('dataset, expected_ceiling', [ @@ -56,7 +56,7 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): ('long-16', 'resnet-18', approx(0., abs=0.001)), ]) def test_model_8degrees(self, dataset, model, expected_raw_score): - benchmark = benchmark_pool[f"Malania_{dataset.replace('-', '')}"] + benchmark = benchmark_pool[f"Malania2007_{dataset.replace('-', '')}"] # load features precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features) @@ -79,7 +79,7 @@ def test_model_8degrees(self, dataset, model, expected_raw_score): ('long-16', 'resnet-18-3deg', approx(0., abs=0.001)), ]) def test_model_8degrees(self, dataset, model, expected_raw_score): - benchmark = benchmark_pool[f"Malania_{dataset.replace('-', '')}"] + benchmark = benchmark_pool[f"Malania2007_{dataset.replace('-', '')}"] # load features precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features) From a71737fbaf8a0bc52988a8a42f345204d669f97b Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 9 Jun 2023 14:28:20 +0200 Subject: [PATCH 18/65] add S3-related tests to private access, as Travis throws a NoCredentialsError on trying to access them --- tests/test_assemblies.py | 1 + tests/test_benchmarks/test_malania2007.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/test_assemblies.py b/tests/test_assemblies.py index dbf7e65fb..a97e8fff9 100644 --- a/tests/test_assemblies.py +++ b/tests/test_assemblies.py @@ -504,6 +504,7 @@ def test_fields_present_cue_conflict(self, identifier, field): assert hasattr(assembly, field) +@pytest.mark.private_access class TestMalania2007: # test the number of subjects: diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py index 0a6556560..e64a72f51 100644 --- a/tests/test_benchmarks/test_malania2007.py +++ 
b/tests/test_benchmarks/test_malania2007.py @@ -10,6 +10,7 @@ from tests.test_benchmarks import PrecomputedFeatures +@pytest.mark.private_access class TestBehavioral: def test_count(self): assert len(DATASETS) == 5 + 2 + 2 From 32bfbe334402773c55b8f598c9ab3234728a9809 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 9 Jun 2023 14:39:38 +0200 Subject: [PATCH 19/65] minor cleanup of threshold --- brainscore/metrics/threshold.py | 72 ++++++++++++++++----------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py index ee9cd07f3..fde16e4ce 100644 --- a/brainscore/metrics/threshold.py +++ b/brainscore/metrics/threshold.py @@ -2,38 +2,38 @@ import numpy as np from scipy.optimize import minimize from scipy.stats import norm -from sklearn.metrics import mean_squared_error, r2_score +from sklearn.metrics import r2_score import matplotlib.pyplot as plt from brainscore.metrics import Metric, Score from brainio.assemblies import PropertyAssembly, BehavioralAssembly -def wichmann_cum_gauss(x: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> float: +def psychometric_cum_gauss(x: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> float: """ The classic psychometric function as implemented in Wichmann & Hill (2001). The psychometric function: I. Fitting, sampling, and goodness of fit, eq. 1. - Parameters - ---------- - x: the independent variables of the data - alpha: the slope parameter - beta: the mean of the cdf parameter - lambda_: the lapse rate - gamma: the upper bound of the fit + :param x: the independent variables of the data + :param alpha: the slope parameter + :param beta: the mean of the cdf parameter + :param lambda_: the lapse rate + :param gamma: the lower bound of the fit + + :return: the psychometric function values for the given parameters evaluated at `x`. 
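For intuition, the function documented above and its inverse can be exercised as a self-contained sketch. Everything below is illustrative only: the parameter values and offsets are made up for the example, and the 0.75 readout simply mirrors the threshold_accuracy=0.75 used by the benchmark.

    import numpy as np
    from scipy.stats import norm

    def psychometric_cum_gauss(x, alpha, beta, lambda_, gamma=0.5):
        # gamma: lower asymptote (chance level); lambda_: lapse rate
        return gamma + (1 - gamma - lambda_) * norm.cdf(alpha * (x - beta))

    def inverse_psychometric_cum_gauss(y, alpha, beta, lambda_, gamma=0.5):
        return beta + norm.ppf((y - gamma) / (1 - gamma - lambda_)) / alpha

    x = np.linspace(0., 100., 5)                      # hypothetical offsets (arcsec)
    y = psychometric_cum_gauss(x, alpha=0.1, beta=40., lambda_=0.05)
    assert np.allclose(inverse_psychometric_cum_gauss(y, 0.1, 40., 0.05), x)
    threshold = inverse_psychometric_cum_gauss(0.75, 0.1, 40., 0.05)
    print(threshold)  # offset at 75% accuracy; ~41.4 arcsec for these parameters

The inverse is what turns a fitted curve into a threshold: evaluate it at the target accuracy level and read off the corresponding stimulus level.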
""" return gamma + (1 - gamma - lambda_) * norm.cdf(alpha * (x - beta)) -def inverse_wichmann_cum_gauss(y: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> float: - """The inverse of wichmann_cum_gauss.""" +def inverse_psychometric_cum_gauss(y: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> float: + """The inverse of psychometric_cum_gauss.""" return beta + (norm.ppf((y - gamma) / (1 - gamma - lambda_)) / alpha) -def wichmann_neg_log_likelihood(params: Tuple[float, ...], x: np.array, y: np.array) -> float: - """The negative log likelihood function for wichmann_cum_gauss.""" +def cum_gauss_neg_log_likelihood(params: Tuple[float, ...], x: np.array, y: np.array) -> float: + """The negative log likelihood function for psychometric_cum_gauss.""" alpha, beta, lambda_ = params - p = wichmann_cum_gauss(x, alpha, beta, lambda_) + p = psychometric_cum_gauss(x, alpha, beta, lambda_) log_likelihood = y * np.log(p) + (1 - y) * np.log(1 - p) return -np.sum(log_likelihood) @@ -47,8 +47,8 @@ def grid_search(x: np.array, y: np.array, alpha_values: np.array = np.logspace(-3, 1, 50), beta_values: np.array = None, - fit_fn: Callable = wichmann_cum_gauss, - fit_log_likelihood_fn: Callable = wichmann_neg_log_likelihood, + fit_fn: Callable = psychometric_cum_gauss, + fit_log_likelihood_fn: Callable = cum_gauss_neg_log_likelihood, fit_bounds: Tuple = ((None, None), (None, None), (0.03, 0.5)) ) -> Tuple[Tuple[float, ...], float]: """ @@ -58,24 +58,20 @@ def grid_search(x: np.array, sklearn estimator class to use GridSearchCV with psychometric functions, likely increasing code bloat substantially. - Parameters - ---------- - x: the independent variables of the data - y: the measured accuracy rates for the given x-values - alpha_values: the alpha values for the chosen fit function to grid search over - beta_values: the beta values for the chosen fit function to grid search over - fit_fn: the psychometric function that is fit - fit_log_likelihood_fn: the log likelihood function that computes the log likelihood of its corresponding - fit function - fit_bounds: the bounds assigned to the fit function called by fit_log_likelihood_fn. - The default fit_bounds are assigned as: - alpha: (None, None), to allow any slope - beta: (None, None), any inflection point is allowed, as that is controlled for in the Threshold class - lambda_: (0.03, 0.5)), to require at least a small lapse rate, as is regularly done in human fitting - - Returns - ------- - the parameters of the best fit in the grid search + :param x: the independent variables of the data + :param y: the measured accuracy rates for the given x-values + :param alpha_values: the alpha values for the chosen fit function to grid search over + :param beta_values: the beta values for the chosen fit function to grid search over + :param fit_fn: the psychometric function that is fit + :param fit_log_likelihood_fn: the log likelihood function that computes the log likelihood of its corresponding + fit function + :param fit_bounds: the bounds assigned to the fit function called by fit_log_likelihood_fn. 
+ The default fit_bounds are assigned as: + alpha: (None, None), to allow any slope + beta: (None, None), any inflection point is allowed, as that is controlled for in the Threshold class + lambda_: (0.03, 0.5)), to require at least a small lapse rate, as is regularly done in human fitting + + :return: the parameters of the best fit in the grid search """ assert len(x) == len(y) # Default the beta_values grid search to the measured x-points. @@ -105,7 +101,7 @@ def grid_search(x: np.array, pass y_pred = fit_fn(x, best_alpha, best_beta, best_lambda) - r2 = r2_score(y, y_pred) + r2 = r2_score(y, y_pred) # R^2 of the fit return (best_alpha, best_beta, best_lambda), r2 @@ -119,8 +115,8 @@ class Threshold(Metric): """ def __init__(self, independent_variable: str, - fit_function=wichmann_cum_gauss, - fit_inverse_function=inverse_wichmann_cum_gauss, + fit_function=psychometric_cum_gauss, + fit_inverse_function=inverse_psychometric_cum_gauss, threshold_accuracy: Union[str, float] = 'inflection', scoring: str = 'pool', required_accuracy: Optional[float] = 0.6, @@ -421,7 +417,7 @@ def __init__(self, the threshold at that level. :param scoring: The scoring function used to evaluate performance. Either Literal['individual'] or Literal['pool']. See the individual_score and pool_score methods for more information. - """ + """ super(ThresholdElevation, self).__init__(independent_variable) self.baseline_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy, From 8ba609ab7f893094b02e750a787406233090daa4 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 16 Jun 2023 13:42:31 +0200 Subject: [PATCH 20/65] add a few comments to threshold functions --- brainscore/metrics/threshold.py | 49 +++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py index fde16e4ce..6776e155c 100644 --- a/brainscore/metrics/threshold.py +++ b/brainscore/metrics/threshold.py @@ -68,8 +68,10 @@ def grid_search(x: np.array, :param fit_bounds: the bounds assigned to the fit function called by fit_log_likelihood_fn. The default fit_bounds are assigned as: alpha: (None, None), to allow any slope - beta: (None, None), any inflection point is allowed, as that is controlled for in the Threshold class - lambda_: (0.03, 0.5)), to require at least a small lapse rate, as is regularly done in human fitting + beta: (None, None), any inflection point is allowed, as that is controlled for in the + Threshold class + lambda_: (0.03, 0.5)), to require at least a small lapse rate, as is regularly done in + human fitting :return: the parameters of the best fit in the grid search """ @@ -243,16 +245,17 @@ def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Unio params, r2 = grid_search(aggregated_x_points, aggregated_y_points) - # remove fits to random data - if r2 < 0.4: - print('Fit fail due to low fit R^2.') - params = 'fit_fail' - # if all the fits in the grid search failed, there will be a None value in params. In this case, we reject # the fit. This typically only ever happens when a model outputs one value for all test images. if None in params: params = 'fit_fail' + # remove fits to random data. This choice is preferred over a chi^2 test since chi^2 discards a lot of fits + # that would be acceptable in a human case. 
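Taken together with the grid search earlier in this patch, the fit-acceptance logic can be sketched end to end. This is a minimal standalone restatement on made-up accuracy data; the alpha grid, the beta candidates at the measured x-points, the lambda_ bounds of (0.03, 0.5), and the R^2 >= 0.4 acceptance gate mirror the defaults visible in the diff.

    import numpy as np
    from scipy.optimize import minimize
    from scipy.stats import norm
    from sklearn.metrics import r2_score

    def cum_gauss(x, alpha, beta, lambda_, gamma=0.5):
        return gamma + (1 - gamma - lambda_) * norm.cdf(alpha * (x - beta))

    def neg_log_likelihood(params, x, y):
        p = np.clip(cum_gauss(x, *params), 1e-9, 1 - 1e-9)  # guard log(0)
        return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

    x = np.array([10., 20., 40., 60., 80.])           # hypothetical offsets
    y = np.array([0.52, 0.61, 0.74, 0.89, 0.93])      # hypothetical accuracies
    fits = [minimize(neg_log_likelihood, x0=(alpha, beta, 0.05), args=(x, y),
                     bounds=((None, None), (None, None), (0.03, 0.5)))
            for alpha in np.logspace(-3, 1, 10) for beta in x]
    best = min(fits, key=lambda res: res.fun)         # grid-seeded MLE
    r2 = r2_score(y, cum_gauss(x, *best.x))
    print(best.x, r2, 'ok' if r2 >= 0.4 else 'fit_fail')

The gate that follows in the diff then rejects exactly those fits whose R^2 lands under 0.4.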
+ if r2 < 0.4: + print('Fit fail due to low fit R^2.') + params = 'fit_fail' + if self.plot_fit: self.plot_fit_(x_points, aggregated_x_points, @@ -358,30 +361,42 @@ def convert_proba_to_correct(source: BehavioralAssembly) -> np.array: @staticmethod def remove_data_after_asymptote(x_values, y_values): - # Compute the standard deviation of y_values - std_dev = np.std(y_values) + """ + A function that removes all data after the point at which all values of the measured variable are 1 standard + deviation less than the maximum. + + This is done to simulate the procedure in which an experimenter fine-tunes the stimuli in a pilot experiment + to the given system (e.g., humans) such that they only measure data in a region within which the psychometric + fit is monotone (as per the function fit assumption). When this assumption is violated, the function fit + is not a valid measure of the underlying performance function. + + There are circumstances in which this behavior is expected (e.g., crowding). When e.g. a vernier element's + offset is increased enough, the task may paradoxically become more difficult, as the offset grows large + enough such that the relevant elements do not fall within a spatially relevant window, or group with the + flankers more than with each other due to constant target-flanker distance. + """ - # Find the index of the maximum y_value + std_dev = np.std(y_values) max_y_idx = np.argmax(y_values) - # Initialize the index for the first data point after the maximum y_value - # that deviates from the maximum by at least 1 standard deviation + # initialize the index for the first data point after the maximum y_value + # that deviates from the maximum by at least 1 standard deviation index_to_remove = None - # Iterate through the y_values after the maximum y_value + # iterate through the y_values after the maximum y_value for idx, y in enumerate(y_values[max_y_idx + 1:], start=max_y_idx + 1): - # Check if all the remaining y_values deviate by at least 1 standard deviation + # check if all the remaining y_values deviate by at least 1 standard deviation if all([abs(val - y_values[max_y_idx]) >= std_dev for val in y_values[idx:]]): index_to_remove = idx break - pre_remove_length = len(y_values) - # If we found an index to remove, remove the data after that index + # if we found an index to remove, remove the data after that index if index_to_remove is not None: x_values = x_values[:index_to_remove] y_values = y_values[:index_to_remove] - # Check if at least a third of the elements remain + # check if at least a third of the elements remain. This is done so that we have an adequate amount of data + # to fit a psychometric threshold on. 
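The whole pre-fit cleaning step this function performs can be restated as a small self-contained toy; the data below are hypothetical, non-monotone accuracies, while the 1-standard-deviation criterion and the one-third check mirror the code in the diff.

    import numpy as np

    def trim_after_asymptote(x, y):
        # drop trailing points once every remaining value sits >= 1 SD below the max
        std, i_max, n = np.std(y), int(np.argmax(y)), len(y)
        cut = next((i for i in range(i_max + 1, n)
                    if all(abs(v - y[i_max]) >= std for v in y[i:])), None)
        if cut is not None:
            x, y = x[:cut], y[:cut]
        enough_data = len(y) / n >= 1 / 3             # the one-third sanity check
        return x, y, enough_data

    x = np.arange(6)
    y = np.array([0.55, 0.70, 0.90, 0.60, 0.55, 0.50])
    print(trim_after_asymptote(x, y))                 # keeps the first 3 points; True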
remaining_fraction = len(y_values) / pre_remove_length is_at_least_third_remaining = remaining_fraction >= 1 / 3 From 53a3ea746c9103b34f616134fdd359bc45718e65 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 21 Jun 2023 09:25:53 +0200 Subject: [PATCH 21/65] fix typo in precomputed feature test --- tests/test_benchmarks/test_malania2007.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py index e64a72f51..2a0a48904 100644 --- a/tests/test_benchmarks/test_malania2007.py +++ b/tests/test_benchmarks/test_malania2007.py @@ -79,7 +79,7 @@ def test_model_8degrees(self, dataset, model, expected_raw_score): ('equal-16', 'resnet-18-3deg', approx(0., abs=0.001)), ('long-16', 'resnet-18-3deg', approx(0., abs=0.001)), ]) - def test_model_8degrees(self, dataset, model, expected_raw_score): + def test_model_3degrees(self, dataset, model, expected_raw_score): benchmark = benchmark_pool[f"Malania2007_{dataset.replace('-', '')}"] # load features precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' From 29bebe4e1a4ad672626d3264b0a0c3be7a00000b Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 21 Jun 2023 09:52:00 +0200 Subject: [PATCH 22/65] fix typos in dataset and assembly call functions --- brainscore/benchmarks/__init__.py | 2 +- brainscore/benchmarks/malania2007.py | 2 +- tests/test_benchmarks/test___init__.py | 18 +++++++++--------- tests/test_benchmarks/test_malania2007.py | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/brainscore/benchmarks/__init__.py b/brainscore/benchmarks/__init__.py index 7dc972f93..6cd44af78 100644 --- a/brainscore/benchmarks/__init__.py +++ b/brainscore/benchmarks/__init__.py @@ -219,7 +219,7 @@ def _evaluation_benchmark_pool(): # Malania2007 from . 
import malania2007 for dataset in malania2007.DATASETS: - assembly_identifier = f"Malania2007_{dataset.replace('-', '')}" + assembly_identifier = f"Malania2007_{dataset}" benchmark_ctr = getattr(malania2007, f"{assembly_identifier}") pool[f"{assembly_identifier}"] = LazyLoad( # use lambda parameter-binding to avoid `benchmark_ctr` being re-assigned in the next loop iteration diff --git a/brainscore/benchmarks/malania2007.py b/brainscore/benchmarks/malania2007.py index b6b4f481f..c8069f218 100644 --- a/brainscore/benchmarks/malania2007.py +++ b/brainscore/benchmarks/malania2007.py @@ -34,7 +34,7 @@ for dataset in DATASETS: # behavioral benchmark - identifier = f"Malania2007_{dataset.replace('-', '')}" + identifier = f"Malania2007_{dataset}" globals()[identifier] = lambda dataset=dataset: _Malania2007Base(dataset) diff --git a/tests/test_benchmarks/test___init__.py b/tests/test_benchmarks/test___init__.py index 5786cae85..276d6433c 100644 --- a/tests/test_benchmarks/test___init__.py +++ b/tests/test_benchmarks/test___init__.py @@ -96,15 +96,15 @@ def test_exact_evaluation_pool(self): 'brendel.Geirhos2021stylized-error_consistency', 'brendel.Geirhos2021sketch-error_consistency', 'brendel.Geirhos2021uniformnoise-error_consistency', - 'Malania2007_short2', - 'Malania2007_short4', - 'Malania2007_short6', - 'Malania2007_short8', - 'Malania2007_short16', - 'Malania2007_equal2', - 'Malania2007_long2', - 'Malania2007_equal16', - 'Malania2007_long16', + 'Malania2007_short-2', + 'Malania2007_short-4', + 'Malania2007_short-6', + 'Malania2007_short-8', + 'Malania2007_short-16', + 'Malania2007_equal-2', + 'Malania2007_long-2', + 'Malania2007_equal-16', + 'Malania2007_long-16', } def test_engineering_pool(self): diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py index 2a0a48904..d9097c229 100644 --- a/tests/test_benchmarks/test_malania2007.py +++ b/tests/test_benchmarks/test_malania2007.py @@ -17,7 +17,7 @@ def test_count(self): @pytest.mark.parametrize('dataset', DATASETS) def test_in_pool(self, dataset): - identifier = f"Malania2007_{dataset.replace('-', '')}" + identifier = f"Malania2007_{dataset}" assert identifier in benchmark_pool def test_mean_ceiling(self): @@ -57,7 +57,7 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): ('long-16', 'resnet-18', approx(0., abs=0.001)), ]) def test_model_8degrees(self, dataset, model, expected_raw_score): - benchmark = benchmark_pool[f"Malania2007_{dataset.replace('-', '')}"] + benchmark = benchmark_pool[f"Malania2007_{dataset}"] # load features precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features) @@ -80,7 +80,7 @@ def test_model_8degrees(self, dataset, model, expected_raw_score): ('long-16', 'resnet-18-3deg', approx(0., abs=0.001)), ]) def test_model_3degrees(self, dataset, model, expected_raw_score): - benchmark = benchmark_pool[f"Malania2007_{dataset.replace('-', '')}"] + benchmark = benchmark_pool[f"Malania2007_{dataset}"] # load features precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features) From 832049197f5b06f1035f57229ce305fd5303c722 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 3 Nov 2023 14:37:26 +0100 Subject: [PATCH 23/65] Update Assembly type and name --- packaging/malania2007/malania_data_assembly.py | 2 +- tests/test_benchmarks/test_malania2007.py | 4 ++-- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/packaging/malania2007/malania_data_assembly.py b/packaging/malania2007/malania_data_assembly.py index cfe9f9a8c..a88c109c0 100644 --- a/packaging/malania2007/malania_data_assembly.py +++ b/packaging/malania2007/malania_data_assembly.py @@ -78,4 +78,4 @@ def remove_subjects_with_nans(assembly1, assembly2): # upload to S3 #package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, # stimulus_set_identifier=f"Malania2007_{dataset}", - # assembly_class_name="BehavioralAssembly", bucket_name="brainio-brainscore") \ No newline at end of file + # assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py index d9097c229..9ce4955af 100644 --- a/tests/test_benchmarks/test_malania2007.py +++ b/tests/test_benchmarks/test_malania2007.py @@ -21,7 +21,7 @@ def test_in_pool(self, dataset): assert identifier in benchmark_pool def test_mean_ceiling(self): - benchmarks = [f"Malania2007_{dataset.replace('-', '')}" for dataset in DATASETS] + benchmarks = [f"Malania2007_{dataset}" for dataset in DATASETS] benchmarks = [benchmark_pool[benchmark] for benchmark in benchmarks] ceilings = [benchmark.ceiling.sel(aggregation='center') for benchmark in benchmarks] mean_ceiling = np.mean(ceilings) @@ -40,7 +40,7 @@ def test_mean_ceiling(self): ('long-16', approx(0.71010202, abs=0.001)) ]) def test_dataset_ceiling(self, dataset, expected_ceiling): - benchmark = f"Malania2007_{dataset.replace('-', '')}" + benchmark = f"Malania2007_{dataset}" benchmark = benchmark_pool[benchmark] ceiling = benchmark.ceiling assert ceiling.sel(aggregation='center').values.item() == expected_ceiling From f6dab05d8b1152ec1083f11af244a2372a273d94 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 3 Nov 2023 15:14:44 +0100 Subject: [PATCH 24/65] update scoring method with comment and update ceilings --- brainscore/metrics/threshold.py | 6 ++++-- tests/test_benchmarks/test_malania2007.py | 18 +++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/brainscore/metrics/threshold.py b/brainscore/metrics/threshold.py index 6776e155c..0a43581dd 100644 --- a/brainscore/metrics/threshold.py +++ b/brainscore/metrics/threshold.py @@ -325,7 +325,8 @@ def individual_score(source: float, target: Union[list, PropertyAssembly]) -> Sc """ raw_scores = [] for target_value in target: - raw_score = max((1 - ((np.abs(target_value - source)) / (target_value + source))), 0) + # This score = 0 when the source exceeds target_value by 100% + raw_score = max((1 - ((np.abs(target_value - source)) / target_value)), 0) raw_scores.append(raw_score) raw_score, model_error = np.mean(raw_scores), np.std(raw_scores) @@ -343,7 +344,8 @@ def pool_score(source: float, target: Union[list, PropertyAssembly]) -> Score: target_mean = np.mean(target.values) else: target_mean = np.mean(target) - raw_score = max((1 - ((np.abs(target_mean - source)) / (target_mean + source))), 0) + # This score = 0 when the source exceeds target_mean by 100% + raw_score = max((1 - ((np.abs(target_mean - source)) / target_mean)), 0) raw_score = Score([raw_score], coords={'aggregation': ['center']}, dims=['aggregation']) return raw_score diff --git a/tests/test_benchmarks/test_malania2007.py b/tests/test_benchmarks/test_malania2007.py index 9ce4955af..70f9ccd8c 100644 --- a/tests/test_benchmarks/test_malania2007.py +++ b/tests/test_benchmarks/test_malania2007.py @@ -29,15 
+29,15 @@ def test_mean_ceiling(self): # these test values are for the pooled score ceiling @pytest.mark.parametrize('dataset, expected_ceiling', [ - ('short-2', approx(0.82203635, abs=0.001)), - ('short-4', approx(0.78841608, abs=0.001)), - ('short-6', approx(0.80555853, abs=0.001)), - ('short-8', approx(0.7866628, abs=0.001)), - ('short-16', approx(0.90941085, abs=0.001)), - ('equal-2', approx(0.77990816, abs=0.001)), - ('long-2', approx(0.72215817, abs=0.001)), - ('equal-16', approx(0.62778544, abs=0.001)), - ('long-16', approx(0.71010202, abs=0.001)) + ('short-2', approx(0.69824226, abs=0.001)), + ('short-4', approx(0.56750692, abs=0.001)), + ('short-6', approx(0.62480255, abs=0.001)), + ('short-8', approx(0.67478401, abs=0.001)), + ('short-16', approx(0.83471481, abs=0.001)), + ('equal-2', approx(0.59491172, abs=0.001)), + ('long-2', approx(0.52140858, abs=0.001)), + ('equal-16', approx(0.3824312, abs=0.001)), + ('long-16', approx(0.51425013, abs=0.001)) ]) def test_dataset_ceiling(self, dataset, expected_ceiling): benchmark = f"Malania2007_{dataset}" From dffa729373be270ec90bf6e27428dc3be5d66615 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Tue, 23 Jan 2024 14:39:42 +0100 Subject: [PATCH 25/65] update benchmark to 2.0 format w/ local test --- .../benchmarks/malania2007/benchmark.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index c8069f218..25189a037 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -2,13 +2,13 @@ import numpy as np import xarray as xr -import brainscore +import brainscore_vision from brainio.assemblies import PropertyAssembly -from brainscore.benchmarks import BenchmarkBase -from brainscore.benchmarks.screen import place_on_screen -from brainscore.metrics.threshold import ThresholdElevation -from brainscore.model_interface import BrainModel -from brainscore.utils import LazyLoad +from brainscore_vision.benchmarks import BenchmarkBase +from brainscore_vision.benchmark_helpers.screen import place_on_screen +from brainscore_vision import load_dataset, load_stimulus_set, load_metric +from brainscore_vision.model_interface import BrainModel +from brainscore_vision.utils import LazyLoad BIBTEX = """@article{malania2007, @@ -87,16 +87,16 @@ def __init__(self, condition: str): self._assemblies = {'baseline_assembly': self._baseline_assembly, 'condition_assembly': self._assembly} - self._stimulus_set = brainscore.get_stimulus_set(f'{self.condition}') - self._baseline_stimulus_set = brainscore.get_stimulus_set(f'{self.baseline_condition}') + self._stimulus_set = brainscore_vision.load_stimulus_set(f'{self.condition}') + self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'{self.baseline_condition}') self._stimulus_sets = {self.condition: self._stimulus_set, self.baseline_condition: self._baseline_stimulus_set} - self._fitting_stimuli = brainscore.get_stimulus_set(f'{self.condition}_fit') + self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'{self.condition}_fit') - self._metric = ThresholdElevation(independent_variable='vernier_offset', - baseline_condition=self.baseline_condition, - test_condition=self.condition, - threshold_accuracy=0.75) + self._metric = load_metric('threshold_elevation', + baseline_condition=self.baseline_condition, + test_condition=self.condition, + threshold_accuracy=0.75) self._ceiling = 
self._metric.ceiling(self._assemblies) self._visual_degrees = 2.986667 @@ -117,6 +117,7 @@ def __call__(self, candidate: BrainModel): target_visual_degrees=candidate.visual_degrees(), source_visual_degrees=self._visual_degrees ) + # model_requirements here model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials) raw_score = self._metric(model_responses, self._assemblies) @@ -135,7 +136,7 @@ def __call__(self, candidate: BrainModel): def load_assembly(dataset: str) -> PropertyAssembly: - assembly = brainscore.get_assembly(f'Malania2007_{dataset}') + assembly = brainscore_vision.load_dataset(f'Malania2007_{dataset}') return assembly From 30184dd8d0749b00e6f70796e3c4d5ca5e1b1991 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Tue, 23 Jan 2024 14:41:00 +0100 Subject: [PATCH 26/65] update data registry format --- .../data/malania2007/__init__.py | 308 ++++++++++++++++++ 1 file changed, 308 insertions(+) diff --git a/brainscore_vision/data/malania2007/__init__.py b/brainscore_vision/data/malania2007/__init__.py index e69de29bb..d52e92776 100644 --- a/brainscore_vision/data/malania2007/__init__.py +++ b/brainscore_vision/data/malania2007/__init__.py @@ -0,0 +1,308 @@ +from brainio.assemblies import PropertyAssembly + +from brainscore_vision import data_registry, stimulus_set_registry, load_stimulus_set +from brainscore_vision.data_helpers.s3 import load_assembly_from_s3, load_stimulus_set_from_s3 + + +BIBTEX = """@article{malania2007, + author = {Malania, Maka and Herzog, Michael H. and Westheimer, Gerald}, + title = "{Grouping of contextual elements that affect vernier thresholds}", + journal = {Journal of Vision}, + volume = {7}, + number = {2}, + pages = {1-1}, + year = {2007}, + issn = {1534-7362}, + doi = {10.1167/7.2.1}, + url = {https://doi.org/10.1167/7.2.1} + }""" + +# 'equal-2' +# assembly +data_registry['Malania2007_equal-2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_equal-2', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_equal-2'), +) + +# stimulus set +stimulus_set_registry['Malania2007_equal-2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-2', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_equal-2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-2_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'equal-16' +# assembly +data_registry['Malania2007_equal-16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_equal-16', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_equal-16'), +) + +# stimulus set +stimulus_set_registry['Malania2007_equal-16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-16', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_equal-16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-16_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'long-2' +# assembly +data_registry['Malania2007_long-2'] = lambda: load_assembly_from_s3( + 
identifier='Malania2007_long-2', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-2'), +) + +# stimulus set +stimulus_set_registry['Malania2007_long-2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-2', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_long-2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-2_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'long-16' +# assembly +data_registry['Malania2007_long-16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_long-16', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-16'), +) + +# stimulus set +stimulus_set_registry['Malania2007_long-16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-16', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_long-16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-16_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'short-2' +# assembly +data_registry['Malania2007_short-2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-2', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-2'), +) + +# stimulus set +stimulus_set_registry['Malania2007_short-2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-2', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_short-2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-2_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'short-4' +# assembly +data_registry['Malania2007_short-4'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-4', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-4'), +) + +# stimulus set +stimulus_set_registry['Malania2007_short-4'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-4', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_short-4_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-4_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'short-6' +# assembly +data_registry['Malania2007_short-6'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-6', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-6'), +) + +# stimulus set +stimulus_set_registry['Malania2007_short-6'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-6', + bucket="brainio-brainscore", + csv_sha1="", + 
zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_short-6_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-6_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'short-8' +# assembly +data_registry['Malania2007_short-8'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-8', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-8'), +) + +# stimulus set +stimulus_set_registry['Malania2007_short-8'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-8', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_short-8_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-8_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'short-16' +# assembly +data_registry['Malania2007_short-16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-16', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-16'), +) + +# stimulus set +stimulus_set_registry['Malania2007_short-16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-16', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_short-16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-16_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# 'vernier-only' +# assembly +data_registry['Malania2007_vernier-only'] = lambda: load_assembly_from_s3( + identifier='Malania2007_vernier-only', + version_id="", + sha1="", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_vernier-only'), +) + +# stimulus set +stimulus_set_registry['Malania2007_vernier-only'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_vernier-only', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") + +# stimulus set fitting stimuli +stimulus_set_registry['Malania2007_vernier-only_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_vernier-only_fit', + bucket="brainio-brainscore", + csv_sha1="", + zip_sha1="", + csv_version_id="", + zip_version_id="") \ No newline at end of file From 75f40f5de7c4f60ff75759f88bc99967f33b478d Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Tue, 23 Jan 2024 14:42:07 +0100 Subject: [PATCH 27/65] remove BIBTEX from threshold __init__ --- brainscore_vision/metrics/threshold/__init__.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/brainscore_vision/metrics/threshold/__init__.py b/brainscore_vision/metrics/threshold/__init__.py index 6e29f13a7..69f6102e8 100644 --- a/brainscore_vision/metrics/threshold/__init__.py +++ b/brainscore_vision/metrics/threshold/__init__.py @@ -3,12 +3,3 @@ metric_registry['threshold'] = Threshold metric_registry['threshold_elevation'] = ThresholdElevation - -BIBTEX = """@article{..., - title={...}, - author={...}, - journal={...}, - volume={...}, - pages={...}, - 
year={...} -}""" From bc054d05a3973aedfaeadeef142e157ad5dc500e Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Tue, 23 Jan 2024 14:52:41 +0100 Subject: [PATCH 28/65] update metric imports to 2.0 --- brainscore_vision/metrics/threshold/metric.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py index 0a43581dd..07cdf6599 100644 --- a/brainscore_vision/metrics/threshold/metric.py +++ b/brainscore_vision/metrics/threshold/metric.py @@ -1,11 +1,12 @@ from typing import Dict, Union, Tuple, Optional, Callable + import numpy as np from scipy.optimize import minimize from scipy.stats import norm from sklearn.metrics import r2_score import matplotlib.pyplot as plt -from brainscore.metrics import Metric, Score +from brainscore_core.metrics import Metric, Score from brainio.assemblies import PropertyAssembly, BehavioralAssembly From a96aa52b58b6e80b17ec3b8988b5d78a655b83c7 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Tue, 23 Jan 2024 14:59:21 +0100 Subject: [PATCH 29/65] re-add test_stimuli.py deleted by accident --- tests/test_stimuli.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_stimuli.py diff --git a/tests/test_stimuli.py b/tests/test_stimuli.py new file mode 100644 index 000000000..e69de29bb From bde98eb024d7047b62d2673f20f43409710b4bcb Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 24 Jan 2024 10:20:36 +0100 Subject: [PATCH 30/65] change nan filtering to subject_unique_id filtering --- .../benchmarks/malania2007/benchmark.py | 26 +++++++------------ .../benchmarks/scialom2024/__init__.py | 0 .../benchmarks/scialom2024/benchmark.py | 0 .../benchmarks/scialom2024/test.py | 0 4 files changed, 10 insertions(+), 16 deletions(-) create mode 100644 brainscore_vision/benchmarks/scialom2024/__init__.py create mode 100644 brainscore_vision/benchmarks/scialom2024/benchmark.py create mode 100644 brainscore_vision/benchmarks/scialom2024/test.py diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 25189a037..1c0bd872c 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -82,8 +82,8 @@ def __init__(self, condition: str): # condition baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition)) condition_assembly = LazyLoad(lambda: load_assembly(self.condition)) - self._assembly, self._baseline_assembly = remove_subjects_with_nans(condition_assembly, - baseline_assembly) + self._assembly, self._baseline_assembly = filter_baseline_subjects(condition_assembly, + baseline_assembly) self._assemblies = {'baseline_assembly': self._baseline_assembly, 'condition_assembly': self._assembly} @@ -140,17 +140,11 @@ def load_assembly(dataset: str) -> PropertyAssembly: return assembly -def remove_subjects_with_nans(condition_assembly: PropertyAssembly, - baseline_assembly: PropertyAssembly - ) -> Tuple[PropertyAssembly, PropertyAssembly]: - # Find the indices of the subjects with NaN values in the first PropertyAssembly - nan_subjects = np.isnan(condition_assembly.values) - - # Convert the boolean array to a DataArray with the same coordinates as the input assemblies - nan_subjects_da = xr.DataArray(nan_subjects, coords=condition_assembly.coords, dims=condition_assembly.dims) - - # Filter out the subjects with NaN values from both PropertyAssemblies - filtered_condition_assembly = 
condition_assembly.where(~nan_subjects_da, drop=True) - filtered_baseline_assembly = baseline_assembly.where(~nan_subjects_da, drop=True) - - return filtered_condition_assembly, filtered_baseline_assembly +def filter_baseline_subjects(condition_assembly: PropertyAssembly, + baseline_assembly: PropertyAssembly + ) -> Tuple[PropertyAssembly, PropertyAssembly]: + """A function to select only the unique subjects that exist in the condition_assembly.""" + unique_ids = condition_assembly.coords['subject_unique_id'].values.tolist() + mask = baseline_assembly.coords['subject_unique_id'].isin(unique_ids) + filtered_baseline_assembly = baseline_assembly.where(mask, drop=True) + return condition_assembly, filtered_baseline_assembly diff --git a/brainscore_vision/benchmarks/scialom2024/__init__.py b/brainscore_vision/benchmarks/scialom2024/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/brainscore_vision/benchmarks/scialom2024/benchmark.py b/brainscore_vision/benchmarks/scialom2024/benchmark.py new file mode 100644 index 000000000..e69de29bb diff --git a/brainscore_vision/benchmarks/scialom2024/test.py b/brainscore_vision/benchmarks/scialom2024/test.py new file mode 100644 index 000000000..e69de29bb From b445b5cec23aa061b6c86f817c8c2aac28cec2e2 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Mon, 18 Mar 2024 11:17:30 +0100 Subject: [PATCH 31/65] add require_variance to model call --- brainscore_vision/benchmarks/malania2007/benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 1c0bd872c..b3cf8f9ea 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -118,7 +118,8 @@ def __call__(self, candidate: BrainModel): source_visual_degrees=self._visual_degrees ) # model_requirements here - model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials) + model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials, + require_variance=True) raw_score = self._metric(model_responses, self._assemblies) From 824ffd5e41d1601d10076d7e17a66f16d42a6b17 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 13:50:10 +0200 Subject: [PATCH 32/65] update stimulus and data assembly related information with arcane knowledge collected from years of hard work --- .../benchmarks/malania2007/benchmark.py | 6 +- .../benchmarks/malania2007/test.py | 7 +- .../data/malania2007/__init__.py | 378 ++++++++---------- .../data/malania2007/malania_data_assembly.py | 12 +- .../data/malania2007/malania_stimulus_set.py | 38 +- brainscore_vision/data/malania2007/test.py | 63 ++- 6 files changed, 229 insertions(+), 275 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index b3cf8f9ea..c0efe035d 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -56,7 +56,7 @@ class _Malania2007Base(BenchmarkBase): Benchmark Choices: 1) The number and type of fitting stimuli are unfounded choices. Currently, the number of fitting stimuli is chosen - to be relatively small, but sufficient for good decoding performance in the baseline condition in general. + to be relatively large, and hopefully sufficient for decoding in the baseline condition in general. 
- Precisely faithful alternative: Present text instructions to models as they were presented to humans * Why not this alternative? Since the experiment is about early visual perception, and there are currently few/no models capable of a task like this, it would not be interesting. @@ -64,6 +64,8 @@ class _Malania2007Base(BenchmarkBase): Lee & DiCarlo (2023), biorXiv (doi:https://doi.org/10.1101/2022.12.31.522402). * Why not this alternative? Since the experiment is not about perceptual learning but about early visual perception, and there are few/no models capable of a task like this, it would not be interesting. + - Importantly, this means the benchmark examines the models' capability to support a task like this, rather than + their capability to learn a task like this. 2) In the human experiment, stimuli were presented at exactly the foveal position. In the model experiment, testing stimuli are presented at exactly the foveal position +- 72arcsec = 0.02deg. * Why this alternative? Since most models evaluated are test-time deterministic, we want a more precise @@ -100,7 +102,7 @@ def __init__(self, condition: str): self._ceiling = self._metric.ceiling(self._assemblies) self._visual_degrees = 2.986667 - self._number_of_trials = 1 + self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates super(_Malania2007Base, self).__init__( identifier=f'Malania2007_{condition}', version=1, diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index 95692e37e..b5bcd6948 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -10,7 +10,6 @@ from brainscore_vision.benchmarks.malania2007.benchmark import DATASETS -@pytest.mark.private_access class TestBehavioral: def test_count(self): assert len(DATASETS) == 5 + 2 + 2 @@ -20,6 +19,7 @@ def test_in_pool(self, dataset): identifier = f"Malania2007_{dataset}" assert identifier in benchmark_registry + @pytest.mark.private_access def test_mean_ceiling(self): benchmarks = [f"Malania2007_{dataset}" for dataset in DATASETS] benchmarks = [benchmark_registry[benchmark] for benchmark in benchmarks] @@ -28,6 +28,7 @@ def test_mean_ceiling(self): assert mean_ceiling == approx(0.7724487108297781, abs=0.001) # these test values are for the pooled score ceiling + @pytest.mark.private_access @pytest.mark.parametrize('dataset, expected_ceiling', [ ('short-2', approx(0.69824226, abs=0.001)), ('short-4', approx(0.56750692, abs=0.001)), @@ -45,6 +46,7 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): ceiling = benchmark.ceiling assert ceiling.sel(aggregation='center').values.item() == expected_ceiling + @pytest.mark.private_access @pytest.mark.parametrize('dataset, model, expected_raw_score', [ ('short-2', 'resnet-18', approx(0., abs=0.001)), ('short-4', 'resnet-18', approx(0., abs=0.001)), @@ -57,6 +59,7 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): ('long-16', 'resnet-18', approx(0., abs=0.001)), ]) def test_model_8degrees(self, dataset, model, expected_raw_score): + raise Exception("This test needs to be recalculated.") benchmark = benchmark_registry[f"Malania2007_{dataset}"] # load features precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' @@ -68,6 +71,7 @@ def test_model_8degrees(self, dataset, model, expected_raw_score): score = benchmark(precomputed_features).raw assert score == expected_raw_score + @pytest.mark.private_access 
@pytest.mark.parametrize('dataset, model, expected_raw_score', [ ('short-2', 'resnet-18-3deg', approx(0., abs=0.001)), ('short-4', 'resnet-18-3deg', approx(0., abs=0.001)), @@ -80,6 +84,7 @@ def test_model_8degrees(self, dataset, model, expected_raw_score): ('long-16', 'resnet-18-3deg', approx(0., abs=0.001)), ]) def test_model_3degrees(self, dataset, model, expected_raw_score): + raise Exception("This test needs to be recalculated.") benchmark = benchmark_registry[f"Malania2007_{dataset}"] # load features precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' diff --git a/brainscore_vision/data/malania2007/__init__.py b/brainscore_vision/data/malania2007/__init__.py index d52e92776..2d4b28ab9 100644 --- a/brainscore_vision/data/malania2007/__init__.py +++ b/brainscore_vision/data/malania2007/__init__.py @@ -17,292 +17,238 @@ url = {https://doi.org/10.1167/7.2.1} }""" -# 'equal-2' -# assembly + data_registry['Malania2007_equal-2'] = lambda: load_assembly_from_s3( identifier='Malania2007_equal-2', - version_id="", - sha1="", + version_id="yFXK8xjGjEmuYTSfS58rGS_ah3.NGg0X", + sha1="277b2fbffed00e16b6a69b488f73eeda5abaaf10", bucket="brainio-brainscore", cls=PropertyAssembly, stimulus_set_loader=lambda: load_stimulus_set('Malania2007_equal-2'), ) +data_registry['Malania2007_equal-16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_equal-16', + version_id="SRZ7bs.Ek59GkeS084Pvdy38uTzFs4yw", + sha1="ef49506238e8d2554918b113fbc60c133077186e", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_equal-16'), +) +data_registry['Malania2007_long-2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_long-2', + version_id="2c1lWuXthb3rymB3seTQX1jVqiKUTn1f", + sha1="9076a5b693948c4992b6c8e753f04a7acd2014a1", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-2'), +) +data_registry['Malania2007_long-16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_long-16', + version_id="qshNxhxjgusWyWiXnbfFN6gqjLgRh8fO", + sha1="3106cf1f2fa9e66617ebf231df05d29077fc478f", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-16'), +) +data_registry['Malania2007_short-2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-2', + version_id="8CQ9MupuljAgkkKUXs3hiOliHg8xoDxb", + sha1="85fb65ad76de48033c704b9c5689771e1ea0457d", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-2'), +) +data_registry['Malania2007_short-4'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-4', + version_id=".ZUO0upSfQrWLPgd4oGwAaCbN4bz6S6H", + sha1="75506be9a26ec38a223e41510f1a8cb32d5b0bc9", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-4'), +) +data_registry['Malania2007_short-6'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-6', + version_id="q4FugpNGkT_FQP..qIVzye83hAQR2xfS", + sha1="2901be6b352e67550da040d79d744819365b8626", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-6'), +) +data_registry['Malania2007_short-8'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-8', + version_id="4_lcRl_I7Mp0RHxcfqZ9tkAZjVh.5oMU", + sha1="6daf47b086cb969d75222e320f49453ed8437885", + bucket="brainio-brainscore", + 
cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-8'), +) +data_registry['Malania2007_short-16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-16', + version_id="fFqEIyIC9CHzqTEmv0MitjCgpeMX5pxJ", + sha1="8ae0898caad718b747f85fce5888416affc3a569", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-16'), +) +data_registry['Malania2007_vernier-only'] = lambda: load_assembly_from_s3( + identifier='Malania2007_vernier-only', + version_id="JLWf2pIR_UadQHqwtegJkC6XzWdbSNGi", + sha1="1cf83e8b6141f8b0d67ea46994f342325f62001f", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=lambda: load_stimulus_set('Malania2007_vernier-only'), +) + -# stimulus set stimulus_set_registry['Malania2007_equal-2'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-2', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="77e94b9b5122a83ebbaffb4a06fcab68ef652751", + zip_sha1="99826d459f6920dafab72eed69eb2a90492ce796", + csv_version_id="MlRpSz.4.jvVRFAZl8tGEum1P0Q0GtyS", + zip_version_id="vHbAM_FjTbjp5U12BkAelJu4KW6PLYFn" +) stimulus_set_registry['Malania2007_equal-2_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-2_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'equal-16' -# assembly -data_registry['Malania2007_equal-16'] = lambda: load_assembly_from_s3( - identifier='Malania2007_equal-16', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_equal-16'), + csv_sha1="bafdfc855c164d3e5443d67dcf9eb7762443f964", + zip_sha1="e52fec1a79ac8837e331b180c2a8a140840d6666", + csv_version_id="PIXEW.2vHvjIBP0Q2KHIpnxns7t9o8Cf", + zip_version_id="h7pp84CYFGLKlPhveD0L5ogePqisk_I7" ) - -# stimulus set stimulus_set_registry['Malania2007_equal-16'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-16', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="5fedcff56c302339c3451ae2edbcb846c39c3189", + zip_sha1="b30dc2dc90e4f3d88775622e558db963765f38e0", + csv_version_id="VmRGiQkhPALDwq74NpE2VpTiKTGn.30T", + zip_version_id="c.DOlVULXZingRJ9gVY_NbZwRrj_xs_i" +) stimulus_set_registry['Malania2007_equal-16_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-16_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'long-2' -# assembly -data_registry['Malania2007_long-2'] = lambda: load_assembly_from_s3( - identifier='Malania2007_long-2', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-2'), + csv_sha1="3de3e5de19a638767a01ba68cb690dc746c29a77", + zip_sha1="1728920c5ea4fb7b3a3cf3c076165aca65c8b751", + csv_version_id="joAq8JBC_7axZDfLNFgoXFhTCLU_KKr_", + zip_version_id="77JRwdldaHDr6TLW1NnB5HucIrkUCVg." 
) - -# stimulus set stimulus_set_registry['Malania2007_long-2'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-2', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="ba65316a63dc688d8dfb410219a28fd02850b991", + zip_sha1="7fd431fbbd4a4dc0cd271624d3297c19a28a70b5", + csv_version_id="_0fqObn6k5KvXurHMsuD4IqtrqbNskyo", + zip_version_id="foL92ndVAAAETzMYHdmMtwIwKxXYhAB." +) stimulus_set_registry['Malania2007_long-2_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-2_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'long-16' -# assembly -data_registry['Malania2007_long-16'] = lambda: load_assembly_from_s3( - identifier='Malania2007_long-16', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-16'), + csv_sha1="b91dd9261c1d47bdd37f9b60eb8066b7b719709f", + zip_sha1="5be3e1cd57b59081103715b5d318505166e0045e", + csv_version_id="mATh8lcVisdsDnPnU6ACE23iBPfpkLZA", + zip_version_id="6nEviShTyCYQKrmxyjDyNov9Skc77eXT" ) - -# stimulus set stimulus_set_registry['Malania2007_long-16'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-16', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="1f1b03319b81698ba5e7db389dcd4248f94e45ca", + zip_sha1="97c70462a28905b58058c687880188d634d357f0", + csv_version_id="4RtywQ40hfQA4N80g8lxEScAmMXFRg7E", + zip_version_id="lJy2QosABzHtiA6BJaE4OqCn1w1Jhz2k" +) stimulus_set_registry['Malania2007_long-16_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-16_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'short-2' -# assembly -data_registry['Malania2007_short-2'] = lambda: load_assembly_from_s3( - identifier='Malania2007_short-2', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-2'), + csv_sha1="d80a02c75b9908301c3c8dc9f7116fecf8e060ec", + zip_sha1="d8819b94d3f502d7a382c8a0db0a34627132e5e2", + csv_version_id="gOxY6tjnT7LO.FDeL1xkRmowl5wYeAia", + zip_version_id="71UAPTnZscIuqdx2dhuW9V0O0DO_TgTM" ) - -# stimulus set stimulus_set_registry['Malania2007_short-2'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-2', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="bf0252056d2084e855646f624700ab03c19cfc3d", + zip_sha1="eee1270feb7443e7e315d8feb7fb0a6b6908f554", + csv_version_id="zcJqM.ZPwJyiMRWa3RBdvv401yPnLQAp", + zip_version_id="C8WZzAAQ0JGHAAKii4JpvlRhcUOhgSj." 
+) stimulus_set_registry['Malania2007_short-2_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-2_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'short-4' -# assembly -data_registry['Malania2007_short-4'] = lambda: load_assembly_from_s3( - identifier='Malania2007_short-4', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-4'), + csv_sha1="73127d279a2cd254ae4f07b0053580851e84b00c", + zip_sha1="918736349d714a4f784c29bf7e7d218b103e128d", + csv_version_id="iwGRp3_ktAHfJ6r7ktSK9gsthDjKek70", + zip_version_id="6RpplJ9UVXTlvhmFSXla0Qa20b44m8Ds" ) - -# stimulus set stimulus_set_registry['Malania2007_short-4'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-4', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="816326d89d358f6592bd1f789e5c8d429fbca1cd", + zip_sha1="ff57d976ef75ede9148a4097e90d6cf6c8054d34", + csv_version_id="Waikk.bktXIncCUtCIAyB2EqynGk.H.F", + zip_version_id="rl_muxI4UEpwXVaXuhsqroG..COGILvR" +) stimulus_set_registry['Malania2007_short-4_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-4_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'short-6' -# assembly -data_registry['Malania2007_short-6'] = lambda: load_assembly_from_s3( - identifier='Malania2007_short-6', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-6'), + csv_sha1="3512cfd029f4e4299bc41ede519e691d80cfc3d5", + zip_sha1="301386408dd1fb8556881f9a171be2d43dbfec6e", + csv_version_id="UhisdJqiEmkQ_4zsUtAmaxtle2kMZdcD", + zip_version_id="xt_v0xgCB8YUptyPB0yZFHIUcel5MF_x" ) - -# stimulus set stimulus_set_registry['Malania2007_short-6'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-6', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="3d5dd9b48a56ba0c31de94b6221b97df962b6f8a", + zip_sha1="120d90a143d1577d4745c3f69291d0db6c7e512e", + csv_version_id="GwGHPJkMDdg8N_.boyj8qJ3ChsEx4w._", + zip_version_id="gIN1O4yz.THvK0Ifm5M3AI58ZACE1QFh" +) stimulus_set_registry['Malania2007_short-6_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-6_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'short-8' -# assembly -data_registry['Malania2007_short-8'] = lambda: load_assembly_from_s3( - identifier='Malania2007_short-8', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-8'), + csv_sha1="27a5be4fca190837fc5b75ed2cdbbffbf6b41338", + zip_sha1="c88e05c6cadec88a2c9475b0735323a2b049bd75", + csv_version_id="oMlj7wV85s00hJFE84ym0AJHLCfYHVA6", + zip_version_id="oS.KrBTlcYAgr_lWyA_bIjVc2js_VeUe" ) - -# stimulus set stimulus_set_registry['Malania2007_short-8'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-8', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="8fc35f607196b4c0cdcebd8102d17e3a637e5988", + zip_sha1="a9215ed0cb0f0333582dda65f6afd7015c506ba5", + 
csv_version_id="gzys8s7j7euMEl7JJpqBFLFHMpFjwbA7", + zip_version_id="3fYb4Iruh3lRKUwC1APqFH4CNbE5DEuk" +) stimulus_set_registry['Malania2007_short-8_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-8_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'short-16' -# assembly -data_registry['Malania2007_short-16'] = lambda: load_assembly_from_s3( - identifier='Malania2007_short-16', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-16'), + csv_sha1="aa4133a9fe19a3c9004a9cb5e6eb5a72564e4883", + zip_sha1="beb9f068794708e41750202b78c438538a40a8fb", + csv_version_id="7N1Z.uiagqBknJUSBQ4mVfHKWgocM5aA", + zip_version_id="kcEOPOkvWymO0wX5j_QKxcNPl9sZsjFd" ) - -# stimulus set stimulus_set_registry['Malania2007_short-16'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-16', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli + csv_sha1="addd260c9959f2f315db03c0a39c6c1b01fef685", + zip_sha1="cba4c2866ec692fb808471df7c2fed446d9fb3fe", + csv_version_id="Peu7WU5vanLoZNOFIAbuPzZNPDRgbCSX", + zip_version_id="wFkJkZMC8Fs_HfPJy32CMKcHJWeQIUDB" +) stimulus_set_registry['Malania2007_short-16_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-16_fit', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# 'vernier-only' -# assembly -data_registry['Malania2007_vernier-only'] = lambda: load_assembly_from_s3( - identifier='Malania2007_vernier-only', - version_id="", - sha1="", - bucket="brainio-brainscore", - cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_vernier-only'), + csv_sha1="9b340fe242117482f6992f48a805297215ba9924", + zip_sha1="4a90d511a3ceb3307a672177a3ad6b76521e65e5", + csv_version_id="sYBPEmXDgbWipuepciLirlorQE3L8BLc", + zip_version_id="pYvOkrLxadkQ67K3__wmciNwaCW.hyyN" ) - -# stimulus set stimulus_set_registry['Malania2007_vernier-only'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_vernier-only', bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") - -# stimulus set fitting stimuli -stimulus_set_registry['Malania2007_vernier-only_fit'] = lambda: load_stimulus_set_from_s3( - identifier='Malania2007_vernier-only_fit', - bucket="brainio-brainscore", - csv_sha1="", - zip_sha1="", - csv_version_id="", - zip_version_id="") \ No newline at end of file + csv_sha1="b2cb0f2ed32426b739f90187ae24ad4adf84110d", + zip_sha1="0e177aea523adc320070196fbb777af4cdba2144", + csv_version_id="c8wpZpqoMqdATlqdoq3srPUi_8fYg6a.", + zip_version_id="28lHgxERhw32Ux6IBCxWWTtRwIaRrwo6" +) diff --git a/brainscore_vision/data/malania2007/malania_data_assembly.py b/brainscore_vision/data/malania2007/malania_data_assembly.py index a3110e094..f24c8fc5d 100644 --- a/brainscore_vision/data/malania2007/malania_data_assembly.py +++ b/brainscore_vision/data/malania2007/malania_data_assembly.py @@ -72,10 +72,14 @@ def remove_subjects_with_nans(assembly1, assembly2): if __name__ == '__main__': - root_directory = Path(r'../../../packaging/malania2007/malania2007_data_assembly') + root_directory = Path(r'../../data/malania2007/data_packaging/') for dataset in DATASETS: assembly = collect_malania_data_assembly(root_directory, dataset) # upload to S3 - #package_data_assembly('brainio_brainscore', 
assembly, assembly_identifier=assembly.name, - # stimulus_set_identifier=f"Malania2007_{dataset}", - # assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") + prints = package_data_assembly(catalog_identifier=None, + proto_data_assembly=assembly, + assembly_identifier=assembly.name, + stimulus_set_identifier=assembly.name, + assembly_class_name="PropertyAssembly", + bucket_name="brainio-brainscore") + print(prints) \ No newline at end of file diff --git a/brainscore_vision/data/malania2007/malania_stimulus_set.py b/brainscore_vision/data/malania2007/malania_stimulus_set.py index fa6eadfe0..5b3b6f057 100644 --- a/brainscore_vision/data/malania2007/malania_stimulus_set.py +++ b/brainscore_vision/data/malania2007/malania_stimulus_set.py @@ -9,7 +9,7 @@ 'long-2', 'equal-16', 'long-16', 'vernier-only', 'short-2_fit', 'short-4_fit', 'short-6_fit', 'short-8_fit', 'short-16_fit', 'equal-2_fit', 'long-2_fit', 'equal-16_fit', 'long-16_fit'] -DATASET_LENGTHS = {'test': 1225, 'fit': 1225} +DATASET_LENGTHS = {'test': 50, 'fit': 500} def collect_malania_stimulus_set(root_directory, dataset): @@ -38,20 +38,20 @@ def collect_malania_stimulus_set(root_directory, dataset): reader = csv.DictReader(metadata) for row in reader: stimuli.append({ - 'image_size_x': int(row['image_size_x']), - 'image_size_y': int(row['image_size_y']), + 'image_size_x_pix': int(row['image_size_x_pix']), + 'image_size_y_pix': int(row['image_size_y_pix']), 'image_size_c': int(row['image_size_c']), 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height': float(row['vernier_height']), - 'vernier_offset': float(row['vernier_offset']), + 'vernier_height_arcsec': float(row['vernier_height_arcsec']), + 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), 'image_label': row['image_label'], - 'flanker_height': float(row['flanker_height']), - 'flanker_spacing': float(row['flanker_spacing']), - 'line_width': float(row['line_width']), - 'flanker_distance': float(row['flanker_distance']), + 'flanker_height_arcsec': float(row['flanker_height_arcsec']), + 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), + 'line_width_arcsec': float(row['line_width_arcsec']), + 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), 'num_flankers': int(row['num_flankers']), - 'vernier_position_x': int(row['vernier_position_x']), - 'vernier_position_y': int(row['vernier_position_y']), + 'vernier_position_x_pix': int(row['vernier_position_x_pix']), + 'vernier_position_y_pix': int(row['vernier_position_y_pix']), 'stimulus_id': str(row['stimulus_id']), }) stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') @@ -59,23 +59,21 @@ def collect_malania_stimulus_set(root_directory, dataset): stimuli = StimulusSet(stimuli) stimuli.stimulus_paths = stimulus_paths stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name + stimuli.identifier = f'Malania2007_{dataset}' # Ensure expected number of stimuli in datasets assert len(stimuli) == DATASET_LENGTHS[dataset_type] return stimuli -def return_local_stimulus_set(dataset): - root_directory = Path(r'../../../packaging/malania2007/malania2007_stimulus_set') - stimuli = collect_malania_stimulus_set(root_directory, dataset) - return stimuli - - if __name__ == '__main__': - root_directory = Path(r'../../../packaging/malania2007/malania2007_stimulus_set') + root_directory = Path(r'../../data/malania2007/data_packaging/') for stimulus_set in STIMULUS_SETS: stimuli = collect_malania_stimulus_set(root_directory, 
stimulus_set) # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") + prints = package_stimulus_set(catalog_name=None, + proto_stimulus_set=stimuli, + stimulus_set_identifier=stimuli.name, + bucket_name="brainio-brainscore") + print(prints) diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py index 20207258c..4fd83059f 100644 --- a/brainscore_vision/data/malania2007/test.py +++ b/brainscore_vision/data/malania2007/test.py @@ -46,7 +46,7 @@ def test_fields_present(self, identifier, field): assert hasattr(assembly, field) -@pytest.mark.slow +@pytest.mark.private_access class TestStimulusSets: # test stimulus_set data: @pytest.mark.parametrize('identifier', [ @@ -76,29 +76,28 @@ def test_stimulus_set_exist(self, identifier): assert stimulus_set is not None assert stimulus_set.identifier == full_name - # test the number of images @pytest.mark.parametrize('identifier, num_images', [ - ('short-2', 1225), - ('short-4', 1225), - ('short-6', 1225), - ('short-8', 1225), - ('short-16', 1225), - ('equal-2', 1225), - ('long-2', 1225), - ('equal-16', 1225), - ('long-16', 1225), - ('short-2_fit', 1225), - ('short-4_fit', 1225), - ('short-6_fit', 1225), - ('short-8_fit', 1225), - ('short-16_fit', 1225), - ('equal-2_fit', 1225), - ('long-2_fit', 1225), - ('equal-16_fit', 1225), - ('long-16_fit', 1225), - ('vernier-only', 1225) + ('short-2', 50), + ('short-4', 50), + ('short-6', 50), + ('short-8', 50), + ('short-16', 50), + ('equal-2', 50), + ('long-2', 50), + ('equal-16', 50), + ('long-16', 50), + ('short-2_fit', 50), + ('short-4_fit', 50), + ('short-6_fit', 50), + ('short-8_fit', 50), + ('short-16_fit', 50), + ('equal-2_fit', 50), + ('long-2_fit', 50), + ('equal-16_fit', 50), + ('long-16_fit', 50), + ('vernier-only', 50) ]) - def test_num_images(self, identifier, num_images): + def test_number_of_images(self, identifier, num_images): stimulus_set = load_stimulus_set(f"Malania2007_{identifier}") assert len(np.unique(stimulus_set['stimulus_id'].values)) == num_images @@ -125,20 +124,20 @@ def test_num_images(self, identifier, num_images): 'vernier-only' ]) @pytest.mark.parametrize('field', [ - 'image_size_x', - 'image_size_y', + 'image_size_x_pix', + 'image_size_y_pix', 'image_size_c', 'image_size_degrees', - 'vernier_height', - 'vernier_offset', + 'vernier_height_arcsec', + 'vernier_offset_arcsec', 'image_label', - 'flanker_height', - 'flanker_spacing', - 'line_width', - 'flanker_distance', + 'flanker_height_arcsec', + 'flanker_spacing_arcsec', + 'line_width_arcsec', + 'flanker_distance_arcsec', 'num_flankers', - 'vernier_position_x', - 'vernier_position_y', + 'vernier_position_x_pix', + 'vernier_position_y_pix', 'stimulus_id', ]) def test_fields_present(self, identifier, field): From 199b993326bbda652354f5682f72285a4167cf1e Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 13:55:01 +0200 Subject: [PATCH 33/65] more arcane knowledge, it's endless --- .../benchmarks/malania2007/__init__.py | 16 +++++++++------- .../benchmarks/malania2007/benchmark.py | 5 ----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/__init__.py b/brainscore_vision/benchmarks/malania2007/__init__.py index 349d21902..f0b56d84c 100644 --- a/brainscore_vision/benchmarks/malania2007/__init__.py +++ b/brainscore_vision/benchmarks/malania2007/__init__.py @@ -1,10 +1,12 @@ from brainscore_vision import benchmark_registry - -# 
Malania2007 - from . import benchmark -for dataset in benchmark.DATASETS: - assembly_identifier = f"Malania2007_{dataset}" - benchmark_ctr = getattr(benchmark, f"{assembly_identifier}") - benchmark_registry[f"{assembly_identifier}"] = benchmark_ctr +benchmark_registry['Malania2007_short-2'] = lambda: benchmark._Malania2007Base('short-2') +benchmark_registry['Malania2007_short-4'] = lambda: benchmark._Malania2007Base('short-4') +benchmark_registry['Malania2007_short-6'] = lambda: benchmark._Malania2007Base('short-6') +benchmark_registry['Malania2007_short-8'] = lambda: benchmark._Malania2007Base('short-8') +benchmark_registry['Malania2007_short-16'] = lambda: benchmark._Malania2007Base('short-16') +benchmark_registry['Malania2007_equal-2'] = lambda: benchmark._Malania2007Base('equal-2') +benchmark_registry['Malania2007_long-2'] = lambda: benchmark._Malania2007Base('long-2') +benchmark_registry['Malania2007_equal-16'] = lambda: benchmark._Malania2007Base('equal-16') +benchmark_registry['Malania2007_long-16'] = lambda: benchmark._Malania2007Base('long-16') diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index c0efe035d..e073614a0 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -32,11 +32,6 @@ 'short-16': 16, 'equal-2': 2, 'long-2': 2, 'equal-16': 16, 'long-16': 16, 'vernier-only': 0} -for dataset in DATASETS: - # behavioral benchmark - identifier = f"Malania2007_{dataset}" - globals()[identifier] = lambda dataset=dataset: _Malania2007Base(dataset) - class _Malania2007Base(BenchmarkBase): """ From c955c3475536af6f9f3828609fa8b59c0dccd795 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 15:43:53 +0200 Subject: [PATCH 34/65] move packaging files --- .../data/malania2007/malania_data_assembly.py | 85 ------------------- .../data/malania2007/malania_stimulus_set.py | 79 ----------------- 2 files changed, 164 deletions(-) delete mode 100644 brainscore_vision/data/malania2007/malania_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/malania_stimulus_set.py diff --git a/brainscore_vision/data/malania2007/malania_data_assembly.py b/brainscore_vision/data/malania2007/malania_data_assembly.py deleted file mode 100644 index f24c8fc5d..000000000 --- a/brainscore_vision/data/malania2007/malania_data_assembly.py +++ /dev/null @@ -1,85 +0,0 @@ -from pathlib import Path -import numpy as np -import xarray as xr - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly -import pandas as pd - - -DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', - 'long-2', 'equal-16', 'long-16', 'vernier-only'] -NUM_SUBJECTS = {'short-2': 6, - 'short-4': 5, - 'short-6': 5, - 'short-8': 5, - 'short-16': 6, - 'equal-2': 5, - 'long-2': 5, - 'equal-16': 5, - 'long-16': 5, - 'vernier-only': 6} - - -def collect_malania_data_assembly(root_directory, dataset): - """ - Experiment Information: - - 5-6 observers per condition (for exact value, see NUM_SUBJECTS) - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - """ - # construct the assembly - metadata_directory = Path(f'{root_directory}/{dataset}/metadata_human.xlsx') - metadata = pd.read_excel(metadata_directory) - # Since subjects are uniquely held using 'unique_subject_id', drop the 
rows with a subject - # without measurement - assembly = PropertyAssembly(metadata['threshold'], - coords={ - 'subject_unique_id': ('subject', metadata['subject_unique_id']) - }, - dims=['subject'] - ) - - # give the assembly an identifier name - assembly.name = f'Malania2007_{dataset}' - - # test subject numbers after removing the NaN subject - metadata = metadata.dropna(subset=['threshold'], axis=0) - assert len(metadata) == NUM_SUBJECTS[dataset] - - return assembly - - -def return_local_data_assembly(dataset): - root_directory = Path(r'../../../packaging/malania2007/malania2007_data_assembly') - assembly = collect_malania_data_assembly(root_directory, dataset) - return assembly - - -def remove_subjects_with_nans(assembly1, assembly2): - # Find the indices of the subjects with NaN values in the first PropertyAssembly - nan_subjects = np.isnan(assembly1.values) - - # Convert the boolean array to a DataArray with the same coordinates as the input assemblies - nan_subjects_da = xr.DataArray(nan_subjects, coords=assembly1.coords, dims=assembly1.dims) - - # Filter out the subjects with NaN values from both PropertyAssemblies - filtered_assembly1 = assembly1.where(~nan_subjects_da, drop=True) - filtered_assembly2 = assembly2.where(~nan_subjects_da, drop=True) - - return filtered_assembly1, filtered_assembly2 - - -if __name__ == '__main__': - root_directory = Path(r'../../data/malania2007/data_packaging/') - for dataset in DATASETS: - assembly = collect_malania_data_assembly(root_directory, dataset) - # upload to S3 - prints = package_data_assembly(catalog_identifier=None, - proto_data_assembly=assembly, - assembly_identifier=assembly.name, - stimulus_set_identifier=assembly.name, - assembly_class_name="PropertyAssembly", - bucket_name="brainio-brainscore") - print(prints) \ No newline at end of file diff --git a/brainscore_vision/data/malania2007/malania_stimulus_set.py b/brainscore_vision/data/malania2007/malania_stimulus_set.py deleted file mode 100644 index 5b3b6f057..000000000 --- a/brainscore_vision/data/malania2007/malania_stimulus_set.py +++ /dev/null @@ -1,79 +0,0 @@ -import csv -from pathlib import Path -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - - -# every stimulus set is separate, incl. 
baseline condition -STIMULUS_SETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', - 'long-2', 'equal-16', 'long-16', 'vernier-only', 'short-2_fit', - 'short-4_fit', 'short-6_fit', 'short-8_fit', 'short-16_fit', - 'equal-2_fit', 'long-2_fit', 'equal-16_fit', 'long-16_fit'] -DATASET_LENGTHS = {'test': 50, 'fit': 500} - - -def collect_malania_stimulus_set(root_directory, dataset): - """ - Dataset Meta Info - - Reported in pixels: - - image_size_x; image_size_y - - vernier_position_x; vernier_position_y - - Reported in arcsec: - - vernier_height (height of the vernier elements combined, - middle gap) - - vernier_offset (horizontal offset between flankers) - - flanker_height (height of the flanker elements) - - flanker_spacing (distance between a flanker element and another flanker element) - - line_width (width of all the lines in all elements) - - flanker_distance (distance between a flanker and a vernier) - """ - stimuli = [] - stimulus_paths = {} - - dataset_type = 'fit' if dataset[-3:] == 'fit' else 'test' - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - stimuli.identifier = f'Malania2007_{dataset}' - - # Ensure expected number of stimuli in datasets - assert len(stimuli) == DATASET_LENGTHS[dataset_type] - return stimuli - - -if __name__ == '__main__': - root_directory = Path(r'../../data/malania2007/data_packaging/') - for stimulus_set in STIMULUS_SETS: - stimuli = collect_malania_stimulus_set(root_directory, stimulus_set) - - # upload to S3 - prints = package_stimulus_set(catalog_name=None, - proto_stimulus_set=stimuli, - stimulus_set_identifier=stimuli.name, - bucket_name="brainio-brainscore") - print(prints) From d444e27cc1ac29d357339fc33640e69d3a3206c3 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 15:45:30 +0200 Subject: [PATCH 35/65] add packaging files --- .../data_packaging/malania_data_assembly.py | 85 +++++++++++++++++++ .../data_packaging/malania_stimulus_set.py | 79 +++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py create mode 100644 brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py diff --git 
a/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py new file mode 100644 index 000000000..f24c8fc5d --- /dev/null +++ b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py @@ -0,0 +1,85 @@ +from pathlib import Path +import numpy as np +import xarray as xr + +from brainio.assemblies import PropertyAssembly +from brainio.packaging import package_data_assembly +import pandas as pd + + +DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only'] +NUM_SUBJECTS = {'short-2': 6, + 'short-4': 5, + 'short-6': 5, + 'short-8': 5, + 'short-16': 6, + 'equal-2': 5, + 'long-2': 5, + 'equal-16': 5, + 'long-16': 5, + 'vernier-only': 6} + + +def collect_malania_data_assembly(root_directory, dataset): + """ + Experiment Information: + - 5-6 observers per condition (for exact value, see NUM_SUBJECTS) + - 2AFC left/right offset discrimination task + - PEST staircase to 75% correct responses + - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit + """ + # construct the assembly + metadata_directory = Path(f'{root_directory}/{dataset}/metadata_human.xlsx') + metadata = pd.read_excel(metadata_directory) + # Since subjects are uniquely held using 'unique_subject_id', drop the rows with a subject + # without measurement + assembly = PropertyAssembly(metadata['threshold'], + coords={ + 'subject_unique_id': ('subject', metadata['subject_unique_id']) + }, + dims=['subject'] + ) + + # give the assembly an identifier name + assembly.name = f'Malania2007_{dataset}' + + # test subject numbers after removing the NaN subject + metadata = metadata.dropna(subset=['threshold'], axis=0) + assert len(metadata) == NUM_SUBJECTS[dataset] + + return assembly + + +def return_local_data_assembly(dataset): + root_directory = Path(r'../../../packaging/malania2007/malania2007_data_assembly') + assembly = collect_malania_data_assembly(root_directory, dataset) + return assembly + + +def remove_subjects_with_nans(assembly1, assembly2): + # Find the indices of the subjects with NaN values in the first PropertyAssembly + nan_subjects = np.isnan(assembly1.values) + + # Convert the boolean array to a DataArray with the same coordinates as the input assemblies + nan_subjects_da = xr.DataArray(nan_subjects, coords=assembly1.coords, dims=assembly1.dims) + + # Filter out the subjects with NaN values from both PropertyAssemblies + filtered_assembly1 = assembly1.where(~nan_subjects_da, drop=True) + filtered_assembly2 = assembly2.where(~nan_subjects_da, drop=True) + + return filtered_assembly1, filtered_assembly2 + + +if __name__ == '__main__': + root_directory = Path(r'../../data/malania2007/data_packaging/') + for dataset in DATASETS: + assembly = collect_malania_data_assembly(root_directory, dataset) + # upload to S3 + prints = package_data_assembly(catalog_identifier=None, + proto_data_assembly=assembly, + assembly_identifier=assembly.name, + stimulus_set_identifier=assembly.name, + assembly_class_name="PropertyAssembly", + bucket_name="brainio-brainscore") + print(prints) \ No newline at end of file diff --git a/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py new file mode 100644 index 000000000..5b3b6f057 --- /dev/null +++ b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py @@ -0,0 +1,79 @@ 
+import csv +from pathlib import Path +from brainio.stimuli import StimulusSet +from brainio.packaging import package_stimulus_set + + +# every stimulus set is separate, incl. baseline condition +STIMULUS_SETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only', 'short-2_fit', + 'short-4_fit', 'short-6_fit', 'short-8_fit', 'short-16_fit', + 'equal-2_fit', 'long-2_fit', 'equal-16_fit', 'long-16_fit'] +DATASET_LENGTHS = {'test': 50, 'fit': 500} + + +def collect_malania_stimulus_set(root_directory, dataset): + """ + Dataset Meta Info + + Reported in pixels: + - image_size_x; image_size_y + - vernier_position_x; vernier_position_y + + Reported in arcsec: + - vernier_height (height of the vernier elements combined, - middle gap) + - vernier_offset (horizontal offset between flankers) + - flanker_height (height of the flanker elements) + - flanker_spacing (distance between a flanker element and another flanker element) + - line_width (width of all the lines in all elements) + - flanker_distance (distance between a flanker and a vernier) + """ + stimuli = [] + stimulus_paths = {} + + dataset_type = 'fit' if dataset[-3:] == 'fit' else 'test' + metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') + image_directory = Path(f'{root_directory}/{dataset}/images') + with open(metadata_directory, 'r') as metadata: + reader = csv.DictReader(metadata) + for row in reader: + stimuli.append({ + 'image_size_x_pix': int(row['image_size_x_pix']), + 'image_size_y_pix': int(row['image_size_y_pix']), + 'image_size_c': int(row['image_size_c']), + 'image_size_degrees': float(row['image_size_degrees']), + 'vernier_height_arcsec': float(row['vernier_height_arcsec']), + 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), + 'image_label': row['image_label'], + 'flanker_height_arcsec': float(row['flanker_height_arcsec']), + 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), + 'line_width_arcsec': float(row['line_width_arcsec']), + 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), + 'num_flankers': int(row['num_flankers']), + 'vernier_position_x_pix': int(row['vernier_position_x_pix']), + 'vernier_position_y_pix': int(row['vernier_position_y_pix']), + 'stimulus_id': str(row['stimulus_id']), + }) + stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') + + stimuli = StimulusSet(stimuli) + stimuli.stimulus_paths = stimulus_paths + stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name + stimuli.identifier = f'Malania2007_{dataset}' + + # Ensure expected number of stimuli in datasets + assert len(stimuli) == DATASET_LENGTHS[dataset_type] + return stimuli + + +if __name__ == '__main__': + root_directory = Path(r'../../data/malania2007/data_packaging/') + for stimulus_set in STIMULUS_SETS: + stimuli = collect_malania_stimulus_set(root_directory, stimulus_set) + + # upload to S3 + prints = package_stimulus_set(catalog_name=None, + proto_stimulus_set=stimuli, + stimulus_set_identifier=stimuli.name, + bucket_name="brainio-brainscore") + print(prints) From f29ebe80b491cc198669e120b1c43c2ac64a0c4d Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 15:47:14 +0200 Subject: [PATCH 36/65] modify packaging paths --- .../malania2007/data_packaging/malania_data_assembly.py | 8 +------- .../malania2007/data_packaging/malania_stimulus_set.py | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git 
a/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py index f24c8fc5d..defd937f8 100644 --- a/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py +++ b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py @@ -51,12 +51,6 @@ def collect_malania_data_assembly(root_directory, dataset): return assembly -def return_local_data_assembly(dataset): - root_directory = Path(r'../../../packaging/malania2007/malania2007_data_assembly') - assembly = collect_malania_data_assembly(root_directory, dataset) - return assembly - - def remove_subjects_with_nans(assembly1, assembly2): # Find the indices of the subjects with NaN values in the first PropertyAssembly nan_subjects = np.isnan(assembly1.values) @@ -72,7 +66,7 @@ def remove_subjects_with_nans(assembly1, assembly2): if __name__ == '__main__': - root_directory = Path(r'../../data/malania2007/data_packaging/') + root_directory = Path(r'../data/malania2007/data_packaging/') for dataset in DATASETS: assembly = collect_malania_data_assembly(root_directory, dataset) # upload to S3 diff --git a/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py index 5b3b6f057..8a9f63fde 100644 --- a/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py +++ b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py @@ -67,7 +67,7 @@ def collect_malania_stimulus_set(root_directory, dataset): if __name__ == '__main__': - root_directory = Path(r'../../data/malania2007/data_packaging/') + root_directory = Path(r'../data/malania2007/data_packaging/') for stimulus_set in STIMULUS_SETS: stimuli = collect_malania_stimulus_set(root_directory, stimulus_set) From 44451e572529e215ab715ecbb5473587b74263a3 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 15:56:00 +0200 Subject: [PATCH 37/65] remove redundant files --- .../equal-16/equal-16_data_assembly.py | 42 ---------- .../equal-16/equal-16_stimulus_set.py | 77 ------------------- .../equal-2/equal-2_data_assembly.py | 43 ----------- .../equal-2/equal-2_stimulus_set.py | 77 ------------------- .../long-16/long-16_data_assembly.py | 42 ---------- .../long-16/long-16_stimulus_set.py | 77 ------------------- .../long-2/long-2_data_assembly.py | 42 ---------- .../long-2/long-2_stimulus_set.py | 77 ------------------- .../data_packaging/short-16/short-16.py | 42 ---------- .../short-16/short-16_stimulus_set.py | 77 ------------------- .../short-2/short-2_data_assembly.py | 42 ---------- .../short-2/short-2_stimulus_set.py | 77 ------------------- .../short-4/short-4_data_assembly.py | 42 ---------- .../short-4/short-4_stimulus_set.py | 77 ------------------- .../short-6/short-6_data_assembly.py | 42 ---------- .../short-6/short-6_stimulus_set.py | 77 ------------------- .../short-8/short-8_data_assembly.py | 42 ---------- .../short-8/short-8_stimulus_set.py | 77 ------------------- .../vernier-only_data_assembly.py | 42 ---------- .../vernier-only/vernier-only_stimulus_set.py | 68 ---------------- 20 files changed, 1182 deletions(-) delete mode 100644 brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_data_assembly.py delete mode 
100644 brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/long-16/long-16_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/long-16/long-16_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/long-2/long-2_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/long-2/long-2_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-16/short-16.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-16/short-16_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-2/short-2_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-2/short-2_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-4/short-4_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-4/short-4_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-6/short-6_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-6/short-6_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-8/short-8_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/short-8/short-8_stimulus_set.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_data_assembly.py delete mode 100644 brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_stimulus_set.py diff --git a/brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_data_assembly.py deleted file mode 100644 index d64e8f099..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 5 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 5 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['equal-16', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_equal-16' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 5 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 5 -assert len(assembly['condition']) == 5 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'equal-16').all() -assert (assembly['subject_unique_id'].values == [1, 3, 4, 5, 6]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_equal-16", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") 
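# ---------------------------------------------------------------------------
# Editor's illustrative aside (not part of the patch above or below).
# The packaging scripts in this series describe human thresholds as "measured
# with a cumulative gaussian psychometric function with a likelihood fit",
# read off at the 75%-correct point of a 2AFC PEST staircase, and the
# benchmark compares each flanked condition's threshold against the
# vernier-only baseline (threshold elevation). The sketch below shows one way
# such a threshold could be computed. It is a minimal example under stated
# assumptions: the arrays `offsets` and `accuracies` are synthetic, the fit is
# least-squares rather than the paper's likelihood fit, and none of these
# names are taken from the repository's ThresholdElevation implementation.
import numpy as np
from scipy.optimize import curve_fit
from scipy.stats import norm


def psychometric(offset, mu, sigma):
    # Cumulative-Gaussian psychometric function for a 2AFC task: maps vernier
    # offset (arcsec) to P(correct), saturating between 0.5 (chance) and 1.0.
    return 0.5 + 0.5 * norm.cdf(offset, loc=mu, scale=sigma)


def fit_threshold(offsets, accuracies, threshold_accuracy=0.75):
    # Fit (mu, sigma) to the observed accuracies, then invert the curve to
    # find the offset where accuracy crosses `threshold_accuracy` (75%,
    # matching the PEST staircase target described in the docstrings).
    (mu, sigma), _ = curve_fit(psychometric, offsets, accuracies,
                               p0=[np.median(offsets), np.std(offsets)],
                               bounds=([-np.inf, 1e-6], [np.inf, np.inf]))
    return norm.ppf((threshold_accuracy - 0.5) / 0.5, loc=mu, scale=sigma)


if __name__ == '__main__':
    # Synthetic accuracies that rise with offset; the printed value is the
    # offset (arcsec) at ~75% correct. Threshold elevation would then be this
    # value divided by the same quantity for the vernier-only baseline.
    offsets = np.array([10.0, 20.0, 40.0, 80.0, 160.0])
    accuracies = np.array([0.52, 0.60, 0.74, 0.90, 0.98])
    print(fit_threshold(offsets, accuracies))
# ---------------------------------------------------------------------------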
diff --git a/brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_stimulus_set.py deleted file mode 100644 index 789d5a2ae..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/equal-16/equal-16_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'equal-16') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'equal-16_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_data_assembly.py deleted file mode 100644 index 56a6dffec..000000000 --- 
a/brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_data_assembly.py +++ /dev/null @@ -1,43 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 5 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 5 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['equal-2',] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_equal-2' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 5 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 5 -assert len(assembly['condition']) == 5 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'equal-2').all() -assert (assembly['subject_unique_id'].values == [1, 3, 4, 5, 6]).all() - - -package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, - stimulus_set_identifier=f"Malania2007_equal-2", - assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_stimulus_set.py deleted file mode 100644 index 5b0ef2876..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/equal-2/equal-2_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': 
float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'equal-2') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'equal-2_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/long-16/long-16_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/long-16/long-16_data_assembly.py deleted file mode 100644 index 5b1ff14f4..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/long-16/long-16_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 5 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 5 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['long-16', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_long-16' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 5 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 5 -assert len(assembly['condition']) == 5 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'long-16').all() -assert (assembly['subject_unique_id'].values == [1, 3, 4, 5, 6]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_long-16", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/long-16/long-16_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/long-16/long-16_stimulus_set.py deleted file mode 100644 index 9c675c3c2..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/long-16/long-16_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import 
StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'long-16') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'long-16_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/long-2/long-2_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/long-2/long-2_data_assembly.py deleted file mode 100644 index fd25c3dbd..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/long-2/long-2_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 5 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a 
likelihood fit - -''' - -num_subjects = 5 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['long-2', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_long-2' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 5 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 5 -assert len(assembly['condition']) == 5 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'long-2').all() -assert (assembly['subject_unique_id'].values == [1, 3, 4, 5, 6]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_long-2", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/long-2/long-2_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/long-2/long-2_stimulus_set.py deleted file mode 100644 index 6d9538a17..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/long-2/long-2_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = 
stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'long-2') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'long-2_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-16/short-16.py b/brainscore_vision/data/malania2007/data_packaging/short-16/short-16.py deleted file mode 100644 index 9b3b3e456..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-16/short-16.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 6 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 6 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['short-16', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_short-16' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 6 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 6 -assert len(assembly['condition']) == 6 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'short-16').all() -assert (assembly['subject_unique_id'].values == [1, 2, 3, 4, 5, 6]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_short-16", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-16/short-16_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/short-16/short-16_stimulus_set.py deleted file mode 100644 index e6a880af0..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-16/short-16_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element 
and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-16') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-16_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-2/short-2_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/short-2/short-2_data_assembly.py deleted file mode 100644 index 17256fa75..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-2/short-2_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 6 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 6 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['short-2', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 
'Malania2007_short-2' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 6 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 6 -assert len(assembly['condition']) == 6 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'short-2').all() -assert (assembly['subject_unique_id'].values == [1, 2, 3, 4, 5, 6]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_short-2", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-2/short-2_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/short-2/short-2_stimulus_set.py deleted file mode 100644 index 75106cf20..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-2/short-2_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-2') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, 
stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-2_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-4/short-4_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/short-4/short-4_data_assembly.py deleted file mode 100644 index 4523a46fd..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-4/short-4_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 5 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 5 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['short-4', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_short-4' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 5 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 5 -assert len(assembly['condition']) == 5 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'short-4').all() -assert (assembly['subject_unique_id'].values == [1, 2, 3, 4, 5]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_short-4", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-4/short-4_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/short-4/short-4_stimulus_set.py deleted file mode 100644 index f1216c6aa..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-4/short-4_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with 
open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-4') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-4_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-6/short-6_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/short-6/short-6_data_assembly.py deleted file mode 100644 index 155149a91..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-6/short-6_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 5 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 5 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['short-6', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_short-6' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 5 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 5 -assert len(assembly['condition']) == 5 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'short-6').all() -assert (assembly['subject_unique_id'].values == [1, 2, 3, 4, 5]).all() - 
-# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_short-6", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-6/short-6_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/short-6/short-6_stimulus_set.py deleted file mode 100644 index ae656a1cf..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-6/short-6_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-6') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-6_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git 
a/brainscore_vision/data/malania2007/data_packaging/short-8/short-8_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/short-8/short-8_data_assembly.py deleted file mode 100644 index 16f044a8e..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-8/short-8_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 5 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 5 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['short-8', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_short-8' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 5 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 5 -assert len(assembly['condition']) == 5 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'short-8').all() -assert (assembly['subject_unique_id'].values == [1, 2, 3, 4, 5]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_short-8", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/short-8/short-8_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/short-8/short-8_stimulus_set.py deleted file mode 100644 index 85923a852..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/short-8/short-8_stimulus_set.py +++ /dev/null @@ -1,77 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': 
float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-8') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") - - train_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'short-8_fit') - - # Ensure expected number of stimuli in datasets - assert len(train_stimuli) == 500 - - # upload to S3 - # package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_data_assembly.py deleted file mode 100644 index ace2de989..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_data_assembly.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -from brainio.assemblies import PropertyAssembly -from brainio.packaging import package_data_assembly - -''' -Experiment Information: - - 6 subjects - - 2AFC left/right offset discrimination task - - PEST staircase to 75% correct responses - - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit - -''' - -num_subjects = 6 -all_subjects = pd.read_excel('./metadata_human.xlsx') -this_experiment_subjects = all_subjects.dropna(subset=['threshold'], axis=0) -assembly = PropertyAssembly(this_experiment_subjects['threshold'], - coords={ - 'subject_unique_id': ('subject', this_experiment_subjects['subject_unique_id']), - 'condition': ('subject', ['vernier-only', ] * num_subjects), - }, - dims=['subject'] - ) - -# assign assembly an identifier name -assembly.name = 'Malania2007_vernier-only' - -# make sure assembly dims are correct length -assert len(assembly['subject']) == 6 - -# make sure assembly coords are correct length -assert len(assembly['subject_unique_id']) == 6 -assert len(assembly['condition']) == 6 - -# make sure assembly coords are correct values -assert (assembly['condition'].values == 'vernier-only').all() -assert (assembly['subject_unique_id'].values == [1, 2, 3, 4, 5, 6]).all() - -# package_data_assembly('brainio_brainscore', assembly, assembly_identifier=assembly.name, -# stimulus_set_identifier=f"Malania2007_vernier-only", -# assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") diff --git a/brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_stimulus_set.py 
b/brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_stimulus_set.py deleted file mode 100644 index df6f856c0..000000000 --- a/brainscore_vision/data/malania2007/data_packaging/vernier-only/vernier-only_stimulus_set.py +++ /dev/null @@ -1,68 +0,0 @@ -import csv -from pathlib import Path - -from brainio.stimuli import StimulusSet -from brainio.packaging import package_stimulus_set - -''' -Dataset Meta Info - -Reported in pixels: - - image_size_x_pix; image_size_y_pix - - vernier_position_x_pix; vernier_position_y_pix - -Reported in arcsec: - - vernier_height_arcsec (height of the vernier elements combined, - middle gap) - - vernier_offset_arcsec (horizontal offset between flankers) - - flanker_height_arcsec (height of the flanker elements) - - flanker_spacing_arcsec (distance between a flanker element and another flanker element) - - line_width_arcsec (width of all the lines in all elements) - - flanker_distance_arcsec (distance between a flanker and a vernier) -''' - - -def collect_malania_stimulus_set(root_directory, dataset): - stimuli = [] - stimulus_paths = {} - - metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') - image_directory = Path(f'{root_directory}/{dataset}/images') - with open(metadata_directory, 'r') as metadata: - reader = csv.DictReader(metadata) - for row in reader: - stimuli.append({ - 'image_size_x_pix': int(row['image_size_x_pix']), - 'image_size_y_pix': int(row['image_size_y_pix']), - 'image_size_c': int(row['image_size_c']), - 'image_size_degrees': float(row['image_size_degrees']), - 'vernier_height_arcsec': float(row['vernier_height_arcsec']), - 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), - 'image_label': row['image_label'], - 'flanker_height_arcsec': float(row['flanker_height_arcsec']), - 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), - 'line_width_arcsec': float(row['line_width_arcsec']), - 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), - 'num_flankers': int(row['num_flankers']), - 'vernier_position_x_pix': int(row['vernier_position_x_pix']), - 'vernier_position_y_pix': int(row['vernier_position_y_pix']), - 'stimulus_id': str(row['stimulus_id']), - }) - stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') - - stimuli = StimulusSet(stimuli) - stimuli.stimulus_paths = stimulus_paths - stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name - - return stimuli - - -if __name__ == '__main__': - stimulus_root_directory = Path(r'../stimuli') - test_stimuli = collect_malania_stimulus_set(stimulus_root_directory, 'vernier-only') - - # Ensure expected number of stimuli in datasets - assert len(test_stimuli) == 50 - - # upload to S3 - #package_stimulus_set("brainio_brainscore", stimuli, stimulus_set_identifier=stimuli.name, - # bucket_name="brainio-brainscore") From 6e56cf6ccd0ca69745c5d995aa3dfeef529e2a15 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 15:57:11 +0200 Subject: [PATCH 38/65] remove redundant import --- brainscore_vision/benchmark_helpers/screen.py | 1 - 1 file changed, 1 deletion(-) diff --git a/brainscore_vision/benchmark_helpers/screen.py b/brainscore_vision/benchmark_helpers/screen.py index e1290142c..8cd1249ed 100644 --- a/brainscore_vision/benchmark_helpers/screen.py +++ b/brainscore_vision/benchmark_helpers/screen.py @@ -3,7 +3,6 @@ """ import copy import logging -from typing import Union import os import shutil from typing import Union From 48f007a6a111562fc330b64fb6ab135b7cbd70a8 
Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 16:54:45 +0200 Subject: [PATCH 39/65] fix stimulus set / assembly link --- .../data/malania2007/__init__.py | 20 +++++++++---------- .../data_packaging/malania_data_assembly.py | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/brainscore_vision/data/malania2007/__init__.py b/brainscore_vision/data/malania2007/__init__.py index 2d4b28ab9..007840449 100644 --- a/brainscore_vision/data/malania2007/__init__.py +++ b/brainscore_vision/data/malania2007/__init__.py @@ -24,7 +24,7 @@ sha1="277b2fbffed00e16b6a69b488f73eeda5abaaf10", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_equal-2'), + stimulus_set_loader=None, ) data_registry['Malania2007_equal-16'] = lambda: load_assembly_from_s3( identifier='Malania2007_equal-16', @@ -32,7 +32,7 @@ sha1="ef49506238e8d2554918b113fbc60c133077186e", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_equal-16'), + stimulus_set_loader=None, ) data_registry['Malania2007_long-2'] = lambda: load_assembly_from_s3( identifier='Malania2007_long-2', @@ -40,7 +40,7 @@ sha1="9076a5b693948c4992b6c8e753f04a7acd2014a1", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-2'), + stimulus_set_loader=None, ) data_registry['Malania2007_long-16'] = lambda: load_assembly_from_s3( identifier='Malania2007_long-16', @@ -48,7 +48,7 @@ sha1="3106cf1f2fa9e66617ebf231df05d29077fc478f", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_long-16'), + stimulus_set_loader=None, ) data_registry['Malania2007_short-2'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-2', @@ -56,7 +56,7 @@ sha1="85fb65ad76de48033c704b9c5689771e1ea0457d", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-2'), + stimulus_set_loader=None, ) data_registry['Malania2007_short-4'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-4', @@ -64,7 +64,7 @@ sha1="75506be9a26ec38a223e41510f1a8cb32d5b0bc9", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-4'), + stimulus_set_loader=None, ) data_registry['Malania2007_short-6'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-6', @@ -72,7 +72,7 @@ sha1="2901be6b352e67550da040d79d744819365b8626", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-6'), + stimulus_set_loader=None, ) data_registry['Malania2007_short-8'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-8', @@ -80,7 +80,7 @@ sha1="6daf47b086cb969d75222e320f49453ed8437885", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-8'), + stimulus_set_loader=None, ) data_registry['Malania2007_short-16'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-16', @@ -88,7 +88,7 @@ sha1="8ae0898caad718b747f85fce5888416affc3a569", bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_short-16'), + stimulus_set_loader=None, ) data_registry['Malania2007_vernier-only'] = lambda: load_assembly_from_s3( identifier='Malania2007_vernier-only', @@ -96,7 +96,7 @@ sha1="1cf83e8b6141f8b0d67ea46994f342325f62001f", 
bucket="brainio-brainscore", cls=PropertyAssembly, - stimulus_set_loader=lambda: load_stimulus_set('Malania2007_vernier-only'), + stimulus_set_loader=None, ) diff --git a/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py index defd937f8..091ac3fa6 100644 --- a/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py +++ b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py @@ -36,7 +36,7 @@ def collect_malania_data_assembly(root_directory, dataset): # without measurement assembly = PropertyAssembly(metadata['threshold'], coords={ - 'subject_unique_id': ('subject', metadata['subject_unique_id']) + 'subject': ('subject', metadata['subject_unique_id']), }, dims=['subject'] ) @@ -66,7 +66,7 @@ def remove_subjects_with_nans(assembly1, assembly2): if __name__ == '__main__': - root_directory = Path(r'../data/malania2007/data_packaging/') + root_directory = Path(r'.') for dataset in DATASETS: assembly = collect_malania_data_assembly(root_directory, dataset) # upload to S3 @@ -76,4 +76,4 @@ def remove_subjects_with_nans(assembly1, assembly2): stimulus_set_identifier=assembly.name, assembly_class_name="PropertyAssembly", bucket_name="brainio-brainscore") - print(prints) \ No newline at end of file + print(prints) From a7b500cb1d49319e11c0c7bfb29ff95e52993b12 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 16:55:07 +0200 Subject: [PATCH 40/65] fix stimulus set / assembly indexing --- brainscore_vision/benchmarks/malania2007/benchmark.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index e073614a0..54e1f21d3 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -84,11 +84,11 @@ def __init__(self, condition: str): self._assemblies = {'baseline_assembly': self._baseline_assembly, 'condition_assembly': self._assembly} - self._stimulus_set = brainscore_vision.load_stimulus_set(f'{self.condition}') - self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'{self.baseline_condition}') + self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.condition}') + self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.baseline_condition}') self._stimulus_sets = {self.condition: self._stimulus_set, self.baseline_condition: self._baseline_stimulus_set} - self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'{self.condition}_fit') + self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007_{self.condition}_fit') self._metric = load_metric('threshold_elevation', baseline_condition=self.baseline_condition, @@ -142,7 +142,7 @@ def filter_baseline_subjects(condition_assembly: PropertyAssembly, baseline_assembly: PropertyAssembly ) -> Tuple[PropertyAssembly, PropertyAssembly]: """A function to select only the unique subjects that exist in the condition_assembly.""" - unique_ids = condition_assembly.coords['subject_unique_id'].values.tolist() - mask = baseline_assembly.coords['subject_unique_id'].isin(unique_ids) + unique_ids = condition_assembly.coords['subject'].values.tolist() + mask = baseline_assembly.coords['subject'].isin(unique_ids) filtered_baseline_assembly = baseline_assembly.where(mask, drop=True) return condition_assembly, filtered_baseline_assembly 
From 871974a6735ef3a141311de378625958d2d221d9 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Thu, 13 Jun 2024 17:01:08 +0200 Subject: [PATCH 41/65] add image label to threshold elevation calculation --- brainscore_vision/benchmarks/malania2007/benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 54e1f21d3..233fb5990 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -91,6 +91,7 @@ def __init__(self, condition: str): self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007_{self.condition}_fit') self._metric = load_metric('threshold_elevation', + independent_variable='image_label', baseline_condition=self.baseline_condition, test_condition=self.condition, threshold_accuracy=0.75) From 9ee6bd5cc1d5ce55253720c5f8e2ec027b941f9c Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 14 Jun 2024 15:20:15 +0200 Subject: [PATCH 42/65] change stimulus numbers in the test to be what they should be --- brainscore_vision/data/malania2007/test.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py index 4fd83059f..42f769e92 100644 --- a/brainscore_vision/data/malania2007/test.py +++ b/brainscore_vision/data/malania2007/test.py @@ -86,16 +86,16 @@ def test_stimulus_set_exist(self, identifier): ('long-2', 50), ('equal-16', 50), ('long-16', 50), - ('short-2_fit', 50), - ('short-4_fit', 50), - ('short-6_fit', 50), - ('short-8_fit', 50), - ('short-16_fit', 50), - ('equal-2_fit', 50), - ('long-2_fit', 50), - ('equal-16_fit', 50), - ('long-16_fit', 50), - ('vernier-only', 50) + ('short-2_fit', 500), + ('short-4_fit', 500), + ('short-6_fit', 500), + ('short-8_fit', 500), + ('short-16_fit', 500), + ('equal-2_fit', 500), + ('long-2_fit', 500), + ('equal-16_fit', 500), + ('long-16_fit', 500), + ('vernier-only', 500) ]) def test_number_of_images(self, identifier, num_images): stimulus_set = load_stimulus_set(f"Malania2007_{identifier}") From 6136b5fad070a014855030c3fccc75d5d6c87a72 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 14 Jun 2024 15:23:43 +0200 Subject: [PATCH 43/65] add calls to require_variance --- brainscore_vision/benchmarks/malania2007/benchmark.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 233fb5990..5e6290482 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -108,7 +108,8 @@ def __init__(self, condition: str): def __call__(self, candidate: BrainModel): model_responses = {} - candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli) + candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli, + number_of_trials=self._number_of_trials, require_variance=True) for condition in (self.baseline_condition, self.condition): stimulus_set = place_on_screen( self._stimulus_sets[condition], @@ -143,7 +144,9 @@ def filter_baseline_subjects(condition_assembly: PropertyAssembly, baseline_assembly: PropertyAssembly ) -> Tuple[PropertyAssembly, PropertyAssembly]: """A function to select only the unique subjects that exist in the condition_assembly.""" - unique_ids = 
condition_assembly.coords['subject'].values.tolist() + non_nan_mask = ~np.isnan(condition_assembly.values) + unique_ids = condition_assembly.coords['subject'][non_nan_mask].values.tolist() + mask = baseline_assembly.coords['subject'].isin(unique_ids) filtered_baseline_assembly = baseline_assembly.where(mask, drop=True) return condition_assembly, filtered_baseline_assembly From b3902169e1dab7e70085f4d3765d91151c0d45f2 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Mon, 24 Jun 2024 15:12:46 +0200 Subject: [PATCH 44/65] fix test errors --- brainscore_vision/benchmarks/malania2007/test.py | 4 ++-- brainscore_vision/data/malania2007/test.py | 2 +- brainscore_vision/metrics/threshold/test.py | 0 3 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 brainscore_vision/metrics/threshold/test.py diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index b5bcd6948..30c346d44 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -23,7 +23,7 @@ def test_in_pool(self, dataset): def test_mean_ceiling(self): benchmarks = [f"Malania2007_{dataset}" for dataset in DATASETS] benchmarks = [benchmark_registry[benchmark] for benchmark in benchmarks] - ceilings = [benchmark.ceiling.sel(aggregation='center') for benchmark in benchmarks] + ceilings = [benchmark._ceiling.sel(aggregation='center') for benchmark in benchmarks] mean_ceiling = np.mean(ceilings) assert mean_ceiling == approx(0.7724487108297781, abs=0.001) @@ -43,7 +43,7 @@ def test_mean_ceiling(self): def test_dataset_ceiling(self, dataset, expected_ceiling): benchmark = f"Malania2007_{dataset}" benchmark = benchmark_registry[benchmark] - ceiling = benchmark.ceiling + ceiling = benchmark._ceiling assert ceiling.sel(aggregation='center').values.item() == expected_ceiling @pytest.mark.private_access diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py index 42f769e92..3e7a2267a 100644 --- a/brainscore_vision/data/malania2007/test.py +++ b/brainscore_vision/data/malania2007/test.py @@ -22,7 +22,7 @@ class TestAssemblies: ]) def test_num_subjects(self, identifier, num_subjects): assembly = load_dataset(f"Malania2007_{identifier}") - assembly = assembly.where(~np.isnan(assembly.values), drop=True) + assembly = assembly.where(assembly.isnull(), drop=True) assert len(np.unique(assembly['subject'].values)) == num_subjects # test assembly coords present in ALL 17 sets: diff --git a/brainscore_vision/metrics/threshold/test.py b/brainscore_vision/metrics/threshold/test.py new file mode 100644 index 000000000..e69de29bb From 246ce43f3c40047c6ac14f1ae9f1ca3819979a41 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 26 Jun 2024 13:38:18 +0200 Subject: [PATCH 45/65] fix bug with ceiling access --- brainscore_vision/benchmarks/malania2007/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index 30c346d44..7cd372348 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -42,8 +42,8 @@ def test_mean_ceiling(self): ]) def test_dataset_ceiling(self, dataset, expected_ceiling): benchmark = f"Malania2007_{dataset}" - benchmark = benchmark_registry[benchmark] - ceiling = benchmark._ceiling + benchmark = load_benchmark(benchmark) + ceiling = benchmark.ceiling assert 
ceiling.sel(aggregation='center').values.item() == expected_ceiling @pytest.mark.private_access From 4adbbb220dea3625987b6cb8896df1c245a43612 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 26 Jun 2024 13:46:14 +0200 Subject: [PATCH 46/65] correct test with incorrect nan dropping --- brainscore_vision/data/malania2007/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py index 3e7a2267a..8a87b5104 100644 --- a/brainscore_vision/data/malania2007/test.py +++ b/brainscore_vision/data/malania2007/test.py @@ -22,7 +22,7 @@ class TestAssemblies: ]) def test_num_subjects(self, identifier, num_subjects): assembly = load_dataset(f"Malania2007_{identifier}") - assembly = assembly.where(assembly.isnull(), drop=True) + assembly = assembly.dropna(dim='subject') assert len(np.unique(assembly['subject'].values)) == num_subjects # test assembly coords present in ALL 17 sets: From 3042ddfe8d13e7fb618a64b5758a34942b624d01 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 26 Jun 2024 13:48:37 +0200 Subject: [PATCH 47/65] fix wrong number of stimuli in the vernier-only test --- brainscore_vision/data/malania2007/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py index 8a87b5104..0c23f92ec 100644 --- a/brainscore_vision/data/malania2007/test.py +++ b/brainscore_vision/data/malania2007/test.py @@ -95,7 +95,7 @@ def test_stimulus_set_exist(self, identifier): ('long-2_fit', 500), ('equal-16_fit', 500), ('long-16_fit', 500), - ('vernier-only', 500) + ('vernier-only', 50) ]) def test_number_of_images(self, identifier, num_images): stimulus_set = load_stimulus_set(f"Malania2007_{identifier}") From 786a166e69bd75e462a4fc90b1a025f87054c235 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 26 Jun 2024 14:12:29 +0200 Subject: [PATCH 48/65] add comment to explain the logic behind the scoring function --- brainscore_vision/metrics/threshold/metric.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py index 07cdf6599..a10c99e92 100644 --- a/brainscore_vision/metrics/threshold/metric.py +++ b/brainscore_vision/metrics/threshold/metric.py @@ -346,6 +346,9 @@ def pool_score(source: float, target: Union[list, PropertyAssembly]) -> Score: else: target_mean = np.mean(target) # This score = 0 when the source exceeds target_mean by 100% + # The logic is that the maximum distance below the target threshold is at 0, and thus + # setting the maximum distance above the target threshold at the same distance will make + # the score symmetric.
raw_score = max((1 - ((np.abs(target_mean - source)) / target_mean)), 0) raw_score = Score([raw_score], coords={'aggregation': ['center']}, dims=['aggregation']) return raw_score From cd3eefc3225906056a300d8ae5ebb6e327173e Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 26 Jun 2024 15:01:10 +0200 Subject: [PATCH 49/65] remove redundant comment --- brainscore_vision/benchmarks/malania2007/benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 5e6290482..132ba4e8d 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -116,7 +116,6 @@ def __call__(self, candidate: BrainModel): target_visual_degrees=candidate.visual_degrees(), source_visual_degrees=self._visual_degrees ) - # model_requirements here model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials, require_variance=True) From 5b68dd45aefef621536bf446b26a12044a28e5a3 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Wed, 26 Jun 2024 15:01:30 +0200 Subject: [PATCH 50/65] remove pool score --- brainscore_vision/metrics/threshold/metric.py | 68 ++++++------------- 1 file changed, 19 insertions(+), 49 deletions(-) diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py index a10c99e92..683bf4b38 100644 --- a/brainscore_vision/metrics/threshold/metric.py +++ b/brainscore_vision/metrics/threshold/metric.py @@ -121,7 +121,6 @@ def __init__(self, fit_function=psychometric_cum_gauss, fit_inverse_function=inverse_psychometric_cum_gauss, threshold_accuracy: Union[str, float] = 'inflection', - scoring: str = 'pool', required_accuracy: Optional[float] = 0.6, plot_fit: bool = False ): @@ -135,14 +134,11 @@ def __init__(self, is used, the function finds the inflection point of the curve and evaluates the threshold at that level. When a float is used, the function evaluates the threshold at that level. - :param scoring: The scoring function used to evaluate performance. Either Literal['individual'] or - Literal['pool']. See the individual_score and pool_score methods for more information. """ self.fit_function = fit_function self.fit_inverse_function = fit_inverse_function self._independent_variable = independent_variable self.threshold_accuracy = threshold_accuracy - self.scoring = scoring self.required_accuracy = required_accuracy self.plot_fit = plot_fit @@ -152,7 +148,7 @@ def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, threshold as a float. :param target: Either a list containing human thresholds (for the ceiling function & ThresholdElevation), or a PropertyAssembly. - :return: A Score containing the evaluated model's ceiling-adjusted distance to target thresholds. + :return: A Score containing the evaluated model's distance to target thresholds.
""" # compute threshold from measurements if the input is not a threshold already if isinstance(source, float): @@ -164,14 +160,7 @@ def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, return Score([0., 0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) else: raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') - - # compare threshold to target thresholds - if self.scoring == 'pool': - return self.pool_score(source_threshold, target) - elif self.scoring == 'individual': - return self.individual_score(source_threshold, target) - else: - raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') + return self.scoring_function(source_threshold, target) def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]]) -> Score: """ @@ -191,7 +180,7 @@ def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]] random_state = np.random.RandomState(i) random_human_score = random_state.choice(human_thresholds, replace=False) metric = Threshold(self._independent_variable, self.fit_function, self.fit_inverse_function, - self.threshold_accuracy, scoring=self.scoring) + self.threshold_accuracy) human_thresholds.remove(random_human_score) score = metric(random_human_score, human_thresholds) score = float(score[(score['aggregation'] == 'center')].values) @@ -199,7 +188,8 @@ def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]] scores.append(score) ceiling, ceiling_error = np.mean(scores), np.std(scores) - ceiling = Score([ceiling, ceiling_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) + ceiling = Score([ceiling], coords={'aggregation': ['center']}, dims=['aggregation']) + ceiling.attrs['error'] = ceiling_error return ceiling def compute_threshold(self, source: BehavioralAssembly, independent_variable: str) -> Union[float, str]: @@ -318,7 +308,7 @@ def aggregate_psychometric_fit_data(x_points, y_points): return unique_x, correct_rate @staticmethod - def individual_score(source: float, target: Union[list, PropertyAssembly]) -> Score: + def scoring_function(source: float, target: Union[list, PropertyAssembly]) -> Score: """ Computes the average distance of the source from each of the individual targets in units of the individual targets. This is generally a more stringent scoring method than pool_score, aimed @@ -331,27 +321,11 @@ def individual_score(source: float, target: Union[list, PropertyAssembly]) -> Sc raw_scores.append(raw_score) raw_score, model_error = np.mean(raw_scores), np.std(raw_scores) - raw_score = Score([raw_score, model_error], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) - return raw_score - - @staticmethod - def pool_score(source: float, target: Union[list, PropertyAssembly]) -> Score: - """ - Computes the distance of the source from the average of the target in units of the target average. - This is generally a less stringent scoring method than individual_score, aimed to measure the average - target effect. - """ - if not isinstance(target, list): - target_mean = np.mean(target.values) - else: - target_mean = np.mean(target) - # This score = 0 when the source exceeds target_mean by 100% - # The logic is that the maximum distance below is target threshold is at 0, and thus - # setting the maximum distance above the target threshold at the same distance will make - # the score symmetric. 
- raw_score = max((1 - ((np.abs(target_mean - source)) / target_mean)), 0) - raw_score = Score([raw_score], coords={'aggregation': ['center']}, dims=['aggregation']) - return raw_score + # add the aggregation: center coordinate to the score + score = Score([np.mean(raw_scores)], coords={'aggregation': ['center']}, dims=['aggregation']) + score.attrs['raw'] = raw_score + score.attrs['error'] = model_error + return score @staticmethod def convert_proba_to_correct(source: BehavioralAssembly) -> np.array: @@ -421,7 +395,6 @@ def __init__(self, baseline_condition: str, test_condition: str, threshold_accuracy: Union[str, float] = 'inflection', - scoring: str = 'pool', required_baseline_accuracy: Optional[float] = 0.6, required_test_accuracy: Optional[float] = 0.6, plot_fit: bool = False @@ -437,7 +410,7 @@ def __init__(self, the threshold at that level. When a float is used, the function evaluates the threshold at that level. :param scoring: The scoring function used to evaluate performance. Either Literal['individual'] or - Literal['pool']. See the individual_score and pool_score methods for more information. + Literal['pool']. See the scoring_function and pool_score methods for more information. """ super(ThresholdElevation, self).__init__(independent_variable) self.baseline_threshold_metric = Threshold(self._independent_variable, @@ -451,11 +424,10 @@ def __init__(self, self.baseline_condition = baseline_condition self.test_condition = test_condition self.threshold_accuracy = threshold_accuracy - self.scoring = scoring def __call__(self, source: Union[float, Dict[str, BehavioralAssembly]], - target: Union[list, Dict[str, PropertyAssembly]] + target: Union[list, PropertyAssembly, Dict[str, PropertyAssembly]] ) -> Score: """ :param source: Either a dictionary containing the BehavioralAssemblies for the test condition and the @@ -488,18 +460,15 @@ def __call__(self, # check whether the targets are threshold elevations already - if not, compute them if isinstance(target, list): target_threshold_elevations = target + elif isinstance(target, PropertyAssembly): + target_threshold_elevations = target.values.tolist() elif isinstance(target, Dict): target_threshold_elevations = self.compute_threshold_elevations(target) else: raise TypeError(f'target is type {type(target)}, but type PropertyAssembly or list is required.') # compare threshold elevation to target threshold elevations - if self.scoring == 'pool': - return self.pool_score(raw_source_threshold_elevation, target_threshold_elevations) - elif self.scoring == 'individual': - return self.individual_score(raw_source_threshold_elevation, target_threshold_elevations) - else: - raise ValueError(f'Scoring method {self.scoring} is not a valid scoring method.') + return self.scoring_function(raw_source_threshold_elevation, target_threshold_elevations) def ceiling(self, assemblies: Dict[str, PropertyAssembly]) -> Score: """ @@ -515,7 +484,7 @@ def ceiling(self, assemblies: Dict[str, PropertyAssembly]) -> Score: random_state = np.random.RandomState(i) random_human_score = random_state.choice(human_threshold_elevations, replace=False) metric = ThresholdElevation(self._independent_variable, self.baseline_condition, self.test_condition, - self.threshold_accuracy, scoring=self.scoring) + self.threshold_accuracy) human_threshold_elevations.remove(random_human_score) score = metric(random_human_score, human_threshold_elevations) score = float(score[(score['aggregation'] == 'center')].values) @@ -523,7 +492,8 @@ def ceiling(self, assemblies: Dict[str, 
From 58253b7a413a04da5e0fe484f6b6d0ac79eeb24a Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 26 Jun 2024 15:01:44 +0200
Subject: [PATCH 51/65] add metric tests

---
 brainscore_vision/metrics/threshold/test.py | 75 +++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/brainscore_vision/metrics/threshold/test.py b/brainscore_vision/metrics/threshold/test.py
index e69de29bb..0f1180d30 100644
--- a/brainscore_vision/metrics/threshold/test.py
+++ b/brainscore_vision/metrics/threshold/test.py
@@ -0,0 +1,75 @@
+from pytest import approx
+
+from brainio.assemblies import PropertyAssembly
+from brainscore_vision import load_metric
+
+
+def test_threshold_score_from_thresholds():
+    assembly = _make_threshold_data()
+    # independent_variable is not used since we compute from thresholds, and do not need to fit them
+    metric = load_metric('threshold', independent_variable='placeholder')
+    score = metric(float(assembly.sel(subject='A').values), assembly)
+    print(score)
+    assert score == approx(0.5625)
+
+
+def test_threshold_elevation_score_from_threshold_elevations():
+    assembly = _make_threshold_elevation_data()
+    # independent_variable is not used since we compute from thresholds, and do not need to fit them
+    metric = load_metric('threshold_elevation',
+                         independent_variable='placeholder',
+                         baseline_condition='placeholder',
+                         test_condition='placeholder')
+    score = metric(float(assembly.sel(subject='A').values), assembly)
+    print(score)
+    assert score == approx(0.525)
+
+
+def test_threshold_has_error():
+    assembly = _make_threshold_data()
+    metric = load_metric('threshold', independent_variable='placeholder')
+    score = metric(float(assembly.sel(subject='A').values), assembly)
+    assert hasattr(score, 'error')
+
+
+def test_threshold_elevation_has_error():
+    assembly = _make_threshold_elevation_data()
+    metric = load_metric('threshold_elevation',
+                         independent_variable='placeholder',
+                         baseline_condition='placeholder',
+                         test_condition='placeholder')
+    score = metric(float(assembly.sel(subject='A').values), assembly)
+    assert hasattr(score, 'error')
+
+
+def test_threshold_raw_subjects():
+    assembly = _make_threshold_data()
+    metric = load_metric('threshold', independent_variable='placeholder')
+    score = metric(float(assembly.sel(subject='A').values), assembly)
+    subject_scores = score.raw
+    assert subject_scores.sel(subject='A') == approx(0.5625)
+
+
+def test_threshold_elevation_raw_subjects():
+    assembly = _make_threshold_elevation_data()
+    metric = load_metric('threshold_elevation',
+                         independent_variable='placeholder',
+                         baseline_condition='placeholder',
+                         test_condition='placeholder')
+    score = metric(float(assembly.sel(subject='A').values), assembly)
+    subject_scores = score.raw
+    assert subject_scores.sel(subject='A') == approx(0.525)
+
+
+def _make_threshold_data():
+    # Subjects have thresholds of 10, 20, 40, and 20 respectively.
+    return PropertyAssembly([10.0, 20.0, 40.0, 20.0],
+                            coords={'subject': ('subject', ['A', 'B', 'C', 'D'])},
+                            dims=['subject'])
+
+
+def _make_threshold_elevation_data():
+    # Subjects have threshold elevations of 3, 2, 1.5, and 5 respectively.
+    return PropertyAssembly([3.0, 2.0, 1.5, 5.0],
+                            coords={'subject': ('subject', ['A', 'B', 'C', 'D'])},
+                            dims=['subject'])
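The ceiling values recomputed in the next commit follow from the ceiling method above: one randomly drawn human is scored against the remaining humans, and the mean and standard deviation over repetitions become the ceiling and its error. A simplified sketch over plain floats; the helper name and the iteration count are illustrative, not values from the metric:

import numpy as np

def human_ceiling(human_values: list, num_loops: int = 100) -> tuple:
    scores = []
    values = list(human_values)
    for i in range(num_loops):
        random_state = np.random.RandomState(i)  # mirrors the per-iteration seeding in the loop above
        held_out = random_state.choice(values, replace=False)
        values.remove(held_out)
        # score the held-out human against the rest with the same distance scoring
        per_subject = [max(1 - abs(value - held_out) / value, 0) for value in values]
        scores.append(np.mean(per_subject))
        values.append(held_out)
    return float(np.mean(scores)), float(np.std(scores))

center, error = human_ceiling([10.0, 20.0, 40.0, 20.0])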
From 10447e234348231a9e507037abfae269737f0ec9 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 26 Jun 2024 15:39:28 +0200
Subject: [PATCH 52/65] fix benchmark filtering issue and recompute ceilings to
 accommodate removal of pool_score

---
 .../benchmarks/malania2007/benchmark.py |  6 +++---
 .../benchmarks/malania2007/test.py      | 18 +++++++++---------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py
index 132ba4e8d..df0d74c88 100644
--- a/brainscore_vision/benchmarks/malania2007/benchmark.py
+++ b/brainscore_vision/benchmarks/malania2007/benchmark.py
@@ -95,14 +95,13 @@ def __init__(self, condition: str):
                                          baseline_condition=self.baseline_condition,
                                          test_condition=self.condition,
                                          threshold_accuracy=0.75)
-        self._ceiling = self._metric.ceiling(self._assemblies)
 
         self._visual_degrees = 2.986667
         self._number_of_trials = 10  # arbitrary choice for microsaccades to improve precision of estimates
 
         super(_Malania2007Base, self).__init__(
             identifier=f'Malania2007_{condition}', version=1,
-            ceiling_func=lambda: self._ceiling,
+            ceiling_func=lambda: self._metric.ceiling(self._assemblies),
             parent='Malania2007',
             bibtex=BIBTEX)
 
@@ -148,4 +147,5 @@ def filter_baseline_subjects(condition_assembly: PropertyAssembly,
     mask = baseline_assembly.coords['subject'].isin(unique_ids)
     filtered_baseline_assembly = baseline_assembly.where(mask, drop=True)
 
-    return condition_assembly, filtered_baseline_assembly
+    filtered_condition_assembly = condition_assembly.where(mask, drop=True)
+    return filtered_condition_assembly, filtered_baseline_assembly
diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py
index 7cd372348..e25d71b76 100644
--- a/brainscore_vision/benchmarks/malania2007/test.py
+++ b/brainscore_vision/benchmarks/malania2007/test.py
@@ -30,15 +30,15 @@ def test_mean_ceiling(self):
     # these test values are for the pooled score ceiling
     @pytest.mark.private_access
     @pytest.mark.parametrize('dataset, expected_ceiling', [
-        ('short-2', approx(0.69824226, abs=0.001)),
-        ('short-4', approx(0.56750692, abs=0.001)),
-        ('short-6', approx(0.62480255, abs=0.001)),
-        ('short-8', approx(0.67478401, abs=0.001)),
-        ('short-16', approx(0.83471481, abs=0.001)),
-        ('equal-2', approx(0.59491172, abs=0.001)),
-        ('long-2', approx(0.52140858, abs=0.001)),
-        ('equal-16', approx(0.3824312, abs=0.001)),
-        ('long-16', approx(0.51425013, abs=0.001))
+        ('short-2', approx(0.78719345, abs=0.001)),
+        ('short-4', approx(0.49998989, abs=0.001)),
+        ('short-6', approx(0.50590051, abs=0.001)),
+        ('short-8', approx(0.4426336, abs=0.001)),
+        ('short-16', approx(0.8383443, abs=0.001)),
+        ('equal-2', approx(0.56664015, abs=0.001)),
+        ('long-2', approx(0.46470421, abs=0.001)),
+        ('equal-16', approx(0.44087153, abs=0.001)),
+        ('long-16', approx(0.50996587, abs=0.001))
     ])
     def test_dataset_ceiling(self, dataset, expected_ceiling):
         benchmark = f"Malania2007_{dataset}"
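The filter_baseline_subjects fix above is the substantive change in this commit: previously the condition assembly was returned unfiltered, so its subjects could fall out of step with the filtered baseline. A toy xarray illustration of the corrected behavior, with hypothetical subject ids rather than data from the benchmark:

import xarray as xr

condition = xr.DataArray([120., 180., 90.],
                         coords={'subject': ('subject', ['S1', 'S2', 'S3'])}, dims=['subject'])
baseline = xr.DataArray([40., 60., 45., 50.],
                        coords={'subject': ('subject', ['S1', 'S2', 'S3', 'S4'])}, dims=['subject'])

unique_ids = ['S1', 'S2', 'S3']  # subjects that performed both the baseline and the test condition
# mask each assembly by its own subject coordinate; in the patched code the mask is built once
# from the baseline assembly and xarray aligns it with the condition assembly by 'subject'
filtered_baseline = baseline.where(baseline['subject'].isin(unique_ids), drop=True)
filtered_condition = condition.where(condition['subject'].isin(unique_ids), drop=True)
assert list(filtered_condition['subject'].values) == list(filtered_baseline['subject'].values)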
From 6966bbf9671e807cb4d87ace53cf29760844f9e8 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Thu, 27 Jun 2024 13:08:28 +0200
Subject: [PATCH 53/65] fix superfluous test by switching it for another

---
 brainscore_vision/metrics/threshold/test.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/brainscore_vision/metrics/threshold/test.py b/brainscore_vision/metrics/threshold/test.py
index 0f1180d30..7253d1cd9 100644
--- a/brainscore_vision/metrics/threshold/test.py
+++ b/brainscore_vision/metrics/threshold/test.py
@@ -42,23 +42,21 @@ def test_threshold_elevation_has_error():
     assert hasattr(score, 'error')
 
 
-def test_threshold_raw_subjects():
+def test_threshold_has_raw():
     assembly = _make_threshold_data()
     metric = load_metric('threshold', independent_variable='placeholder')
     score = metric(float(assembly.sel(subject='A').values), assembly)
-    subject_scores = score.raw
-    assert subject_scores.sel(subject='A') == approx(0.5625)
+    assert hasattr(score, 'raw')
 
 
-def test_threshold_elevation_raw_subjects():
+def test_threshold_elevation_has_raw():
     assembly = _make_threshold_elevation_data()
     metric = load_metric('threshold_elevation',
                          independent_variable='placeholder',
                          baseline_condition='placeholder',
                          test_condition='placeholder')
     score = metric(float(assembly.sel(subject='A').values), assembly)
-    subject_scores = score.raw
-    assert subject_scores.sel(subject='A') == approx(0.525)
+    assert hasattr(score, 'raw')

From 98540d291ee8304c139234bb4412af5d9802a92b Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Thu, 27 Jun 2024 15:43:21 +0200
Subject: [PATCH 54/65] add vernier acuity benchmark

---
 .../benchmarks/malania2007/__init__.py  |  1 +
 .../benchmarks/malania2007/benchmark.py | 65 ++++++++++++++++++-
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/brainscore_vision/benchmarks/malania2007/__init__.py b/brainscore_vision/benchmarks/malania2007/__init__.py
index f0b56d84c..a5016e24f 100644
--- a/brainscore_vision/benchmarks/malania2007/__init__.py
+++ b/brainscore_vision/benchmarks/malania2007/__init__.py
@@ -10,3 +10,4 @@
 benchmark_registry['Malania2007_long-2'] = lambda: benchmark._Malania2007Base('long-2')
 benchmark_registry['Malania2007_equal-16'] = lambda: benchmark._Malania2007Base('equal-16')
 benchmark_registry['Malania2007_long-16'] = lambda: benchmark._Malania2007Base('long-16')
+benchmark_registry['Malania2007_vernieracuity'] = lambda: benchmark._Malania2007VernierAcuity()
diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py
index df0d74c88..36843cb97 100644
--- a/brainscore_vision/benchmarks/malania2007/benchmark.py
+++ b/brainscore_vision/benchmarks/malania2007/benchmark.py
@@ -121,7 +121,7 @@ def __call__(self, candidate: BrainModel):
         raw_score = self._metric(model_responses, self._assemblies)
 
         # Adjust score to ceiling
-        ceiling = self._ceiling
+        ceiling = self.ceiling
         score = raw_score / ceiling.sel(aggregation='center')
 
         # cap score at 1 if ceiled score > 1
@@ -133,6 +133,69 @@ def __call__(self, candidate: BrainModel):
 
         return score
 
+class _Malania2007VernierAcuity(BenchmarkBase):
+    def __init__(self):
+        self.baseline_condition = BASELINE_CONDITION
+        self.conditions = DATASETS
+
+        self._assemblies = {condition: {'baseline_assembly': self.get_assemblies(condition)['baseline_assembly'],
+                                        'condition_assembly': self.get_assemblies(condition)['condition_assembly']}
+                            for condition in self.conditions}
+        self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.baseline_condition}')
+        self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(f'Malania2007_{condition}_fit')
+                                 for condition in self.conditions}
+
+        self._metric = load_metric('threshold',
+                                   independent_variable='image_label',
+                                   threshold_accuracy=0.75)
+
+        self._visual_degrees = 2.986667
+        self._number_of_trials = 10  #
arbitrary choice for microsaccades to improve precision of estimates + + super(_Malania2007VernierAcuity, self).__init__( + identifier=f'Malania2007_vernieracuity', version=1, + ceiling_func=lambda: self._metric.ceiling(self._assemblies), + parent='Malania2007', + bibtex=BIBTEX) + + def __call__(self, candidate: BrainModel): + scores = [] + for condition in self.conditions: + candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli[condition], + number_of_trials=self._number_of_trials, require_variance=True) + stimulus_set = place_on_screen( + self._stimulus_set, + target_visual_degrees=candidate.visual_degrees(), + source_visual_degrees=self._visual_degrees + ) + model_response = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials, + require_variance=True) + + raw_score = self._metric(model_response, self._assemblies[condition]) + # Adjust score to ceiling + ceiling = self.ceiling + score = raw_score / ceiling.sel(aggregation='center') + + # cap score at 1 if ceiled score > 1 + if score[(score['aggregation'] == 'center')] > 1: + score.__setitem__({'aggregation': score['aggregation'] == 'center'}, 1) + + score.attrs['raw'] = raw_score + score.attrs['ceiling'] = ceiling + # average all scores to get 1 average score + mean_score = np.mean(scores) + return mean_score + + def get_assemblies(self, condition: str): + baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition)) + condition_assembly = LazyLoad(lambda: load_assembly(condition)) + assembly, baseline_assembly = filter_baseline_subjects(condition_assembly, + baseline_assembly) + return {'condition_assembly': assembly, + 'baseline_assembly': baseline_assembly} + + + def load_assembly(dataset: str) -> PropertyAssembly: assembly = brainscore_vision.load_dataset(f'Malania2007_{dataset}') return assembly From faa1194daf30f1f4d690356438abc18af656240b Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 28 Jun 2024 11:14:19 +0200 Subject: [PATCH 55/65] update vernier acuity benchmark, ceilings, and mean ceiling --- .../benchmarks/malania2007/benchmark.py | 24 +++++++++++++++---- .../benchmarks/malania2007/test.py | 9 ++++--- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 36843cb97..f1b0dba2d 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -9,6 +9,7 @@ from brainscore_vision import load_dataset, load_stimulus_set, load_metric from brainscore_vision.model_interface import BrainModel from brainscore_vision.utils import LazyLoad +from brainscore_core.metrics import Score BIBTEX = """@article{malania2007, @@ -25,7 +26,8 @@ }""" BASELINE_CONDITION = 'vernier-only' -DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', 'long-2', 'equal-16', 'long-16'] +DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', 'long-2', 'equal-16', 'long-16', + 'vernieracuity'] # Values in NUM_FLANKERS_PER_CONDITION denote the condition (i.e., in this case the number of flankers) to be selected # This is kept track of simply because the benchmark uses threshold elevation - i.e., a comparison of 2 conditions NUM_FLANKERS_PER_CONDITION = {'short-2': 2, 'short-4': 4, 'short-6': 6, 'short-8': 8, @@ -136,7 +138,8 @@ def __call__(self, candidate: BrainModel): class _Malania2007VernierAcuity(BenchmarkBase): def __init__(self): 
self.baseline_condition = BASELINE_CONDITION - self.conditions = DATASETS + self.conditions = DATASETS.copy() + self.conditions.remove('vernieracuity') self._assemblies = {condition: {'baseline_assembly': self.get_assemblies(condition)['baseline_assembly'], 'condition_assembly': self.get_assemblies(condition)['condition_assembly']} @@ -154,7 +157,7 @@ def __init__(self): super(_Malania2007VernierAcuity, self).__init__( identifier=f'Malania2007_vernieracuity', version=1, - ceiling_func=lambda: self._metric.ceiling(self._assemblies), + ceiling_func=lambda: self.mean_ceiling(), parent='Malania2007', bibtex=BIBTEX) @@ -182,8 +185,10 @@ def __call__(self, candidate: BrainModel): score.attrs['raw'] = raw_score score.attrs['ceiling'] = ceiling + scores.append(score) # average all scores to get 1 average score - mean_score = np.mean(scores) + mean_score = Score(np.mean(scores)) + mean_score.attrs['error'] = np.mean([score['error'] for score in scores]) return mean_score def get_assemblies(self, condition: str): @@ -194,6 +199,17 @@ def get_assemblies(self, condition: str): return {'condition_assembly': assembly, 'baseline_assembly': baseline_assembly} + def mean_ceiling(self): + ceilings = [] + errors = [] + for assembly_name in self._assemblies.keys(): + this_ceiling = self._metric.ceiling(self._assemblies[assembly_name]['baseline_assembly']) + ceilings.append(this_ceiling.values) + errors.append(this_ceiling.error) + mean_ceiling = Score(np.mean(ceilings)) + mean_ceiling.attrs['error'] = np.mean(errors) + return mean_ceiling + def load_assembly(dataset: str) -> PropertyAssembly: diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index e25d71b76..f001a9e06 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -20,15 +20,17 @@ def test_in_pool(self, dataset): assert identifier in benchmark_registry @pytest.mark.private_access + # TODO: recompute def test_mean_ceiling(self): benchmarks = [f"Malania2007_{dataset}" for dataset in DATASETS] benchmarks = [benchmark_registry[benchmark] for benchmark in benchmarks] - ceilings = [benchmark._ceiling.sel(aggregation='center') for benchmark in benchmarks] + ceilings = [benchmark.ceiling.sel(aggregation='center') for benchmark in benchmarks] mean_ceiling = np.mean(ceilings) - assert mean_ceiling == approx(0.7724487108297781, abs=0.001) + assert mean_ceiling == approx(0.5757928329186803, abs=0.001) # these test values are for the pooled score ceiling @pytest.mark.private_access + # TODO: ceiling for vernier acuity @pytest.mark.parametrize('dataset, expected_ceiling', [ ('short-2', approx(0.78719345, abs=0.001)), ('short-4', approx(0.49998989, abs=0.001)), @@ -38,7 +40,8 @@ def test_mean_ceiling(self): ('equal-2', approx(0.56664015, abs=0.001)), ('long-2', approx(0.46470421, abs=0.001)), ('equal-16', approx(0.44087153, abs=0.001)), - ('long-16', approx(0.50996587, abs=0.001)) + ('long-16', approx(0.50996587, abs=0.001)), + ('vernieracuity', approx(0.70168481, abs=0.001)) ]) def test_dataset_ceiling(self, dataset, expected_ceiling): benchmark = f"Malania2007_{dataset}" From 63fc009fcc825e4bc7b13aeb8c512f80a7f78a84 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 28 Jun 2024 11:25:56 +0200 Subject: [PATCH 56/65] clean up benchmark file --- brainscore_vision/benchmarks/malania2007/benchmark.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py 
b/brainscore_vision/benchmarks/malania2007/benchmark.py index f1b0dba2d..9d44c452e 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -1,12 +1,11 @@ from typing import Tuple import numpy as np -import xarray as xr import brainscore_vision from brainio.assemblies import PropertyAssembly from brainscore_vision.benchmarks import BenchmarkBase from brainscore_vision.benchmark_helpers.screen import place_on_screen -from brainscore_vision import load_dataset, load_stimulus_set, load_metric +from brainscore_vision import load_metric from brainscore_vision.model_interface import BrainModel from brainscore_vision.utils import LazyLoad from brainscore_core.metrics import Score @@ -211,7 +210,6 @@ def mean_ceiling(self): return mean_ceiling - def load_assembly(dataset: str) -> PropertyAssembly: assembly = brainscore_vision.load_dataset(f'Malania2007_{dataset}') return assembly From 108ab9ba52147a3898724e08e6f9fd3198858a98 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 28 Jun 2024 11:39:36 +0200 Subject: [PATCH 57/65] fix a few bugs with loading benchmarks and such --- brainscore_vision/benchmarks/malania2007/benchmark.py | 2 +- brainscore_vision/benchmarks/malania2007/test.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 9d44c452e..cc79046a5 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -205,7 +205,7 @@ def mean_ceiling(self): this_ceiling = self._metric.ceiling(self._assemblies[assembly_name]['baseline_assembly']) ceilings.append(this_ceiling.values) errors.append(this_ceiling.error) - mean_ceiling = Score(np.mean(ceilings)) + mean_ceiling = Score(np.mean(ceilings), coords={'aggregation': ['center']}, dims=['aggregation']) mean_ceiling.attrs['error'] = np.mean(errors) return mean_ceiling diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index f001a9e06..57165b47f 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -12,7 +12,7 @@ class TestBehavioral: def test_count(self): - assert len(DATASETS) == 5 + 2 + 2 + assert len(DATASETS) == 5 + 2 + 2 + 1 @pytest.mark.parametrize('dataset', DATASETS) def test_in_pool(self, dataset): @@ -23,8 +23,8 @@ def test_in_pool(self, dataset): # TODO: recompute def test_mean_ceiling(self): benchmarks = [f"Malania2007_{dataset}" for dataset in DATASETS] - benchmarks = [benchmark_registry[benchmark] for benchmark in benchmarks] - ceilings = [benchmark.ceiling.sel(aggregation='center') for benchmark in benchmarks] + benchmarks = [load_benchmark(benchmark) for benchmark in benchmarks] + ceilings = [benchmark.ceiling for benchmark in benchmarks] mean_ceiling = np.mean(ceilings) assert mean_ceiling == approx(0.5757928329186803, abs=0.001) From f49c32c04b2c4c4a34571a46ac9cb5c7d3a3c0d8 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 28 Jun 2024 16:23:59 +0200 Subject: [PATCH 58/65] fix some happy little bugs --- .../benchmarks/malania2007/benchmark.py | 13 +++-- .../benchmarks/malania2007/test.py | 52 ------------------- brainscore_vision/metrics/threshold/metric.py | 4 +- 3 files changed, 11 insertions(+), 58 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py 
b/brainscore_vision/benchmarks/malania2007/benchmark.py index cc79046a5..cabbe3cd5 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -10,6 +10,8 @@ from brainscore_vision.utils import LazyLoad from brainscore_core.metrics import Score +from brainio.packaging import write_netcdf + BIBTEX = """@article{malania2007, author = {Malania, Maka and Herzog, Michael H. and Westheimer, Gerald}, @@ -109,7 +111,7 @@ def __init__(self, condition: str): def __call__(self, candidate: BrainModel): model_responses = {} candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli, - number_of_trials=self._number_of_trials, require_variance=True) + number_of_trials=2, require_variance=True) for condition in (self.baseline_condition, self.condition): stimulus_set = place_on_screen( self._stimulus_sets[condition], @@ -123,7 +125,7 @@ def __call__(self, candidate: BrainModel): # Adjust score to ceiling ceiling = self.ceiling - score = raw_score / ceiling.sel(aggregation='center') + score = raw_score / ceiling # cap score at 1 if ceiled score > 1 if score[(score['aggregation'] == 'center')] > 1: @@ -164,7 +166,7 @@ def __call__(self, candidate: BrainModel): scores = [] for condition in self.conditions: candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli[condition], - number_of_trials=self._number_of_trials, require_variance=True) + number_of_trials=2, require_variance=True) stimulus_set = place_on_screen( self._stimulus_set, target_visual_degrees=candidate.visual_degrees(), @@ -176,7 +178,8 @@ def __call__(self, candidate: BrainModel): raw_score = self._metric(model_response, self._assemblies[condition]) # Adjust score to ceiling ceiling = self.ceiling - score = raw_score / ceiling.sel(aggregation='center') + score = raw_score / ceiling + score.attrs['error'] = raw_score.error # cap score at 1 if ceiled score > 1 if score[(score['aggregation'] == 'center')] > 1: @@ -187,7 +190,7 @@ def __call__(self, candidate: BrainModel): scores.append(score) # average all scores to get 1 average score mean_score = Score(np.mean(scores)) - mean_score.attrs['error'] = np.mean([score['error'] for score in scores]) + mean_score.attrs['error'] = np.mean([score.error for score in scores]) return mean_score def get_assemblies(self, condition: str): diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index 57165b47f..0702e8cea 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -20,7 +20,6 @@ def test_in_pool(self, dataset): assert identifier in benchmark_registry @pytest.mark.private_access - # TODO: recompute def test_mean_ceiling(self): benchmarks = [f"Malania2007_{dataset}" for dataset in DATASETS] benchmarks = [load_benchmark(benchmark) for benchmark in benchmarks] @@ -30,7 +29,6 @@ def test_mean_ceiling(self): # these test values are for the pooled score ceiling @pytest.mark.private_access - # TODO: ceiling for vernier acuity @pytest.mark.parametrize('dataset, expected_ceiling', [ ('short-2', approx(0.78719345, abs=0.001)), ('short-4', approx(0.49998989, abs=0.001)), @@ -48,53 +46,3 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): benchmark = load_benchmark(benchmark) ceiling = benchmark.ceiling assert ceiling.sel(aggregation='center').values.item() == expected_ceiling - - @pytest.mark.private_access - @pytest.mark.parametrize('dataset, model, 
expected_raw_score', [ - ('short-2', 'resnet-18', approx(0., abs=0.001)), - ('short-4', 'resnet-18', approx(0., abs=0.001)), - ('short-6', 'resnet-18', approx(0., abs=0.001)), - ('short-8', 'resnet-18', approx(0., abs=0.001)), - ('short-16', 'resnet-18', approx(0., abs=0.001)), - ('equal-2', 'resnet-18', approx(0., abs=0.001)), - ('long-2', 'resnet-18', approx(0., abs=0.001)), - ('equal-16', 'resnet-18', approx(0., abs=0.001)), - ('long-16', 'resnet-18', approx(0., abs=0.001)), - ]) - def test_model_8degrees(self, dataset, model, expected_raw_score): - raise Exception("This test needs to be recalculated.") - benchmark = benchmark_registry[f"Malania2007_{dataset}"] - # load features - precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' - precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features) - precomputed_features = PrecomputedFeatures(precomputed_features, - visual_degrees=8, # doesn't matter, features are already computed - ) - # score - score = benchmark(precomputed_features).raw - assert score == expected_raw_score - - @pytest.mark.private_access - @pytest.mark.parametrize('dataset, model, expected_raw_score', [ - ('short-2', 'resnet-18-3deg', approx(0., abs=0.001)), - ('short-4', 'resnet-18-3deg', approx(0., abs=0.001)), - ('short-6', 'resnet-18-3deg', approx(0., abs=0.001)), - ('short-8', 'resnet-18-3deg', approx(0., abs=0.001)), - ('short-16', 'resnet-18-3deg', approx(0., abs=0.001)), - ('equal-2', 'resnet-18-3deg', approx(0., abs=0.001)), - ('long-2', 'resnet-18-3deg', approx(0., abs=0.001)), - ('equal-16', 'resnet-18-3deg', approx(0., abs=0.001)), - ('long-16', 'resnet-18-3deg', approx(0., abs=0.001)), - ]) - def test_model_3degrees(self, dataset, model, expected_raw_score): - raise Exception("This test needs to be recalculated.") - benchmark = benchmark_registry[f"Malania2007_{dataset}"] - # load features - precomputed_features = Path(__file__).parent / f'{model}-Malania2007_{dataset}.nc' - precomputed_features = BehavioralAssembly.from_files(file_path=precomputed_features) - precomputed_features = PrecomputedFeatures(precomputed_features, - visual_degrees=3, # doesn't matter, features are already computed - ) - # score - score = benchmark(precomputed_features).raw - assert score == expected_raw_score diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py index 683bf4b38..3151c6b34 100644 --- a/brainscore_vision/metrics/threshold/metric.py +++ b/brainscore_vision/metrics/threshold/metric.py @@ -157,7 +157,9 @@ def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, source_threshold = self.compute_threshold(source, self._independent_variable) # check whether the psychometric function fit was successful - if not, return a score of 0 if source_threshold == 'fit_fail': - return Score([0., 0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) + score = Score([0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) + score.attrs['error'] = 0. 
+            return score
         else:
             raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.')
         return self.scoring_function(source_threshold, target)

From 3e9b1ff5255d13eda83a02570e72c39e9786d994 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 2 Jul 2024 11:32:07 +0200
Subject: [PATCH 59/65] add alexnet test

---
 .../benchmarks/malania2007/test.py | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py
index 0702e8cea..1354c6a0d 100644
--- a/brainscore_vision/benchmarks/malania2007/test.py
+++ b/brainscore_vision/benchmarks/malania2007/test.py
@@ -1,12 +1,8 @@
-from pathlib import Path
-
 import numpy as np
 import pytest
 from pytest import approx
 
-from brainio.assemblies import BehavioralAssembly
-from brainscore_vision import benchmark_registry, load_benchmark
-from brainscore_vision.benchmark_helpers import PrecomputedFeatures
+from brainscore_vision import benchmark_registry, load_benchmark, load_model
 from brainscore_vision.benchmarks.malania2007.benchmark import DATASETS
@@ -46,3 +42,22 @@ def test_dataset_ceiling(self, dataset, expected_ceiling):
         benchmark = load_benchmark(benchmark)
         ceiling = benchmark.ceiling
         assert ceiling.sel(aggregation='center').values.item() == expected_ceiling
+
+    @pytest.mark.parametrize('dataset, expected_score', [
+        ('short-2', approx(0.0, abs=0.001)),
+        ('short-4', approx(0.0, abs=0.001)),
+        ('short-6', approx(0.0, abs=0.001)),
+        ('short-8', approx(0.0, abs=0.001)),
+        ('short-16', approx(0.0, abs=0.001)),
+        ('equal-2', approx(0.0, abs=0.001)),
+        ('long-2', approx(0.0, abs=0.001)),
+        ('equal-16', approx(0.0, abs=0.001)),
+        ('long-16', approx(0.0, abs=0.001)),
+        ('vernieracuity', approx(0.0, abs=0.001))
+    ])
+    def test_model_score(dataset, expected_score):
+        benchmark = f"Malania2007_{dataset}"
+        benchmark = load_benchmark(benchmark)
+        model = load_model('alexnet')
+        model_score = benchmark(model)
+        assert model_score.values == expected_score
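The new test exercises the full scoring pipeline end to end. Outside of pytest, the equivalent call sequence is roughly the sketch below; benchmark identifiers are as registered at this point in the series, and running it requires private data access and substantial compute, so treat it as illustrative:

from brainscore_vision import load_benchmark, load_model

benchmark = load_benchmark('Malania2007_short-2')
model = load_model('alexnet')
score = benchmark(model)  # ceiled score; raw score and ceiling are attached as attrs
print(float(score.values), score.attrs.get('raw'), score.attrs.get('ceiling'))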
From d1948d36d629a3e73b7f5b825980daa561057405 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Thu, 4 Jul 2024 15:39:13 +0200
Subject: [PATCH 60/65] fix dataset argument error

---
 brainscore_vision/benchmarks/malania2007/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py
index 1354c6a0d..c2e56565a 100644
--- a/brainscore_vision/benchmarks/malania2007/test.py
+++ b/brainscore_vision/benchmarks/malania2007/test.py
@@ -55,7 +55,7 @@ def test_dataset_ceiling(self, dataset, expected_ceiling):
         ('long-16', approx(0.0, abs=0.001)),
         ('vernieracuity', approx(0.0, abs=0.001))
     ])
-    def test_model_score(dataset, expected_score):
+    def test_model_score(self, dataset, expected_score):
         benchmark = f"Malania2007_{dataset}"
         benchmark = load_benchmark(benchmark)
         model = load_model('alexnet')

From f06b6835acf99011fb7b4e8a8fb112514e642094 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Fri, 5 Jul 2024 10:15:40 +0200
Subject: [PATCH 61/65] Apply suggestions from code review

Co-authored-by: Martin Schrimpf
---
 .../benchmarks/malania2007/__init__.py        |  20 +--
 .../benchmarks/malania2007/benchmark.py       |  16 +-
 .../benchmarks/malania2007/test.py            |   8 +-
 .../data/malania2007/__init__.py              |  58 +++----
 brainscore_vision/data/malania2007/test.py    | 162 +++++++++---------
 brainscore_vision/metrics/threshold/metric.py |  26 ++-
 brainscore_vision/metrics/threshold/test.py   |   2 -
 7 files changed, 144 insertions(+), 148 deletions(-)

diff --git a/brainscore_vision/benchmarks/malania2007/__init__.py b/brainscore_vision/benchmarks/malania2007/__init__.py
index a5016e24f..82e6e0e42 100644
--- a/brainscore_vision/benchmarks/malania2007/__init__.py
+++ b/brainscore_vision/benchmarks/malania2007/__init__.py
@@ -1,13 +1,13 @@
 from brainscore_vision import benchmark_registry
 from . import benchmark
 
-benchmark_registry['Malania2007_short-2'] = lambda: benchmark._Malania2007Base('short-2')
-benchmark_registry['Malania2007_short-4'] = lambda: benchmark._Malania2007Base('short-4')
-benchmark_registry['Malania2007_short-6'] = lambda: benchmark._Malania2007Base('short-6')
-benchmark_registry['Malania2007_short-8'] = lambda: benchmark._Malania2007Base('short-8')
-benchmark_registry['Malania2007_short-16'] = lambda: benchmark._Malania2007Base('short-16')
-benchmark_registry['Malania2007_equal-2'] = lambda: benchmark._Malania2007Base('equal-2')
-benchmark_registry['Malania2007_long-2'] = lambda: benchmark._Malania2007Base('long-2')
-benchmark_registry['Malania2007_equal-16'] = lambda: benchmark._Malania2007Base('equal-16')
-benchmark_registry['Malania2007_long-16'] = lambda: benchmark._Malania2007Base('long-16')
-benchmark_registry['Malania2007_vernieracuity'] = lambda: benchmark._Malania2007VernierAcuity()
+benchmark_registry['Malania2007.short2'] = lambda: benchmark._Malania2007Base('short-2')
+benchmark_registry['Malania2007.short4'] = lambda: benchmark._Malania2007Base('short-4')
+benchmark_registry['Malania2007.short6'] = lambda: benchmark._Malania2007Base('short-6')
+benchmark_registry['Malania2007.short8'] = lambda: benchmark._Malania2007Base('short-8')
+benchmark_registry['Malania2007.short16'] = lambda: benchmark._Malania2007Base('short-16')
+benchmark_registry['Malania2007.equal2'] = lambda: benchmark._Malania2007Base('equal-2')
+benchmark_registry['Malania2007.long2'] = lambda: benchmark._Malania2007Base('long-2')
+benchmark_registry['Malania2007.equal16'] = lambda: benchmark._Malania2007Base('equal-16')
+benchmark_registry['Malania2007.long16'] = lambda: benchmark._Malania2007Base('long-16')
+benchmark_registry['Malania2007.vernieracuity'] = lambda: benchmark._Malania2007VernierAcuity()
diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py
index cabbe3cd5..00bd36bbe 100644
--- a/brainscore_vision/benchmarks/malania2007/benchmark.py
+++ b/brainscore_vision/benchmarks/malania2007/benchmark.py
@@ -26,14 +26,14 @@
     url = {https://doi.org/10.1167/7.2.1}
     }"""
 
-BASELINE_CONDITION = 'vernier-only'
-DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', 'long-2', 'equal-16', 'long-16',
+BASELINE_CONDITION = 'vernier_only'
+DATASETS = ['short2', 'short4', 'short6', 'short8', 'short16', 'equal2', 'long2', 'equal16', 'long16',
             'vernieracuity']
 # Values in NUM_FLANKERS_PER_CONDITION denote the condition (i.e., in this case the number of flankers) to be selected
 # This is kept track of simply because the benchmark uses threshold elevation - i.e., a comparison of 2 conditions
-NUM_FLANKERS_PER_CONDITION = {'short-2': 2, 'short-4': 4, 'short-6': 6, 'short-8': 8,
-                              'short-16': 16, 'equal-2': 2, 'long-2': 2, 'equal-16': 16,
-                              'long-16': 16, 'vernier-only': 0}
+NUM_FLANKERS_PER_CONDITION = {'short2': 2, 'short4': 4, 'short6': 6, 'short8': 8,
+                              'short16': 16, 'equal2': 2, 'long2': 2, 'equal16': 16,
+                              'long16': 16, 'vernier_only': 0}
 
@@ -103,7 +103,7 @@ def
__init__(self, condition: str): self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates super(_Malania2007Base, self).__init__( - identifier=f'Malania2007_{condition}', version=1, + identifier=f'Malania2007.{condition}', version=1, ceiling_func=lambda: self._metric.ceiling(self._assemblies), parent='Malania2007', bibtex=BIBTEX) @@ -157,7 +157,7 @@ def __init__(self): self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates super(_Malania2007VernierAcuity, self).__init__( - identifier=f'Malania2007_vernieracuity', version=1, + identifier=f'Malania2007.vernieracuity', version=1, ceiling_func=lambda: self.mean_ceiling(), parent='Malania2007', bibtex=BIBTEX) @@ -214,7 +214,7 @@ def mean_ceiling(self): def load_assembly(dataset: str) -> PropertyAssembly: - assembly = brainscore_vision.load_dataset(f'Malania2007_{dataset}') + assembly = brainscore_vision.load_dataset(f'Malania2007.{dataset}') return assembly diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index c2e56565a..a4ac27984 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -12,12 +12,12 @@ def test_count(self): @pytest.mark.parametrize('dataset', DATASETS) def test_in_pool(self, dataset): - identifier = f"Malania2007_{dataset}" + identifier = f"Malania2007.{dataset}" assert identifier in benchmark_registry @pytest.mark.private_access def test_mean_ceiling(self): - benchmarks = [f"Malania2007_{dataset}" for dataset in DATASETS] + benchmarks = [f"Malania2007.{dataset}" for dataset in DATASETS] benchmarks = [load_benchmark(benchmark) for benchmark in benchmarks] ceilings = [benchmark.ceiling for benchmark in benchmarks] mean_ceiling = np.mean(ceilings) @@ -38,7 +38,7 @@ def test_mean_ceiling(self): ('vernieracuity', approx(0.70168481, abs=0.001)) ]) def test_dataset_ceiling(self, dataset, expected_ceiling): - benchmark = f"Malania2007_{dataset}" + benchmark = f"Malania2007.{dataset}" benchmark = load_benchmark(benchmark) ceiling = benchmark.ceiling assert ceiling.sel(aggregation='center').values.item() == expected_ceiling @@ -56,7 +56,7 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): ('vernieracuity', approx(0.0, abs=0.001)) ]) def test_model_score(self, dataset, expected_score): - benchmark = f"Malania2007_{dataset}" + benchmark = f"Malania2007.{dataset}" benchmark = load_benchmark(benchmark) model = load_model('alexnet') model_score = benchmark(model) diff --git a/brainscore_vision/data/malania2007/__init__.py b/brainscore_vision/data/malania2007/__init__.py index 007840449..e6ecbb5cd 100644 --- a/brainscore_vision/data/malania2007/__init__.py +++ b/brainscore_vision/data/malania2007/__init__.py @@ -18,7 +18,7 @@ }""" -data_registry['Malania2007_equal-2'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.equal2'] = lambda: load_assembly_from_s3( identifier='Malania2007_equal-2', version_id="yFXK8xjGjEmuYTSfS58rGS_ah3.NGg0X", sha1="277b2fbffed00e16b6a69b488f73eeda5abaaf10", @@ -26,7 +26,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_equal-16'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.equal16'] = lambda: load_assembly_from_s3( identifier='Malania2007_equal-16', version_id="SRZ7bs.Ek59GkeS084Pvdy38uTzFs4yw", sha1="ef49506238e8d2554918b113fbc60c133077186e", @@ -34,7 +34,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) 
-data_registry['Malania2007_long-2'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.long2'] = lambda: load_assembly_from_s3( identifier='Malania2007_long-2', version_id="2c1lWuXthb3rymB3seTQX1jVqiKUTn1f", sha1="9076a5b693948c4992b6c8e753f04a7acd2014a1", @@ -42,7 +42,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_long-16'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.long16'] = lambda: load_assembly_from_s3( identifier='Malania2007_long-16', version_id="qshNxhxjgusWyWiXnbfFN6gqjLgRh8fO", sha1="3106cf1f2fa9e66617ebf231df05d29077fc478f", @@ -50,7 +50,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_short-2'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.short2'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-2', version_id="8CQ9MupuljAgkkKUXs3hiOliHg8xoDxb", sha1="85fb65ad76de48033c704b9c5689771e1ea0457d", @@ -58,7 +58,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_short-4'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.short4'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-4', version_id=".ZUO0upSfQrWLPgd4oGwAaCbN4bz6S6H", sha1="75506be9a26ec38a223e41510f1a8cb32d5b0bc9", @@ -66,7 +66,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_short-6'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.short6'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-6', version_id="q4FugpNGkT_FQP..qIVzye83hAQR2xfS", sha1="2901be6b352e67550da040d79d744819365b8626", @@ -74,7 +74,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_short-8'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.short8'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-8', version_id="4_lcRl_I7Mp0RHxcfqZ9tkAZjVh.5oMU", sha1="6daf47b086cb969d75222e320f49453ed8437885", @@ -82,7 +82,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_short-16'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.short16'] = lambda: load_assembly_from_s3( identifier='Malania2007_short-16', version_id="fFqEIyIC9CHzqTEmv0MitjCgpeMX5pxJ", sha1="8ae0898caad718b747f85fce5888416affc3a569", @@ -90,7 +90,7 @@ cls=PropertyAssembly, stimulus_set_loader=None, ) -data_registry['Malania2007_vernier-only'] = lambda: load_assembly_from_s3( +data_registry['Malania2007.vernier_only'] = lambda: load_assembly_from_s3( identifier='Malania2007_vernier-only', version_id="JLWf2pIR_UadQHqwtegJkC6XzWdbSNGi", sha1="1cf83e8b6141f8b0d67ea46994f342325f62001f", @@ -100,7 +100,7 @@ ) -stimulus_set_registry['Malania2007_equal-2'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.equal2'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-2', bucket="brainio-brainscore", csv_sha1="77e94b9b5122a83ebbaffb4a06fcab68ef652751", @@ -108,7 +108,7 @@ csv_version_id="MlRpSz.4.jvVRFAZl8tGEum1P0Q0GtyS", zip_version_id="vHbAM_FjTbjp5U12BkAelJu4KW6PLYFn" ) -stimulus_set_registry['Malania2007_equal-2_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.equal2_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-2_fit', bucket="brainio-brainscore", csv_sha1="bafdfc855c164d3e5443d67dcf9eb7762443f964", @@ -116,7 +116,7 @@ csv_version_id="PIXEW.2vHvjIBP0Q2KHIpnxns7t9o8Cf", zip_version_id="h7pp84CYFGLKlPhveD0L5ogePqisk_I7" ) -stimulus_set_registry['Malania2007_equal-16'] = 
lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.equal16'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-16', bucket="brainio-brainscore", csv_sha1="5fedcff56c302339c3451ae2edbcb846c39c3189", @@ -124,7 +124,7 @@ csv_version_id="VmRGiQkhPALDwq74NpE2VpTiKTGn.30T", zip_version_id="c.DOlVULXZingRJ9gVY_NbZwRrj_xs_i" ) -stimulus_set_registry['Malania2007_equal-16_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.equal16_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_equal-16_fit', bucket="brainio-brainscore", csv_sha1="3de3e5de19a638767a01ba68cb690dc746c29a77", @@ -132,7 +132,7 @@ csv_version_id="joAq8JBC_7axZDfLNFgoXFhTCLU_KKr_", zip_version_id="77JRwdldaHDr6TLW1NnB5HucIrkUCVg." ) -stimulus_set_registry['Malania2007_long-2'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.long2'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-2', bucket="brainio-brainscore", csv_sha1="ba65316a63dc688d8dfb410219a28fd02850b991", @@ -140,7 +140,7 @@ csv_version_id="_0fqObn6k5KvXurHMsuD4IqtrqbNskyo", zip_version_id="foL92ndVAAAETzMYHdmMtwIwKxXYhAB." ) -stimulus_set_registry['Malania2007_long-2_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.long2_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-2_fit', bucket="brainio-brainscore", csv_sha1="b91dd9261c1d47bdd37f9b60eb8066b7b719709f", @@ -148,7 +148,7 @@ csv_version_id="mATh8lcVisdsDnPnU6ACE23iBPfpkLZA", zip_version_id="6nEviShTyCYQKrmxyjDyNov9Skc77eXT" ) -stimulus_set_registry['Malania2007_long-16'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.long16'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-16', bucket="brainio-brainscore", csv_sha1="1f1b03319b81698ba5e7db389dcd4248f94e45ca", @@ -156,7 +156,7 @@ csv_version_id="4RtywQ40hfQA4N80g8lxEScAmMXFRg7E", zip_version_id="lJy2QosABzHtiA6BJaE4OqCn1w1Jhz2k" ) -stimulus_set_registry['Malania2007_long-16_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.long16_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_long-16_fit', bucket="brainio-brainscore", csv_sha1="d80a02c75b9908301c3c8dc9f7116fecf8e060ec", @@ -164,7 +164,7 @@ csv_version_id="gOxY6tjnT7LO.FDeL1xkRmowl5wYeAia", zip_version_id="71UAPTnZscIuqdx2dhuW9V0O0DO_TgTM" ) -stimulus_set_registry['Malania2007_short-2'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short2'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-2', bucket="brainio-brainscore", csv_sha1="bf0252056d2084e855646f624700ab03c19cfc3d", @@ -172,7 +172,7 @@ csv_version_id="zcJqM.ZPwJyiMRWa3RBdvv401yPnLQAp", zip_version_id="C8WZzAAQ0JGHAAKii4JpvlRhcUOhgSj." 
) -stimulus_set_registry['Malania2007_short-2_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short2_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-2_fit', bucket="brainio-brainscore", csv_sha1="73127d279a2cd254ae4f07b0053580851e84b00c", @@ -180,7 +180,7 @@ csv_version_id="iwGRp3_ktAHfJ6r7ktSK9gsthDjKek70", zip_version_id="6RpplJ9UVXTlvhmFSXla0Qa20b44m8Ds" ) -stimulus_set_registry['Malania2007_short-4'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short4'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-4', bucket="brainio-brainscore", csv_sha1="816326d89d358f6592bd1f789e5c8d429fbca1cd", @@ -188,7 +188,7 @@ csv_version_id="Waikk.bktXIncCUtCIAyB2EqynGk.H.F", zip_version_id="rl_muxI4UEpwXVaXuhsqroG..COGILvR" ) -stimulus_set_registry['Malania2007_short-4_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short4_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-4_fit', bucket="brainio-brainscore", csv_sha1="3512cfd029f4e4299bc41ede519e691d80cfc3d5", @@ -196,7 +196,7 @@ csv_version_id="UhisdJqiEmkQ_4zsUtAmaxtle2kMZdcD", zip_version_id="xt_v0xgCB8YUptyPB0yZFHIUcel5MF_x" ) -stimulus_set_registry['Malania2007_short-6'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short6'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-6', bucket="brainio-brainscore", csv_sha1="3d5dd9b48a56ba0c31de94b6221b97df962b6f8a", @@ -204,7 +204,7 @@ csv_version_id="GwGHPJkMDdg8N_.boyj8qJ3ChsEx4w._", zip_version_id="gIN1O4yz.THvK0Ifm5M3AI58ZACE1QFh" ) -stimulus_set_registry['Malania2007_short-6_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short6_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-6_fit', bucket="brainio-brainscore", csv_sha1="27a5be4fca190837fc5b75ed2cdbbffbf6b41338", @@ -212,7 +212,7 @@ csv_version_id="oMlj7wV85s00hJFE84ym0AJHLCfYHVA6", zip_version_id="oS.KrBTlcYAgr_lWyA_bIjVc2js_VeUe" ) -stimulus_set_registry['Malania2007_short-8'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short8'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-8', bucket="brainio-brainscore", csv_sha1="8fc35f607196b4c0cdcebd8102d17e3a637e5988", @@ -220,7 +220,7 @@ csv_version_id="gzys8s7j7euMEl7JJpqBFLFHMpFjwbA7", zip_version_id="3fYb4Iruh3lRKUwC1APqFH4CNbE5DEuk" ) -stimulus_set_registry['Malania2007_short-8_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short8_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-8_fit', bucket="brainio-brainscore", csv_sha1="aa4133a9fe19a3c9004a9cb5e6eb5a72564e4883", @@ -228,7 +228,7 @@ csv_version_id="7N1Z.uiagqBknJUSBQ4mVfHKWgocM5aA", zip_version_id="kcEOPOkvWymO0wX5j_QKxcNPl9sZsjFd" ) -stimulus_set_registry['Malania2007_short-16'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short16'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-16', bucket="brainio-brainscore", csv_sha1="addd260c9959f2f315db03c0a39c6c1b01fef685", @@ -236,7 +236,7 @@ csv_version_id="Peu7WU5vanLoZNOFIAbuPzZNPDRgbCSX", zip_version_id="wFkJkZMC8Fs_HfPJy32CMKcHJWeQIUDB" ) -stimulus_set_registry['Malania2007_short-16_fit'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.short16_fit'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_short-16_fit', bucket="brainio-brainscore", 
csv_sha1="9b340fe242117482f6992f48a805297215ba9924", @@ -244,7 +244,7 @@ csv_version_id="sYBPEmXDgbWipuepciLirlorQE3L8BLc", zip_version_id="pYvOkrLxadkQ67K3__wmciNwaCW.hyyN" ) -stimulus_set_registry['Malania2007_vernier-only'] = lambda: load_stimulus_set_from_s3( +stimulus_set_registry['Malania2007.vernier_only'] = lambda: load_stimulus_set_from_s3( identifier='Malania2007_vernier-only', bucket="brainio-brainscore", csv_sha1="b2cb0f2ed32426b739f90187ae24ad4adf84110d", diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py index 0c23f92ec..d63a882e3 100644 --- a/brainscore_vision/data/malania2007/test.py +++ b/brainscore_vision/data/malania2007/test.py @@ -9,16 +9,16 @@ class TestAssemblies: # test the number of subjects: @pytest.mark.parametrize('identifier, num_subjects', [ - ('short-2', 6), - ('short-4', 5), - ('short-6', 5), - ('short-8', 5), - ('short-16', 6), - ('equal-2', 5), - ('long-2', 5), - ('equal-16', 5), - ('long-16', 5), - ('vernier-only', 6) + ('short2', 6), + ('short4', 5), + ('short6', 5), + ('short8', 5), + ('short16', 6), + ('equal2', 5), + ('long2', 5), + ('equal16', 5), + ('long16', 5), + ('vernier_only', 6) ]) def test_num_subjects(self, identifier, num_subjects): assembly = load_dataset(f"Malania2007_{identifier}") @@ -27,22 +27,22 @@ def test_num_subjects(self, identifier, num_subjects): # test assembly coords present in ALL 17 sets: @pytest.mark.parametrize('identifier', [ - 'short-2', - 'short-4', - 'short-6', - 'short-8', - 'short-16', - 'equal-2', - 'long-2', - 'equal-16', - 'long-16', - 'vernier-only', + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'vernier_only', ]) @pytest.mark.parametrize('field', [ 'subject' ]) def test_fields_present(self, identifier, field): - assembly = load_dataset(f"Malania2007_{identifier}") + assembly = load_dataset(f"Malania2007.{identifier}") assert hasattr(assembly, field) @@ -50,78 +50,78 @@ def test_fields_present(self, identifier, field): class TestStimulusSets: # test stimulus_set data: @pytest.mark.parametrize('identifier', [ - 'short-2', - 'short-4', - 'short-6', - 'short-8', - 'short-16', - 'equal-2', - 'long-2', - 'equal-16', - 'long-16', - 'short-2_fit', - 'short-4_fit', - 'short-6_fit', - 'short-8_fit', - 'short-16_fit', - 'equal-2_fit', - 'long-2_fit', - 'equal-16_fit', - 'long-16_fit', - 'vernier-only' + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'short2_fit', + 'short4_fit', + 'short6_fit', + 'short8_fit', + 'short16_fit', + 'equal2_fit', + 'long2_fit', + 'equal16_fit', + 'long16_fit', + 'vernier_only' ]) def test_stimulus_set_exist(self, identifier): - full_name = f"Malania2007_{identifier}" + full_name = f"Malania2007.{identifier}" stimulus_set = load_stimulus_set(full_name) assert stimulus_set is not None assert stimulus_set.identifier == full_name @pytest.mark.parametrize('identifier, num_images', [ - ('short-2', 50), - ('short-4', 50), - ('short-6', 50), - ('short-8', 50), - ('short-16', 50), - ('equal-2', 50), - ('long-2', 50), - ('equal-16', 50), - ('long-16', 50), - ('short-2_fit', 500), - ('short-4_fit', 500), - ('short-6_fit', 500), - ('short-8_fit', 500), - ('short-16_fit', 500), - ('equal-2_fit', 500), - ('long-2_fit', 500), - ('equal-16_fit', 500), - ('long-16_fit', 500), - ('vernier-only', 50) + ('short2', 50), + ('short4', 50), + ('short6', 50), + ('short8', 50), + ('short16', 50), + ('equal2', 50), + ('long2', 50), + ('equal16', 
50), + ('long16', 50), + ('short2_fit', 500), + ('short4_fit', 500), + ('short6_fit', 500), + ('short8_fit', 500), + ('short16_fit', 500), + ('equal2_fit', 500), + ('long2_fit', 500), + ('equal16_fit', 500), + ('long16_fit', 500), + ('vernier_only', 50) ]) def test_number_of_images(self, identifier, num_images): - stimulus_set = load_stimulus_set(f"Malania2007_{identifier}") + stimulus_set = load_stimulus_set(f"Malania2007.{identifier}") assert len(np.unique(stimulus_set['stimulus_id'].values)) == num_images # tests stimulus_set coords for the 14 "normal" sets: @pytest.mark.parametrize('identifier', [ - 'short-2', - 'short-4', - 'short-6', - 'short-8', - 'short-16', - 'equal-2', - 'long-2', - 'equal-16', - 'long-16', - 'short-2_fit', - 'short-4_fit', - 'short-6_fit', - 'short-8_fit', - 'short-16_fit', - 'equal-2_fit', - 'long-2_fit', - 'equal-16_fit', - 'long-16_fit', - 'vernier-only' + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'short2_fit', + 'short4_fit', + 'short6_fit', + 'short8_fit', + 'short16_fit', + 'equal2_fit', + 'long2_fit', + 'equal16_fit', + 'long16_fit', + 'vernier_only' ]) @pytest.mark.parametrize('field', [ 'image_size_x_pix', @@ -141,5 +141,5 @@ def test_number_of_images(self, identifier, num_images): 'stimulus_id', ]) def test_fields_present(self, identifier, field): - stimulus_set = load_stimulus_set(f"Malania2007_{identifier}") + stimulus_set = load_stimulus_set(f"Malania2007.{identifier}") assert hasattr(stimulus_set, field) diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py index 3151c6b34..e27f4d559 100644 --- a/brainscore_vision/metrics/threshold/metric.py +++ b/brainscore_vision/metrics/threshold/metric.py @@ -157,7 +157,7 @@ def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, source_threshold = self.compute_threshold(source, self._independent_variable) # check whether the psychometric function fit was successful - if not, return a score of 0 if source_threshold == 'fit_fail': - score = Score([0.], coords={'aggregation': ['center', ]}, dims=['aggregation']) + score = Score(0.) score.attrs['error'] = 0. 
return score else: @@ -185,12 +185,11 @@ def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]] self.threshold_accuracy) human_thresholds.remove(random_human_score) score = metric(random_human_score, human_thresholds) - score = float(score[(score['aggregation'] == 'center')].values) human_thresholds.append(random_human_score) - scores.append(score) + scores.append(score.values) ceiling, ceiling_error = np.mean(scores), np.std(scores) - ceiling = Score([ceiling], coords={'aggregation': ['center']}, dims=['aggregation']) + ceiling = Score(ceiling) ceiling.attrs['error'] = ceiling_error return ceiling @@ -322,11 +321,10 @@ def scoring_function(source: float, target: Union[list, PropertyAssembly]) -> Sc raw_score = max((1 - ((np.abs(target_value - source)) / target_value)), 0) raw_scores.append(raw_score) - raw_score, model_error = np.mean(raw_scores), np.std(raw_scores) - # add the aggregation: center coordinate to the score - score = Score([np.mean(raw_scores)], coords={'aggregation': ['center']}, dims=['aggregation']) - score.attrs['raw'] = raw_score - score.attrs['error'] = model_error + scores_mean, scores_std = np.mean(raw_scores), np.std(raw_scores) + score = Score(scores_mean) + score.attrs['raw'] = raw_scores + score.attrs['error'] = scores_std return score @staticmethod @@ -454,7 +452,7 @@ def __call__(self, source_test_threshold = self.test_threshold_metric.compute_threshold(source[self.test_condition], self._independent_variable) if source_baseline_threshold == 'fit_fail' or source_test_threshold == 'fit_fail': - return Score([0., 0.], coords={'aggregation': ['center', 'error']}, dims=['aggregation']) + return Score(0.) # psychometric function could not be fit -- this typically means that the model is at chance throughout raw_source_threshold_elevation = source_test_threshold / source_baseline_threshold else: raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') @@ -489,17 +487,17 @@ def ceiling(self, assemblies: Dict[str, PropertyAssembly]) -> Score: self.threshold_accuracy) human_threshold_elevations.remove(random_human_score) score = metric(random_human_score, human_threshold_elevations) - score = float(score[(score['aggregation'] == 'center')].values) human_threshold_elevations.append(random_human_score) - scores.append(score) + scores.append(score.values) ceiling, ceiling_error = np.mean(scores), np.std(scores) - ceiling = Score([ceiling], coords={'aggregation': ['center']}, dims=['aggregation']) + ceiling = Score(ceiling) + ceiling.attrs['raw'] = scores ceiling.attrs['error'] = ceiling_error return ceiling @staticmethod - def compute_threshold_elevations(assemblies: Dict[str, PropertyAssembly]) -> list: + def compute_threshold_elevations(assemblies: Dict[str, PropertyAssembly]) -> List: """ Computes the threshold elevations of a baseline condition and a test condition: diff --git a/brainscore_vision/metrics/threshold/test.py b/brainscore_vision/metrics/threshold/test.py index 7253d1cd9..84f0c7214 100644 --- a/brainscore_vision/metrics/threshold/test.py +++ b/brainscore_vision/metrics/threshold/test.py @@ -9,7 +9,6 @@ def test_threshold_score_from_thresholds(): # independent_variable is not used since we compute from thresholds, and do not need to fit them metric = load_metric('threshold', independent_variable='placeholder') score = metric(float(assembly.sel(subject='A').values), assembly) - print(score) assert score == approx(0.5625) @@ -21,7 +20,6 @@ def 
test_threshold_elevation_score_from_threshold_elevations(): baseline_condition='placeholder', test_condition='placeholder') score = metric(float(assembly.sel(subject='A').values), assembly) - print(score) assert score == approx(0.525) From 6e322e338fe2a7b1c19751ab323cd43310b4cc1a Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 5 Jul 2024 11:50:15 +0200 Subject: [PATCH 62/65] add renaming suggestions --- .../benchmarks/malania2007/__init__.py | 20 ++++---- .../benchmarks/malania2007/benchmark.py | 33 +++++-------- .../benchmarks/malania2007/test.py | 40 ++++++++-------- brainscore_vision/metrics/threshold/metric.py | 48 ++++--------------- 4 files changed, 52 insertions(+), 89 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/__init__.py b/brainscore_vision/benchmarks/malania2007/__init__.py index 82e6e0e42..7c6a3bbd1 100644 --- a/brainscore_vision/benchmarks/malania2007/__init__.py +++ b/brainscore_vision/benchmarks/malania2007/__init__.py @@ -1,13 +1,13 @@ from brainscore_vision import benchmark_registry from . import benchmark -benchmark_registry['Malania2007.short2'] = lambda: benchmark._Malania2007Base('short-2') -benchmark_registry['Malania2007.short4'] = lambda: benchmark._Malania2007Base('short-4') -benchmark_registry['Malania2007.short6'] = lambda: benchmark._Malania2007Base('short-6') -benchmark_registry['Malania2007.short8'] = lambda: benchmark._Malania2007Base('short-8') -benchmark_registry['Malania2007.short16'] = lambda: benchmark._Malania2007Base('short-16') -benchmark_registry['Malania2007.equal2'] = lambda: benchmark._Malania2007Base('equal-2') -benchmark_registry['Malania2007.long2'] = lambda: benchmark._Malania2007Base('long-2') -benchmark_registry['Malania2007.equal16'] = lambda: benchmark._Malania2007Base('equal-16') -benchmark_registry['Malania2007.long16'] = lambda: benchmark._Malania2007Base('long-16') -benchmark_registry['Malania2007.vernieracuity'] = lambda: benchmark._Malania2007VernierAcuity() +benchmark_registry['Malania2007.short2-threshold_elevation'] = lambda: benchmark._Malania2007Base('short2') +benchmark_registry['Malania2007.short4-threshold_elevation'] = lambda: benchmark._Malania2007Base('short4') +benchmark_registry['Malania2007.short6-threshold_elevation'] = lambda: benchmark._Malania2007Base('short6') +benchmark_registry['Malania2007.short8-threshold_elevation'] = lambda: benchmark._Malania2007Base('short8') +benchmark_registry['Malania2007.short16-threshold_elevation'] = lambda: benchmark._Malania2007Base('short16') +benchmark_registry['Malania2007.equal2-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal2') +benchmark_registry['Malania2007.long2-threshold_elevation'] = lambda: benchmark._Malania2007Base('long2') +benchmark_registry['Malania2007.equal16-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal16') +benchmark_registry['Malania2007.long16-threshold_elevation'] = lambda: benchmark._Malania2007Base('long16') +benchmark_registry['Malania2007.vernieracuity-threshold'] = lambda: benchmark._Malania2007VernierAcuity() diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index 00bd36bbe..b9b1e7aba 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -10,8 +10,6 @@ from brainscore_vision.utils import LazyLoad from brainscore_core.metrics import Score -from brainio.packaging import write_netcdf - BIBTEX = """@article{malania2007, author = {Malania, Maka 
and Herzog, Michael H. and Westheimer, Gerald}, @@ -27,8 +25,10 @@ }""" BASELINE_CONDITION = 'vernier_only' -DATASETS = ['short2', 'short4', 'short6', 'short8', 'short16', 'equal2', 'long2', 'equal16', 'long16', - 'vernieracuity'] +DATASETS = ['short2-threshold_elevation', 'short4-threshold_elevation', 'short6-threshold_elevation', + 'short8-threshold_elevation', 'short16-threshold_elevation', 'equal2-threshold_elevation', + 'long2-threshold_elevation', 'equal16-threshold_elevation', 'long16-threshold_elevation', + 'vernieracuity-threshold'] # Values in NUM_FLANKERS_PER_CONDITION denote the condition (i.e., in this case the number of flankers) to be selected # This is kept track of simply because the benchmark uses threshold elevation - i.e., a comparison of 2 conditions NUM_FLANKERS_PER_CONDITION = {'short2': 2, 'short4': 4, 'short6': 6, 'short8': 8, @@ -87,11 +87,11 @@ def __init__(self, condition: str): self._assemblies = {'baseline_assembly': self._baseline_assembly, 'condition_assembly': self._assembly} - self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.condition}') - self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.baseline_condition}') + self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}'.rstrip('-threshold_elevation')) + self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.baseline_condition}'.rstrip('-threshold_elevation')) self._stimulus_sets = {self.condition: self._stimulus_set, self.baseline_condition: self._baseline_stimulus_set} - self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007_{self.condition}_fit') + self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007_{self.condition}'.rstrip('-threshold_elevation') + '_fit') self._metric = load_metric('threshold_elevation', independent_variable='image_label', @@ -127,10 +127,6 @@ def __call__(self, candidate: BrainModel): ceiling = self.ceiling score = raw_score / ceiling - # cap score at 1 if ceiled score > 1 - if score[(score['aggregation'] == 'center')] > 1: - score.__setitem__({'aggregation': score['aggregation'] == 'center'}, 1) - score.attrs['raw'] = raw_score score.attrs['ceiling'] = ceiling return score @@ -140,14 +136,14 @@ class _Malania2007VernierAcuity(BenchmarkBase): def __init__(self): self.baseline_condition = BASELINE_CONDITION self.conditions = DATASETS.copy() - self.conditions.remove('vernieracuity') + self.conditions.remove('vernieracuity-threshold') self._assemblies = {condition: {'baseline_assembly': self.get_assemblies(condition)['baseline_assembly'], 'condition_assembly': self.get_assemblies(condition)['condition_assembly']} for condition in self.conditions} self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.baseline_condition}') - self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(f'Malania2007_{condition}_fit') - for condition in self.conditions} + self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(f'Malania2007_{condition}'.rstrip('-threshold_elevation') + '_fit') + for condition in self.conditions} self._metric = load_metric('threshold', independent_variable='image_label', @@ -157,7 +153,7 @@ def __init__(self): self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates super(_Malania2007VernierAcuity, self).__init__( - identifier=f'Malania2007.vernieracuity', version=1, + identifier=f'Malania2007.vernieracuity-threshold', 
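            # n.b. on the identifier handling above: str.rstrip strips a trailing
            # *character set*, not a suffix, so .rstrip('-threshold_elevation')
            # only acts like suffix removal because each identifier happens to end
            # in a character outside that set. A safer sketch (str.removesuffix is
            # available from Python 3.9; the helper name is illustrative only):
            #     def strip_metric_suffix(identifier: str) -> str:
            #         return identifier.removesuffix('-threshold_elevation')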
version=1, ceiling_func=lambda: self.mean_ceiling(), parent='Malania2007', bibtex=BIBTEX) @@ -181,10 +177,6 @@ def __call__(self, candidate: BrainModel): score = raw_score / ceiling score.attrs['error'] = raw_score.error - # cap score at 1 if ceiled score > 1 - if score[(score['aggregation'] == 'center')] > 1: - score.__setitem__({'aggregation': score['aggregation'] == 'center'}, 1) - score.attrs['raw'] = raw_score score.attrs['ceiling'] = ceiling scores.append(score) @@ -194,6 +186,7 @@ def __call__(self, candidate: BrainModel): return mean_score def get_assemblies(self, condition: str): + condition = condition.rstrip('-threshold_elevation') baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition)) condition_assembly = LazyLoad(lambda: load_assembly(condition)) assembly, baseline_assembly = filter_baseline_subjects(condition_assembly, @@ -208,7 +201,7 @@ def mean_ceiling(self): this_ceiling = self._metric.ceiling(self._assemblies[assembly_name]['baseline_assembly']) ceilings.append(this_ceiling.values) errors.append(this_ceiling.error) - mean_ceiling = Score(np.mean(ceilings), coords={'aggregation': ['center']}, dims=['aggregation']) + mean_ceiling = Score(np.mean(ceilings)) mean_ceiling.attrs['error'] = np.mean(errors) return mean_ceiling diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index a4ac27984..d6bb48e7f 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -26,16 +26,16 @@ def test_mean_ceiling(self): # these test values are for the pooled score ceiling @pytest.mark.private_access @pytest.mark.parametrize('dataset, expected_ceiling', [ - ('short-2', approx(0.78719345, abs=0.001)), - ('short-4', approx(0.49998989, abs=0.001)), - ('short-6', approx(0.50590051, abs=0.001)), - ('short-8', approx(0.4426336, abs=0.001)), - ('short-16', approx(0.8383443, abs=0.001)), - ('equal-2', approx(0.56664015, abs=0.001)), - ('long-2', approx(0.46470421, abs=0.001)), - ('equal-16', approx(0.44087153, abs=0.001)), - ('long-16', approx(0.50996587, abs=0.001)), - ('vernieracuity', approx(0.70168481, abs=0.001)) + ('short2-threshold_elevation', approx(0.78719345, abs=0.001)), + ('short4-threshold_elevation', approx(0.49998989, abs=0.001)), + ('short6-threshold_elevation', approx(0.50590051, abs=0.001)), + ('short8-threshold_elevation', approx(0.4426336, abs=0.001)), + ('short16-threshold_elevation', approx(0.8383443, abs=0.001)), + ('equal2-threshold_elevation', approx(0.56664015, abs=0.001)), + ('long2-threshold_elevation', approx(0.46470421, abs=0.001)), + ('equal16-threshold_elevation', approx(0.44087153, abs=0.001)), + ('long16-threshold_elevation', approx(0.50996587, abs=0.001)), + ('vernieracuity-threshold', approx(0.70168481, abs=0.001)) ]) def test_dataset_ceiling(self, dataset, expected_ceiling): benchmark = f"Malania2007.{dataset}" @@ -44,16 +44,16 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): assert ceiling.sel(aggregation='center').values.item() == expected_ceiling @pytest.mark.parametrize('dataset, expected_score', [ - ('short-2', approx(0.0, abs=0.001)), - ('short-4', approx(0.0, abs=0.001)), - ('short-6', approx(0.0, abs=0.001)), - ('short-8', approx(0.0, abs=0.001)), - ('short-16', approx(0.0, abs=0.001)), - ('equal-2', approx(0.0, abs=0.001)), - ('long-2', approx(0.0, abs=0.001)), - ('equal-16', approx(0.0, abs=0.001)), - ('long-16', approx(0.0, abs=0.001)), - ('vernieracuity', approx(0.0, abs=0.001)) + 
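        # each tuple is (benchmark identifier suffix, expected ceiled score), where
        # the ceiled score is raw_score / ceiling and the threshold metric returns
        # Score(0.) whenever the psychometric function cannot be fit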
('short2-threshold_elevation', approx(0.0, abs=0.001)), + ('short4-threshold_elevation', approx(0.0, abs=0.001)), + ('short6-threshold_elevation', approx(0.0, abs=0.001)), + ('short8-threshold_elevation', approx(0.0, abs=0.001)), + ('short16-threshold_elevation', approx(0.0, abs=0.001)), + ('equal2-threshold_elevation', approx(0.0, abs=0.001)), + ('long2-threshold_elevation', approx(0.0, abs=0.001)), + ('equal16-threshold_elevation', approx(0.0, abs=0.001)), + ('long16-threshold_elevation', approx(0.0, abs=0.001)), + ('vernieracuity-threshold', approx(0.0, abs=0.001)) ]) def test_model_score(self, dataset, expected_score): benchmark = f"Malania2007.{dataset}" diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py index e27f4d559..68273af55 100644 --- a/brainscore_vision/metrics/threshold/metric.py +++ b/brainscore_vision/metrics/threshold/metric.py @@ -121,8 +121,7 @@ def __init__(self, fit_function=psychometric_cum_gauss, fit_inverse_function=inverse_psychometric_cum_gauss, threshold_accuracy: Union[str, float] = 'inflection', - required_accuracy: Optional[float] = 0.6, - plot_fit: bool = False + required_accuracy: Optional[float] = 0.6 ): """ :param independent_variable: The independent variable in the benchmark that the threshold is computed @@ -134,13 +133,13 @@ def __init__(self, is used, the function finds the inflection point of the curve and evaluates the threshold at that level. When a float is used, the function evaluates the threshold at that level. + :param required_accuracy: The minimum accuracy required for the psychometric function fit to be considered. """ self.fit_function = fit_function self.fit_inverse_function = fit_inverse_function self._independent_variable = independent_variable self.threshold_accuracy = threshold_accuracy self.required_accuracy = required_accuracy - self.plot_fit = plot_fit def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, PropertyAssembly]) -> Score: """ @@ -248,13 +247,6 @@ def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Unio print('Fit fail due to low fit R^2.') params = 'fit_fail' - if self.plot_fit: - self.plot_fit_(x_points, - aggregated_x_points, - y_points, - aggregated_y_points, - params, - fit_function=self.fit_function) return params, measurement_max def find_threshold(self, threshold_accuracy: float, fit_params: Tuple[float, ...]) -> float: @@ -275,27 +267,6 @@ def inflection_accuracy(self, x_points: np.array, fit_params: np.array) -> float threshold_accuracy = min_fit_accuracy + (max_fit_accuracy - min_fit_accuracy) / 2 return threshold_accuracy - def plot_fit_(self, x_points, x_points_removed, y_points, y_points_removed, fit_params, fit_function): - # Create a dense set of x values for plotting the fitted curve - x_dense = np.linspace(min(x_points), max(x_points), 1000) - # Calculate the corresponding y values using the fit function and parameters - y_dense = fit_function(x_dense, *fit_params) - - # Plot the original data points - plt.scatter(x_points, y_points, label='Before asymptote removal', - marker='o', color='blue', alpha=0.5) - plt.scatter(x_points_removed, y_points_removed, label='After asymptote removal', - marker='o', color='red', alpha=0.5) - - # Plot the fitted curve - plt.plot(x_dense, y_dense, label='Fitted curve', color='red', linewidth=2) - - # Add labels and a legend - plt.xlabel(self._independent_variable) - plt.ylabel('Accuracy') - plt.legend() - plt.show() - @staticmethod def 
aggregate_psychometric_fit_data(x_points, y_points): unique_x = np.unique(x_points) @@ -396,8 +367,7 @@ def __init__(self, test_condition: str, threshold_accuracy: Union[str, float] = 'inflection', required_baseline_accuracy: Optional[float] = 0.6, - required_test_accuracy: Optional[float] = 0.6, - plot_fit: bool = False + required_test_accuracy: Optional[float] = 0.6 ): """ :param independent_variable: The independent variable in the benchmark that the threshold is computed @@ -409,18 +379,18 @@ def __init__(self, is used, the function finds the inflection point of the curve and evaluates the threshold at that level. When a float is used, the function evaluates the threshold at that level. - :param scoring: The scoring function used to evaluate performance. Either Literal['individual'] or - Literal['pool']. See the scoring_function and pool_score methods for more information. + :param required_baseline_accuracy: The minimum accuracy required for the psychometric function fit to be + considered for the baseline condition. + :param required_test_accuracy: The minimum accuracy required for the psychometric function fit to be + considered for the test condition. """ super(ThresholdElevation, self).__init__(independent_variable) self.baseline_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy, - required_accuracy=required_baseline_accuracy, - plot_fit=plot_fit) + required_accuracy=required_baseline_accuracy) self.test_threshold_metric = Threshold(self._independent_variable, threshold_accuracy=threshold_accuracy, - required_accuracy=required_test_accuracy, - plot_fit=plot_fit) + required_accuracy=required_test_accuracy) self.baseline_condition = baseline_condition self.test_condition = test_condition self.threshold_accuracy = threshold_accuracy From c20bb5b7a12286d79d7a80d030edeeb01b07ecd5 Mon Sep 17 00:00:00 2001 From: Ben Lonnqvist Date: Fri, 5 Jul 2024 12:08:23 +0200 Subject: [PATCH 63/65] fix naming issues --- brainscore_vision/benchmarks/malania2007/benchmark.py | 8 ++++---- brainscore_vision/metrics/threshold/metric.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py index b9b1e7aba..7ad587b4d 100644 --- a/brainscore_vision/benchmarks/malania2007/benchmark.py +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -88,10 +88,10 @@ def __init__(self, condition: str): self._assemblies = {'baseline_assembly': self._baseline_assembly, 'condition_assembly': self._assembly} self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}'.rstrip('-threshold_elevation')) - self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.baseline_condition}'.rstrip('-threshold_elevation')) + self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}'.rstrip('-threshold_elevation')) self._stimulus_sets = {self.condition: self._stimulus_set, self.baseline_condition: self._baseline_stimulus_set} - self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007_{self.condition}'.rstrip('-threshold_elevation') + '_fit') + self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}'.rstrip('-threshold_elevation') + '_fit') self._metric = load_metric('threshold_elevation', independent_variable='image_label', @@ -141,8 +141,8 @@ def __init__(self): self._assemblies = {condition: 
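            # maps each test condition to its baseline/condition assembly pair; note
            # that get_assemblies() runs twice per condition, duplicating the subject
            # filtering -- one call per condition would suffice, e.g. (sketch only):
            #     self._assemblies = {c: self.get_assemblies(c) for c in self.conditions}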
{'baseline_assembly': self.get_assemblies(condition)['baseline_assembly'], 'condition_assembly': self.get_assemblies(condition)['condition_assembly']} for condition in self.conditions} - self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007_{self.baseline_condition}') - self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(f'Malania2007_{condition}'.rstrip('-threshold_elevation') + '_fit') + self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}') + self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(f'Malania2007.{condition}'.rstrip('-threshold_elevation') + '_fit') for condition in self.conditions} self._metric = load_metric('threshold', diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py index 68273af55..2b04270e3 100644 --- a/brainscore_vision/metrics/threshold/metric.py +++ b/brainscore_vision/metrics/threshold/metric.py @@ -1,4 +1,4 @@ -from typing import Dict, Union, Tuple, Optional, Callable +from typing import Dict, Union, Tuple, List, Optional, Callable import numpy as np from scipy.optimize import minimize From faa1c7d782dedc955c56adee8e9262893be8138f Mon Sep 17 00:00:00 2001 From: Martin Schrimpf Date: Sat, 6 Jul 2024 01:40:38 -0400 Subject: [PATCH 64/65] remove out-dated aggregation dim in test --- brainscore_vision/benchmarks/malania2007/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py index d6bb48e7f..8a8ca75cd 100644 --- a/brainscore_vision/benchmarks/malania2007/test.py +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -41,7 +41,7 @@ def test_dataset_ceiling(self, dataset, expected_ceiling): benchmark = f"Malania2007.{dataset}" benchmark = load_benchmark(benchmark) ceiling = benchmark.ceiling - assert ceiling.sel(aggregation='center').values.item() == expected_ceiling + assert ceiling == expected_ceiling @pytest.mark.parametrize('dataset, expected_score', [ ('short2-threshold_elevation', approx(0.0, abs=0.001)), From d10930fe870bc22edf97814874522d1e48b06d03 Mon Sep 17 00:00:00 2001 From: Martin Schrimpf Date: Sat, 6 Jul 2024 01:44:12 -0400 Subject: [PATCH 65/65] fix identifier testing --- brainscore_vision/data/malania2007/test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py index d63a882e3..faf216749 100644 --- a/brainscore_vision/data/malania2007/test.py +++ b/brainscore_vision/data/malania2007/test.py @@ -21,7 +21,7 @@ class TestAssemblies: ('vernier_only', 6) ]) def test_num_subjects(self, identifier, num_subjects): - assembly = load_dataset(f"Malania2007_{identifier}") + assembly = load_dataset(f"Malania2007.{identifier}") assembly = assembly.dropna(dim='subject') assert len(np.unique(assembly['subject'].values)) == num_subjects @@ -74,7 +74,9 @@ def test_stimulus_set_exist(self, identifier): full_name = f"Malania2007.{identifier}" stimulus_set = load_stimulus_set(full_name) assert stimulus_set is not None - assert stimulus_set.identifier == full_name + stripped_actual_identifier = stimulus_set.identifier.replace('.', '').replace('_', '').replace('-', '') + stripped_expected_identifier = full_name.replace('.', '').replace('_', '').replace('-', '') + assert stripped_actual_identifier == stripped_expected_identifier @pytest.mark.parametrize('identifier, num_images', [ 
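        # each tuple is (stimulus set identifier suffix, expected number of unique
        # stimuli): test conditions ship 50 stimuli each, while the corresponding
        # '_fit' training sets ship 500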
('short2', 50),