diff --git a/brainscore_vision/benchmarks/malania2007/__init__.py b/brainscore_vision/benchmarks/malania2007/__init__.py new file mode 100644 index 000000000..7c6a3bbd1 --- /dev/null +++ b/brainscore_vision/benchmarks/malania2007/__init__.py @@ -0,0 +1,13 @@ +from brainscore_vision import benchmark_registry +from . import benchmark + +benchmark_registry['Malania2007.short2-threshold_elevation'] = lambda: benchmark._Malania2007Base('short2') +benchmark_registry['Malania2007.short4-threshold_elevation'] = lambda: benchmark._Malania2007Base('short4') +benchmark_registry['Malania2007.short6-threshold_elevation'] = lambda: benchmark._Malania2007Base('short6') +benchmark_registry['Malania2007.short8-threshold_elevation'] = lambda: benchmark._Malania2007Base('short8') +benchmark_registry['Malania2007.short16-threshold_elevation'] = lambda: benchmark._Malania2007Base('short16') +benchmark_registry['Malania2007.equal2-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal2') +benchmark_registry['Malania2007.long2-threshold_elevation'] = lambda: benchmark._Malania2007Base('long2') +benchmark_registry['Malania2007.equal16-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal16') +benchmark_registry['Malania2007.long16-threshold_elevation'] = lambda: benchmark._Malania2007Base('long16') +benchmark_registry['Malania2007.vernieracuity-threshold'] = lambda: benchmark._Malania2007VernierAcuity() diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py new file mode 100644 index 000000000..7ad587b4d --- /dev/null +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -0,0 +1,224 @@ +from typing import Tuple +import numpy as np + +import brainscore_vision +from brainio.assemblies import PropertyAssembly +from brainscore_vision.benchmarks import BenchmarkBase +from brainscore_vision.benchmark_helpers.screen import place_on_screen +from brainscore_vision import load_metric +from brainscore_vision.model_interface import BrainModel +from brainscore_vision.utils import LazyLoad +from brainscore_core.metrics import Score + + +BIBTEX = """@article{malania2007, + author = {Malania, Maka and Herzog, Michael H. and Westheimer, Gerald}, + title = "{Grouping of contextual elements that affect vernier thresholds}", + journal = {Journal of Vision}, + volume = {7}, + number = {2}, + pages = {1-1}, + year = {2007}, + issn = {1534-7362}, + doi = {10.1167/7.2.1}, + url = {https://doi.org/10.1167/7.2.1} + }""" + +BASELINE_CONDITION = 'vernier_only' +DATASETS = ['short2-threshold_elevation', 'short4-threshold_elevation', 'short6-threshold_elevation', + 'short8-threshold_elevation', 'short16-threshold_elevation', 'equal2-threshold_elevation', + 'long2-threshold_elevation', 'equal16-threshold_elevation', 'long16-threshold_elevation', + 'vernieracuity-threshold'] +# Values in NUM_FLANKERS_PER_CONDITION denote the condition (i.e., in this case the number of flankers) to be selected +# This is kept track of simply because the benchmark uses threshold elevation - i.e., a comparison of 2 conditions +NUM_FLANKERS_PER_CONDITION = {'short2': 2, 'short4': 4, 'short6': 6, 'short8': 8, + 'short16': 16, 'equal2': 2, 'long2': 2, 'equal16': 16, + 'long16': 16, 'vernier_only': 0} + + +class _Malania2007Base(BenchmarkBase): + """ + INFORMATION: + + Benchmark DATASETS should be considered as independent. This means that participant-specific across-condition data + should only ever be compared using the 'subject_unique_id'. 
In some conditions (short-2, vernier_only, short-16),
+    an additional observer was added based on the original paper's plots. In these conditions, two
+    experiments were conducted independently, and one additional observer who participated in only one of the
+    two experiments was added to the aggregate benchmark.
+
+    While humans and models perform the same testing task in this benchmark, a number of choices made in this
+    benchmark deviate slightly from the human experiment. These choices are listed below, alongside the reason
+    for each departure and what the 'precisely faithful' alternative would have been.
+
+    Benchmark Choices:
+
+    1) The number and type of fitting stimuli are choices not grounded in the human experiment. Currently, the
+       number of fitting stimuli is chosen to be relatively large, and hopefully sufficient for decoding in the
+       baseline condition in general.
+        - Precisely faithful alternative: Present text instructions to models as they were presented to humans.
+            * Why not this alternative? Since the experiment is about early visual perception, and there are
+              currently few/no models capable of a task like this, it would not be interesting.
+        - Somewhat faithful alternative: Present a smaller number of training stimuli, motivated by work like
+          Lee & DiCarlo (2023), bioRxiv (https://doi.org/10.1101/2022.12.31.522402).
+            * Why not this alternative? Since the experiment is not about perceptual learning but about early
+              visual perception, and there are few/no models capable of a task like this, it would not be
+              interesting.
+        - Importantly, this means the benchmark examines the models' capability to support a task like this,
+          rather than their capability to learn a task like this.
+    2) In the human experiment, stimuli were presented at exactly the foveal position. In the model experiment,
+       testing stimuli are presented at the foveal position ±72 arcsec = 0.02 deg.
+        * Why this alternative? Since most models evaluated are deterministic at test time, we want a more
+          precise estimate of the threshold than a single point estimate. Since small human microsaccades are
+          generally uncontrolled and cannot be controlled for (e.g., up to 360 arcsec = 6 arcmin = 0.1 deg), we
+          believe the tiny jitter of 0.02 deg has no impact on the comparison under study, while improving the
+          precision of threshold estimates.
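+
+    Example (illustrative; identifiers as registered in this plugin's __init__.py):
+
+        benchmark = brainscore_vision.load_benchmark('Malania2007.short2-threshold_elevation')
+        score = benchmark(model)  # `model` is any BrainModel supporting the probabilities task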
+ + """ + def __init__(self, condition: str): + self.baseline_condition = BASELINE_CONDITION + self.condition = condition + + # since this benchmark compares threshold elevation against a baseline, we omit one subject + # in some conditions in which that subject did not perform both the baseline and the test + # condition + baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition)) + condition_assembly = LazyLoad(lambda: load_assembly(self.condition)) + self._assembly, self._baseline_assembly = filter_baseline_subjects(condition_assembly, + baseline_assembly) + + self._assemblies = {'baseline_assembly': self._baseline_assembly, + 'condition_assembly': self._assembly} + self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}'.rstrip('-threshold_elevation')) + self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}'.rstrip('-threshold_elevation')) + self._stimulus_sets = {self.condition: self._stimulus_set, + self.baseline_condition: self._baseline_stimulus_set} + self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}'.rstrip('-threshold_elevation') + '_fit') + + self._metric = load_metric('threshold_elevation', + independent_variable='image_label', + baseline_condition=self.baseline_condition, + test_condition=self.condition, + threshold_accuracy=0.75) + + self._visual_degrees = 2.986667 + self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates + + super(_Malania2007Base, self).__init__( + identifier=f'Malania2007.{condition}', version=1, + ceiling_func=lambda: self._metric.ceiling(self._assemblies), + parent='Malania2007', + bibtex=BIBTEX) + + def __call__(self, candidate: BrainModel): + model_responses = {} + candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli, + number_of_trials=2, require_variance=True) + for condition in (self.baseline_condition, self.condition): + stimulus_set = place_on_screen( + self._stimulus_sets[condition], + target_visual_degrees=candidate.visual_degrees(), + source_visual_degrees=self._visual_degrees + ) + model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials, + require_variance=True) + + raw_score = self._metric(model_responses, self._assemblies) + + # Adjust score to ceiling + ceiling = self.ceiling + score = raw_score / ceiling + + score.attrs['raw'] = raw_score + score.attrs['ceiling'] = ceiling + return score + + +class _Malania2007VernierAcuity(BenchmarkBase): + def __init__(self): + self.baseline_condition = BASELINE_CONDITION + self.conditions = DATASETS.copy() + self.conditions.remove('vernieracuity-threshold') + + self._assemblies = {condition: {'baseline_assembly': self.get_assemblies(condition)['baseline_assembly'], + 'condition_assembly': self.get_assemblies(condition)['condition_assembly']} + for condition in self.conditions} + self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}') + self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(f'Malania2007.{condition}'.rstrip('-threshold_elevation') + '_fit') + for condition in self.conditions} + + self._metric = load_metric('threshold', + independent_variable='image_label', + threshold_accuracy=0.75) + + self._visual_degrees = 2.986667 + self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates + + super(_Malania2007VernierAcuity, 
self).__init__( + identifier=f'Malania2007.vernieracuity-threshold', version=1, + ceiling_func=lambda: self.mean_ceiling(), + parent='Malania2007', + bibtex=BIBTEX) + + def __call__(self, candidate: BrainModel): + scores = [] + for condition in self.conditions: + candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli[condition], + number_of_trials=2, require_variance=True) + stimulus_set = place_on_screen( + self._stimulus_set, + target_visual_degrees=candidate.visual_degrees(), + source_visual_degrees=self._visual_degrees + ) + model_response = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials, + require_variance=True) + + raw_score = self._metric(model_response, self._assemblies[condition]) + # Adjust score to ceiling + ceiling = self.ceiling + score = raw_score / ceiling + score.attrs['error'] = raw_score.error + + score.attrs['raw'] = raw_score + score.attrs['ceiling'] = ceiling + scores.append(score) + # average all scores to get 1 average score + mean_score = Score(np.mean(scores)) + mean_score.attrs['error'] = np.mean([score.error for score in scores]) + return mean_score + + def get_assemblies(self, condition: str): + condition = condition.rstrip('-threshold_elevation') + baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition)) + condition_assembly = LazyLoad(lambda: load_assembly(condition)) + assembly, baseline_assembly = filter_baseline_subjects(condition_assembly, + baseline_assembly) + return {'condition_assembly': assembly, + 'baseline_assembly': baseline_assembly} + + def mean_ceiling(self): + ceilings = [] + errors = [] + for assembly_name in self._assemblies.keys(): + this_ceiling = self._metric.ceiling(self._assemblies[assembly_name]['baseline_assembly']) + ceilings.append(this_ceiling.values) + errors.append(this_ceiling.error) + mean_ceiling = Score(np.mean(ceilings)) + mean_ceiling.attrs['error'] = np.mean(errors) + return mean_ceiling + + +def load_assembly(dataset: str) -> PropertyAssembly: + assembly = brainscore_vision.load_dataset(f'Malania2007.{dataset}') + return assembly + + +def filter_baseline_subjects(condition_assembly: PropertyAssembly, + baseline_assembly: PropertyAssembly + ) -> Tuple[PropertyAssembly, PropertyAssembly]: + """A function to select only the unique subjects that exist in the condition_assembly.""" + non_nan_mask = ~np.isnan(condition_assembly.values) + unique_ids = condition_assembly.coords['subject'][non_nan_mask].values.tolist() + + mask = baseline_assembly.coords['subject'].isin(unique_ids) + filtered_baseline_assembly = baseline_assembly.where(mask, drop=True) + filtered_condition_assembly = condition_assembly.where(mask, drop=True) + return filtered_condition_assembly, filtered_baseline_assembly diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py new file mode 100644 index 000000000..8a8ca75cd --- /dev/null +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest +from pytest import approx + +from brainscore_vision import benchmark_registry, load_benchmark, load_model +from brainscore_vision.benchmarks.malania2007.benchmark import DATASETS + + +class TestBehavioral: + def test_count(self): + assert len(DATASETS) == 5 + 2 + 2 + 1 + + @pytest.mark.parametrize('dataset', DATASETS) + def test_in_pool(self, dataset): + identifier = f"Malania2007.{dataset}" + assert identifier in benchmark_registry + + @pytest.mark.private_access + def 
test_mean_ceiling(self): + benchmarks = [f"Malania2007.{dataset}" for dataset in DATASETS] + benchmarks = [load_benchmark(benchmark) for benchmark in benchmarks] + ceilings = [benchmark.ceiling for benchmark in benchmarks] + mean_ceiling = np.mean(ceilings) + assert mean_ceiling == approx(0.5757928329186803, abs=0.001) + + # these test values are for the pooled score ceiling + @pytest.mark.private_access + @pytest.mark.parametrize('dataset, expected_ceiling', [ + ('short2-threshold_elevation', approx(0.78719345, abs=0.001)), + ('short4-threshold_elevation', approx(0.49998989, abs=0.001)), + ('short6-threshold_elevation', approx(0.50590051, abs=0.001)), + ('short8-threshold_elevation', approx(0.4426336, abs=0.001)), + ('short16-threshold_elevation', approx(0.8383443, abs=0.001)), + ('equal2-threshold_elevation', approx(0.56664015, abs=0.001)), + ('long2-threshold_elevation', approx(0.46470421, abs=0.001)), + ('equal16-threshold_elevation', approx(0.44087153, abs=0.001)), + ('long16-threshold_elevation', approx(0.50996587, abs=0.001)), + ('vernieracuity-threshold', approx(0.70168481, abs=0.001)) + ]) + def test_dataset_ceiling(self, dataset, expected_ceiling): + benchmark = f"Malania2007.{dataset}" + benchmark = load_benchmark(benchmark) + ceiling = benchmark.ceiling + assert ceiling == expected_ceiling + + @pytest.mark.parametrize('dataset, expected_score', [ + ('short2-threshold_elevation', approx(0.0, abs=0.001)), + ('short4-threshold_elevation', approx(0.0, abs=0.001)), + ('short6-threshold_elevation', approx(0.0, abs=0.001)), + ('short8-threshold_elevation', approx(0.0, abs=0.001)), + ('short16-threshold_elevation', approx(0.0, abs=0.001)), + ('equal2-threshold_elevation', approx(0.0, abs=0.001)), + ('long2-threshold_elevation', approx(0.0, abs=0.001)), + ('equal16-threshold_elevation', approx(0.0, abs=0.001)), + ('long16-threshold_elevation', approx(0.0, abs=0.001)), + ('vernieracuity-threshold', approx(0.0, abs=0.001)) + ]) + def test_model_score(self, dataset, expected_score): + benchmark = f"Malania2007.{dataset}" + benchmark = load_benchmark(benchmark) + model = load_model('alexnet') + model_score = benchmark(model) + assert model_score.values == expected_score diff --git a/brainscore_vision/data/malania2007/__init__.py b/brainscore_vision/data/malania2007/__init__.py new file mode 100644 index 000000000..e6ecbb5cd --- /dev/null +++ b/brainscore_vision/data/malania2007/__init__.py @@ -0,0 +1,254 @@ +from brainio.assemblies import PropertyAssembly + +from brainscore_vision import data_registry, stimulus_set_registry, load_stimulus_set +from brainscore_vision.data_helpers.s3 import load_assembly_from_s3, load_stimulus_set_from_s3 + + +BIBTEX = """@article{malania2007, + author = {Malania, Maka and Herzog, Michael H. 
and Westheimer, Gerald}, + title = "{Grouping of contextual elements that affect vernier thresholds}", + journal = {Journal of Vision}, + volume = {7}, + number = {2}, + pages = {1-1}, + year = {2007}, + issn = {1534-7362}, + doi = {10.1167/7.2.1}, + url = {https://doi.org/10.1167/7.2.1} + }""" + + +data_registry['Malania2007.equal2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_equal-2', + version_id="yFXK8xjGjEmuYTSfS58rGS_ah3.NGg0X", + sha1="277b2fbffed00e16b6a69b488f73eeda5abaaf10", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.equal16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_equal-16', + version_id="SRZ7bs.Ek59GkeS084Pvdy38uTzFs4yw", + sha1="ef49506238e8d2554918b113fbc60c133077186e", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.long2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_long-2', + version_id="2c1lWuXthb3rymB3seTQX1jVqiKUTn1f", + sha1="9076a5b693948c4992b6c8e753f04a7acd2014a1", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.long16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_long-16', + version_id="qshNxhxjgusWyWiXnbfFN6gqjLgRh8fO", + sha1="3106cf1f2fa9e66617ebf231df05d29077fc478f", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-2', + version_id="8CQ9MupuljAgkkKUXs3hiOliHg8xoDxb", + sha1="85fb65ad76de48033c704b9c5689771e1ea0457d", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short4'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-4', + version_id=".ZUO0upSfQrWLPgd4oGwAaCbN4bz6S6H", + sha1="75506be9a26ec38a223e41510f1a8cb32d5b0bc9", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short6'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-6', + version_id="q4FugpNGkT_FQP..qIVzye83hAQR2xfS", + sha1="2901be6b352e67550da040d79d744819365b8626", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short8'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-8', + version_id="4_lcRl_I7Mp0RHxcfqZ9tkAZjVh.5oMU", + sha1="6daf47b086cb969d75222e320f49453ed8437885", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-16', + version_id="fFqEIyIC9CHzqTEmv0MitjCgpeMX5pxJ", + sha1="8ae0898caad718b747f85fce5888416affc3a569", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.vernier_only'] = lambda: load_assembly_from_s3( + identifier='Malania2007_vernier-only', + version_id="JLWf2pIR_UadQHqwtegJkC6XzWdbSNGi", + sha1="1cf83e8b6141f8b0d67ea46994f342325f62001f", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) + + +stimulus_set_registry['Malania2007.equal2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-2', + bucket="brainio-brainscore", + csv_sha1="77e94b9b5122a83ebbaffb4a06fcab68ef652751", + zip_sha1="99826d459f6920dafab72eed69eb2a90492ce796", + csv_version_id="MlRpSz.4.jvVRFAZl8tGEum1P0Q0GtyS", + 
zip_version_id="vHbAM_FjTbjp5U12BkAelJu4KW6PLYFn" +) +stimulus_set_registry['Malania2007.equal2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-2_fit', + bucket="brainio-brainscore", + csv_sha1="bafdfc855c164d3e5443d67dcf9eb7762443f964", + zip_sha1="e52fec1a79ac8837e331b180c2a8a140840d6666", + csv_version_id="PIXEW.2vHvjIBP0Q2KHIpnxns7t9o8Cf", + zip_version_id="h7pp84CYFGLKlPhveD0L5ogePqisk_I7" +) +stimulus_set_registry['Malania2007.equal16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-16', + bucket="brainio-brainscore", + csv_sha1="5fedcff56c302339c3451ae2edbcb846c39c3189", + zip_sha1="b30dc2dc90e4f3d88775622e558db963765f38e0", + csv_version_id="VmRGiQkhPALDwq74NpE2VpTiKTGn.30T", + zip_version_id="c.DOlVULXZingRJ9gVY_NbZwRrj_xs_i" +) +stimulus_set_registry['Malania2007.equal16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-16_fit', + bucket="brainio-brainscore", + csv_sha1="3de3e5de19a638767a01ba68cb690dc746c29a77", + zip_sha1="1728920c5ea4fb7b3a3cf3c076165aca65c8b751", + csv_version_id="joAq8JBC_7axZDfLNFgoXFhTCLU_KKr_", + zip_version_id="77JRwdldaHDr6TLW1NnB5HucIrkUCVg." +) +stimulus_set_registry['Malania2007.long2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-2', + bucket="brainio-brainscore", + csv_sha1="ba65316a63dc688d8dfb410219a28fd02850b991", + zip_sha1="7fd431fbbd4a4dc0cd271624d3297c19a28a70b5", + csv_version_id="_0fqObn6k5KvXurHMsuD4IqtrqbNskyo", + zip_version_id="foL92ndVAAAETzMYHdmMtwIwKxXYhAB." +) +stimulus_set_registry['Malania2007.long2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-2_fit', + bucket="brainio-brainscore", + csv_sha1="b91dd9261c1d47bdd37f9b60eb8066b7b719709f", + zip_sha1="5be3e1cd57b59081103715b5d318505166e0045e", + csv_version_id="mATh8lcVisdsDnPnU6ACE23iBPfpkLZA", + zip_version_id="6nEviShTyCYQKrmxyjDyNov9Skc77eXT" +) +stimulus_set_registry['Malania2007.long16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-16', + bucket="brainio-brainscore", + csv_sha1="1f1b03319b81698ba5e7db389dcd4248f94e45ca", + zip_sha1="97c70462a28905b58058c687880188d634d357f0", + csv_version_id="4RtywQ40hfQA4N80g8lxEScAmMXFRg7E", + zip_version_id="lJy2QosABzHtiA6BJaE4OqCn1w1Jhz2k" +) +stimulus_set_registry['Malania2007.long16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-16_fit', + bucket="brainio-brainscore", + csv_sha1="d80a02c75b9908301c3c8dc9f7116fecf8e060ec", + zip_sha1="d8819b94d3f502d7a382c8a0db0a34627132e5e2", + csv_version_id="gOxY6tjnT7LO.FDeL1xkRmowl5wYeAia", + zip_version_id="71UAPTnZscIuqdx2dhuW9V0O0DO_TgTM" +) +stimulus_set_registry['Malania2007.short2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-2', + bucket="brainio-brainscore", + csv_sha1="bf0252056d2084e855646f624700ab03c19cfc3d", + zip_sha1="eee1270feb7443e7e315d8feb7fb0a6b6908f554", + csv_version_id="zcJqM.ZPwJyiMRWa3RBdvv401yPnLQAp", + zip_version_id="C8WZzAAQ0JGHAAKii4JpvlRhcUOhgSj." 
+) +stimulus_set_registry['Malania2007.short2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-2_fit', + bucket="brainio-brainscore", + csv_sha1="73127d279a2cd254ae4f07b0053580851e84b00c", + zip_sha1="918736349d714a4f784c29bf7e7d218b103e128d", + csv_version_id="iwGRp3_ktAHfJ6r7ktSK9gsthDjKek70", + zip_version_id="6RpplJ9UVXTlvhmFSXla0Qa20b44m8Ds" +) +stimulus_set_registry['Malania2007.short4'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-4', + bucket="brainio-brainscore", + csv_sha1="816326d89d358f6592bd1f789e5c8d429fbca1cd", + zip_sha1="ff57d976ef75ede9148a4097e90d6cf6c8054d34", + csv_version_id="Waikk.bktXIncCUtCIAyB2EqynGk.H.F", + zip_version_id="rl_muxI4UEpwXVaXuhsqroG..COGILvR" +) +stimulus_set_registry['Malania2007.short4_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-4_fit', + bucket="brainio-brainscore", + csv_sha1="3512cfd029f4e4299bc41ede519e691d80cfc3d5", + zip_sha1="301386408dd1fb8556881f9a171be2d43dbfec6e", + csv_version_id="UhisdJqiEmkQ_4zsUtAmaxtle2kMZdcD", + zip_version_id="xt_v0xgCB8YUptyPB0yZFHIUcel5MF_x" +) +stimulus_set_registry['Malania2007.short6'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-6', + bucket="brainio-brainscore", + csv_sha1="3d5dd9b48a56ba0c31de94b6221b97df962b6f8a", + zip_sha1="120d90a143d1577d4745c3f69291d0db6c7e512e", + csv_version_id="GwGHPJkMDdg8N_.boyj8qJ3ChsEx4w._", + zip_version_id="gIN1O4yz.THvK0Ifm5M3AI58ZACE1QFh" +) +stimulus_set_registry['Malania2007.short6_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-6_fit', + bucket="brainio-brainscore", + csv_sha1="27a5be4fca190837fc5b75ed2cdbbffbf6b41338", + zip_sha1="c88e05c6cadec88a2c9475b0735323a2b049bd75", + csv_version_id="oMlj7wV85s00hJFE84ym0AJHLCfYHVA6", + zip_version_id="oS.KrBTlcYAgr_lWyA_bIjVc2js_VeUe" +) +stimulus_set_registry['Malania2007.short8'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-8', + bucket="brainio-brainscore", + csv_sha1="8fc35f607196b4c0cdcebd8102d17e3a637e5988", + zip_sha1="a9215ed0cb0f0333582dda65f6afd7015c506ba5", + csv_version_id="gzys8s7j7euMEl7JJpqBFLFHMpFjwbA7", + zip_version_id="3fYb4Iruh3lRKUwC1APqFH4CNbE5DEuk" +) +stimulus_set_registry['Malania2007.short8_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-8_fit', + bucket="brainio-brainscore", + csv_sha1="aa4133a9fe19a3c9004a9cb5e6eb5a72564e4883", + zip_sha1="beb9f068794708e41750202b78c438538a40a8fb", + csv_version_id="7N1Z.uiagqBknJUSBQ4mVfHKWgocM5aA", + zip_version_id="kcEOPOkvWymO0wX5j_QKxcNPl9sZsjFd" +) +stimulus_set_registry['Malania2007.short16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-16', + bucket="brainio-brainscore", + csv_sha1="addd260c9959f2f315db03c0a39c6c1b01fef685", + zip_sha1="cba4c2866ec692fb808471df7c2fed446d9fb3fe", + csv_version_id="Peu7WU5vanLoZNOFIAbuPzZNPDRgbCSX", + zip_version_id="wFkJkZMC8Fs_HfPJy32CMKcHJWeQIUDB" +) +stimulus_set_registry['Malania2007.short16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-16_fit', + bucket="brainio-brainscore", + csv_sha1="9b340fe242117482f6992f48a805297215ba9924", + zip_sha1="4a90d511a3ceb3307a672177a3ad6b76521e65e5", + csv_version_id="sYBPEmXDgbWipuepciLirlorQE3L8BLc", + zip_version_id="pYvOkrLxadkQ67K3__wmciNwaCW.hyyN" +) +stimulus_set_registry['Malania2007.vernier_only'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_vernier-only', + bucket="brainio-brainscore", + 
csv_sha1="b2cb0f2ed32426b739f90187ae24ad4adf84110d", + zip_sha1="0e177aea523adc320070196fbb777af4cdba2144", + csv_version_id="c8wpZpqoMqdATlqdoq3srPUi_8fYg6a.", + zip_version_id="28lHgxERhw32Ux6IBCxWWTtRwIaRrwo6" +) diff --git a/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py new file mode 100644 index 000000000..091ac3fa6 --- /dev/null +++ b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py @@ -0,0 +1,79 @@ +from pathlib import Path +import numpy as np +import xarray as xr + +from brainio.assemblies import PropertyAssembly +from brainio.packaging import package_data_assembly +import pandas as pd + + +DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only'] +NUM_SUBJECTS = {'short-2': 6, + 'short-4': 5, + 'short-6': 5, + 'short-8': 5, + 'short-16': 6, + 'equal-2': 5, + 'long-2': 5, + 'equal-16': 5, + 'long-16': 5, + 'vernier-only': 6} + + +def collect_malania_data_assembly(root_directory, dataset): + """ + Experiment Information: + - 5-6 observers per condition (for exact value, see NUM_SUBJECTS) + - 2AFC left/right offset discrimination task + - PEST staircase to 75% correct responses + - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit + """ + # construct the assembly + metadata_directory = Path(f'{root_directory}/{dataset}/metadata_human.xlsx') + metadata = pd.read_excel(metadata_directory) + # Since subjects are uniquely held using 'unique_subject_id', drop the rows with a subject + # without measurement + assembly = PropertyAssembly(metadata['threshold'], + coords={ + 'subject': ('subject', metadata['subject_unique_id']), + }, + dims=['subject'] + ) + + # give the assembly an identifier name + assembly.name = f'Malania2007_{dataset}' + + # test subject numbers after removing the NaN subject + metadata = metadata.dropna(subset=['threshold'], axis=0) + assert len(metadata) == NUM_SUBJECTS[dataset] + + return assembly + + +def remove_subjects_with_nans(assembly1, assembly2): + # Find the indices of the subjects with NaN values in the first PropertyAssembly + nan_subjects = np.isnan(assembly1.values) + + # Convert the boolean array to a DataArray with the same coordinates as the input assemblies + nan_subjects_da = xr.DataArray(nan_subjects, coords=assembly1.coords, dims=assembly1.dims) + + # Filter out the subjects with NaN values from both PropertyAssemblies + filtered_assembly1 = assembly1.where(~nan_subjects_da, drop=True) + filtered_assembly2 = assembly2.where(~nan_subjects_da, drop=True) + + return filtered_assembly1, filtered_assembly2 + + +if __name__ == '__main__': + root_directory = Path(r'.') + for dataset in DATASETS: + assembly = collect_malania_data_assembly(root_directory, dataset) + # upload to S3 + prints = package_data_assembly(catalog_identifier=None, + proto_data_assembly=assembly, + assembly_identifier=assembly.name, + stimulus_set_identifier=assembly.name, + assembly_class_name="PropertyAssembly", + bucket_name="brainio-brainscore") + print(prints) diff --git a/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py new file mode 100644 index 000000000..8a9f63fde --- /dev/null +++ b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py @@ -0,0 +1,79 @@ +import csv +from pathlib import Path +from brainio.stimuli 
import StimulusSet +from brainio.packaging import package_stimulus_set + + +# every stimulus set is separate, incl. baseline condition +STIMULUS_SETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only', 'short-2_fit', + 'short-4_fit', 'short-6_fit', 'short-8_fit', 'short-16_fit', + 'equal-2_fit', 'long-2_fit', 'equal-16_fit', 'long-16_fit'] +DATASET_LENGTHS = {'test': 50, 'fit': 500} + + +def collect_malania_stimulus_set(root_directory, dataset): + """ + Dataset Meta Info + + Reported in pixels: + - image_size_x; image_size_y + - vernier_position_x; vernier_position_y + + Reported in arcsec: + - vernier_height (height of the vernier elements combined, - middle gap) + - vernier_offset (horizontal offset between flankers) + - flanker_height (height of the flanker elements) + - flanker_spacing (distance between a flanker element and another flanker element) + - line_width (width of all the lines in all elements) + - flanker_distance (distance between a flanker and a vernier) + """ + stimuli = [] + stimulus_paths = {} + + dataset_type = 'fit' if dataset[-3:] == 'fit' else 'test' + metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') + image_directory = Path(f'{root_directory}/{dataset}/images') + with open(metadata_directory, 'r') as metadata: + reader = csv.DictReader(metadata) + for row in reader: + stimuli.append({ + 'image_size_x_pix': int(row['image_size_x_pix']), + 'image_size_y_pix': int(row['image_size_y_pix']), + 'image_size_c': int(row['image_size_c']), + 'image_size_degrees': float(row['image_size_degrees']), + 'vernier_height_arcsec': float(row['vernier_height_arcsec']), + 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), + 'image_label': row['image_label'], + 'flanker_height_arcsec': float(row['flanker_height_arcsec']), + 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), + 'line_width_arcsec': float(row['line_width_arcsec']), + 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), + 'num_flankers': int(row['num_flankers']), + 'vernier_position_x_pix': int(row['vernier_position_x_pix']), + 'vernier_position_y_pix': int(row['vernier_position_y_pix']), + 'stimulus_id': str(row['stimulus_id']), + }) + stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') + + stimuli = StimulusSet(stimuli) + stimuli.stimulus_paths = stimulus_paths + stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name + stimuli.identifier = f'Malania2007_{dataset}' + + # Ensure expected number of stimuli in datasets + assert len(stimuli) == DATASET_LENGTHS[dataset_type] + return stimuli + + +if __name__ == '__main__': + root_directory = Path(r'../data/malania2007/data_packaging/') + for stimulus_set in STIMULUS_SETS: + stimuli = collect_malania_stimulus_set(root_directory, stimulus_set) + + # upload to S3 + prints = package_stimulus_set(catalog_name=None, + proto_stimulus_set=stimuli, + stimulus_set_identifier=stimuli.name, + bucket_name="brainio-brainscore") + print(prints) diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py new file mode 100644 index 000000000..faf216749 --- /dev/null +++ b/brainscore_vision/data/malania2007/test.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest + +from brainscore_vision import load_stimulus_set, load_dataset +from brainscore_vision.benchmarks.malania2007.benchmark import DATASETS + + +@pytest.mark.private_access +class TestAssemblies: + # test the 
number of subjects: + @pytest.mark.parametrize('identifier, num_subjects', [ + ('short2', 6), + ('short4', 5), + ('short6', 5), + ('short8', 5), + ('short16', 6), + ('equal2', 5), + ('long2', 5), + ('equal16', 5), + ('long16', 5), + ('vernier_only', 6) + ]) + def test_num_subjects(self, identifier, num_subjects): + assembly = load_dataset(f"Malania2007.{identifier}") + assembly = assembly.dropna(dim='subject') + assert len(np.unique(assembly['subject'].values)) == num_subjects + + # test assembly coords present in ALL 17 sets: + @pytest.mark.parametrize('identifier', [ + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'vernier_only', + ]) + @pytest.mark.parametrize('field', [ + 'subject' + ]) + def test_fields_present(self, identifier, field): + assembly = load_dataset(f"Malania2007.{identifier}") + assert hasattr(assembly, field) + + +@pytest.mark.private_access +class TestStimulusSets: + # test stimulus_set data: + @pytest.mark.parametrize('identifier', [ + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'short2_fit', + 'short4_fit', + 'short6_fit', + 'short8_fit', + 'short16_fit', + 'equal2_fit', + 'long2_fit', + 'equal16_fit', + 'long16_fit', + 'vernier_only' + ]) + def test_stimulus_set_exist(self, identifier): + full_name = f"Malania2007.{identifier}" + stimulus_set = load_stimulus_set(full_name) + assert stimulus_set is not None + stripped_actual_identifier = stimulus_set.identifier.replace('.', '').replace('_', '').replace('-', '') + stripped_expected_identifier = full_name.replace('.', '').replace('_', '').replace('-', '') + assert stripped_actual_identifier == stripped_expected_identifier + + @pytest.mark.parametrize('identifier, num_images', [ + ('short2', 50), + ('short4', 50), + ('short6', 50), + ('short8', 50), + ('short16', 50), + ('equal2', 50), + ('long2', 50), + ('equal16', 50), + ('long16', 50), + ('short2_fit', 500), + ('short4_fit', 500), + ('short6_fit', 500), + ('short8_fit', 500), + ('short16_fit', 500), + ('equal2_fit', 500), + ('long2_fit', 500), + ('equal16_fit', 500), + ('long16_fit', 500), + ('vernier_only', 50) + ]) + def test_number_of_images(self, identifier, num_images): + stimulus_set = load_stimulus_set(f"Malania2007.{identifier}") + assert len(np.unique(stimulus_set['stimulus_id'].values)) == num_images + + # tests stimulus_set coords for the 14 "normal" sets: + @pytest.mark.parametrize('identifier', [ + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'short2_fit', + 'short4_fit', + 'short6_fit', + 'short8_fit', + 'short16_fit', + 'equal2_fit', + 'long2_fit', + 'equal16_fit', + 'long16_fit', + 'vernier_only' + ]) + @pytest.mark.parametrize('field', [ + 'image_size_x_pix', + 'image_size_y_pix', + 'image_size_c', + 'image_size_degrees', + 'vernier_height_arcsec', + 'vernier_offset_arcsec', + 'image_label', + 'flanker_height_arcsec', + 'flanker_spacing_arcsec', + 'line_width_arcsec', + 'flanker_distance_arcsec', + 'num_flankers', + 'vernier_position_x_pix', + 'vernier_position_y_pix', + 'stimulus_id', + ]) + def test_fields_present(self, identifier, field): + stimulus_set = load_stimulus_set(f"Malania2007.{identifier}") + assert hasattr(stimulus_set, field) diff --git a/brainscore_vision/metrics/threshold/__init__.py b/brainscore_vision/metrics/threshold/__init__.py new file mode 100644 index 000000000..69f6102e8 --- /dev/null +++ 
b/brainscore_vision/metrics/threshold/__init__.py
@@ -0,0 +1,5 @@
+from brainscore_vision import metric_registry
+from .metric import Threshold, ThresholdElevation
+
+metric_registry['threshold'] = Threshold
+metric_registry['threshold_elevation'] = ThresholdElevation
diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py
new file mode 100644
index 000000000..2b04270e3
--- /dev/null
+++ b/brainscore_vision/metrics/threshold/metric.py
@@ -0,0 +1,482 @@
+from typing import Dict, Union, Tuple, List, Optional, Callable
+
+import numpy as np
+from scipy.optimize import minimize
+from scipy.stats import norm
+from sklearn.metrics import r2_score
+
+from brainscore_core.metrics import Metric, Score
+from brainio.assemblies import PropertyAssembly, BehavioralAssembly
+
+
+def psychometric_cum_gauss(x: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> np.array:
+    """
+    The classic psychometric function as implemented in Wichmann & Hill (2001), 'The psychometric function: I.
+    Fitting, sampling, and goodness of fit', eq. 1.
+
+    :param x: the independent variables of the data
+    :param alpha: the slope parameter
+    :param beta: the mean of the cdf parameter
+    :param lambda_: the lapse rate
+    :param gamma: the lower bound of the fit
+
+    :return: the psychometric function values for the given parameters evaluated at `x`.
+    """
+    return gamma + (1 - gamma - lambda_) * norm.cdf(alpha * (x - beta))
+
+
+def inverse_psychometric_cum_gauss(y: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> np.array:
+    """The inverse of psychometric_cum_gauss."""
+    return beta + (norm.ppf((y - gamma) / (1 - gamma - lambda_)) / alpha)
+
+
+def cum_gauss_neg_log_likelihood(params: Tuple[float, ...], x: np.array, y: np.array) -> float:
+    """The negative log likelihood function for psychometric_cum_gauss."""
+    alpha, beta, lambda_ = params
+    p = psychometric_cum_gauss(x, alpha, beta, lambda_)
+    log_likelihood = y * np.log(p) + (1 - y) * np.log(1 - p)
+    return -np.sum(log_likelihood)
+
+
+def get_predicted(params: Tuple[float, ...], x: np.array, fit_fn: Callable) -> np.array:
+    """Returns the predicted values based on the model parameters."""
+    return fit_fn(x, *params)
+
+
+def grid_search(x: np.array,
+                y: np.array,
+                alpha_values: np.array = np.logspace(-3, 1, 50),
+                beta_values: np.array = None,
+                fit_fn: Callable = psychometric_cum_gauss,
+                fit_log_likelihood_fn: Callable = cum_gauss_neg_log_likelihood,
+                fit_bounds: Tuple = ((None, None), (None, None), (0.03, 0.5))
+                ) -> Tuple[Tuple[float, ...], float]:
+    """
+    A classic, simplified procedure for running a sparse grid search over the slope and mean parameters of the
+    psychometric function.
+    This function is implemented here instead of using sklearn.GridSearchCV since we would have to write a custom
+    sklearn estimator class to use GridSearchCV with psychometric functions, likely increasing code bloat
+    substantially.
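+
+    Example (illustrative; x holds measured values of the independent variable, y the accuracy at each x):
+
+        (alpha, beta, lambda_), r2 = grid_search(x=np.array([5., 10., 20., 40.]),
+                                                 y=np.array([0.55, 0.65, 0.80, 0.92]))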
+
+    :param x: the independent variables of the data
+    :param y: the measured accuracy rates for the given x-values
+    :param alpha_values: the alpha values for the chosen fit function to grid search over
+    :param beta_values: the beta values for the chosen fit function to grid search over
+    :param fit_fn: the psychometric function that is fit
+    :param fit_log_likelihood_fn: the log likelihood function that computes the log likelihood of its corresponding
+        fit function
+    :param fit_bounds: the bounds assigned to the fit function called by fit_log_likelihood_fn.
+        The default fit_bounds are assigned as:
+            alpha: (None, None), to allow any slope
+            beta: (None, None), any inflection point is allowed, as that is controlled for in the
+                  Threshold class
+            lambda_: (0.03, 0.5), to require at least a small lapse rate, as is regularly done when
+                  fitting human data
+
+    :return: the parameters of the best fit in the grid search
+    """
+    assert len(x) == len(y)
+    # Default the beta_values grid search to the measured x-points.
+    if beta_values is None:
+        beta_values = x
+
+    # initialize best values for a fit
+    best_alpha, best_beta, best_lambda = None, None, None
+    min_neg_log_likelihood = np.inf
+
+    for alpha_guess in alpha_values:
+        for beta_guess in beta_values:
+            initial_guess = np.array([alpha_guess, beta_guess, 1 - np.max(y)])  # lapse rate guess: one minus the maximum accuracy
+
+            # wrap in a try/except block to catch the RuntimeError thrown by scipy.minimize if a fit
+            # entirely fails. The case where all fits fail here is handled by the Threshold metric.
+            try:
+                result = minimize(fit_log_likelihood_fn, initial_guess, args=(x, y),
+                                  method='L-BFGS-B', bounds=fit_bounds)
+                alpha_hat, beta_hat, lambda_hat = result.x
+                neg_log_likelihood_hat = fit_log_likelihood_fn([alpha_hat, beta_hat, lambda_hat], x, y)
+
+                if neg_log_likelihood_hat < min_neg_log_likelihood:
+                    min_neg_log_likelihood = neg_log_likelihood_hat
+                    best_alpha, best_beta, best_lambda = alpha_hat, beta_hat, lambda_hat
+            except RuntimeError:
+                pass
+
+    if best_alpha is None:  # every fit in the grid search failed; signal failure via None params and -inf R^2
+        return (None, None, None), -np.inf
+    y_pred = fit_fn(x, best_alpha, best_beta, best_lambda)
+    r2 = r2_score(y, y_pred)  # R^2 of the fit
+    return (best_alpha, best_beta, best_lambda), r2
+
+
+class Threshold(Metric):
+    """
+    Computes a psychometric threshold function from model responses and compares against human-computed psychometric
+    thresholds.
+
+    The model comparison to human data is currently individual-subject based, i.e., a model (and the ceiling) is
+    scored as the mean distance of its threshold to each individual human threshold.
+    """
+    def __init__(self,
+                 independent_variable: str,
+                 fit_function=psychometric_cum_gauss,
+                 fit_inverse_function=inverse_psychometric_cum_gauss,
+                 threshold_accuracy: Union[str, float] = 'inflection',
+                 required_accuracy: Optional[float] = 0.6
+                 ):
+        """
+        :param independent_variable: The independent variable in the benchmark that the threshold is computed
+                                     over.
+        :param fit_function: The function used to fit the threshold.
+        :param fit_inverse_function: The inverse of fit_function used to find the threshold from the fit.
+        :param threshold_accuracy: The accuracy at which the threshold is evaluated. This can be
+                                   either a string Literal['inflection'] or a float. When Literal['inflection']
+                                   is used, the function finds the inflection point of the curve and evaluates
+                                   the threshold at that level. When a float is used, the function evaluates
+                                   the threshold at that level.
+        :param required_accuracy: The minimum accuracy required for the psychometric function fit to be considered.
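+
+        Example (illustrative; 'image_label' mirrors this metric's usage in the Malania2007 benchmark):
+
+            metric = Threshold(independent_variable='image_label', threshold_accuracy=0.75)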
+ """ + self.fit_function = fit_function + self.fit_inverse_function = fit_inverse_function + self._independent_variable = independent_variable + self.threshold_accuracy = threshold_accuracy + self.required_accuracy = required_accuracy + + def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, PropertyAssembly]) -> Score: + """ + :param source: Either a BehavioralAssembly containing model responses to individual stimuli, or a pre-computed + threshold as a float. + :param target: Either a list containing human thresholds (for the ceiling function & ThresholdElevation), + or a PropertyAsssembly. + :return: A Score containing the evaluated model's distance to target thresholds. + """ + # compute threshold from measurements if the input is not a threshold already + if isinstance(source, float): + source_threshold = source + elif isinstance(source, BehavioralAssembly): + source_threshold = self.compute_threshold(source, self._independent_variable) + # check whether the psychometric function fit was successful - if not, return a score of 0 + if source_threshold == 'fit_fail': + score = Score(0.) + score.attrs['error'] = 0. + return score + else: + raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') + return self.scoring_function(source_threshold, target) + + def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]]) -> Score: + """ + Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against + a pool of the other values. + + Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ + jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. + + :param assembly: the human PropertyAssembly containing human responses, or a dict containing the + PropertyAssemblies of the ThresholdElevation metric. 
+ :return: Score object with coords center (ceiling) and error (STD) + """ + human_thresholds: list = assembly.values.tolist() + scores = [] + for i in range(len(human_thresholds)): + random_state = np.random.RandomState(i) + random_human_score = random_state.choice(human_thresholds, replace=False) + metric = Threshold(self._independent_variable, self.fit_function, self.fit_inverse_function, + self.threshold_accuracy) + human_thresholds.remove(random_human_score) + score = metric(random_human_score, human_thresholds) + human_thresholds.append(random_human_score) + scores.append(score.values) + + ceiling, ceiling_error = np.mean(scores), np.std(scores) + ceiling = Score(ceiling) + ceiling.attrs['error'] = ceiling_error + return ceiling + + def compute_threshold(self, source: BehavioralAssembly, independent_variable: str) -> Union[float, str]: + """Converts the source BehavioralAssembly to a threshold float value.""" + assert len(source.values) == len(source[independent_variable].values) + + x_points = source[independent_variable].values + accuracies = self.convert_proba_to_correct(source) + if np.mean(accuracies) < self.required_accuracy: + print('Psychometric threshold fit failure due to low accuracy.') + fit_params = 'fit_fail' + else: + fit_params, measurement_max = self.fit_threshold_function(x_points, accuracies) + if (type(fit_params) == str) and (fit_params == 'fit_fail'): + return fit_params + + if self.threshold_accuracy == 'inflection': + self.threshold_accuracy = self.inflection_accuracy(x_points, fit_params) + + threshold = self.find_threshold(self.threshold_accuracy, fit_params) + + # check whether the fit is outside the measured model responses to discard spurious thresholds + if (threshold > measurement_max) or np.isnan(threshold): + print('Fit fail because threshold is outside of the measured range of responses.') + return 'fit_fail' + return threshold + + def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Union[np.array, str]: + """ + A function that takes the x and y-points of the measured variable and handles the fitting of the + psychometric threshold function. + + Returns either the fit parameters for self.fit_function or a string tag that indicates the failure + of the psychometric curve fit. + """ + x_points, y_points = self.aggregate_psychometric_fit_data(x_points, y_points) + aggregated_x_points, aggregated_y_points, at_least_third_remaining = self.remove_data_after_asymptote(x_points, + y_points) + measurement_max = np.max(aggregated_x_points) + if not at_least_third_remaining: + # This failure indicates that there is too little data to accurately fit the psychometric function. + print('Psychometric curve fit fail because performance is decreasing with the independent variable.') + return 'fit_fail', measurement_max + + params, r2 = grid_search(aggregated_x_points, aggregated_y_points) + + # if all the fits in the grid search failed, there will be a None value in params. In this case, we reject + # the fit. This typically only ever happens when a model outputs one value for all test images. + if None in params: + params = 'fit_fail' + + # remove fits to random data. This choice is preferred over a chi^2 test since chi^2 discards a lot of fits + # that would be acceptable in a human case. 
+ if r2 < 0.4: + print('Fit fail due to low fit R^2.') + params = 'fit_fail' + + return params, measurement_max + + def find_threshold(self, threshold_accuracy: float, fit_params: Tuple[float, ...]) -> float: + """ + A function that uses the inverse fit function to find the value of the threshold in terms of + the independent variable (self._independent_variable). + """ + threshold = self.fit_inverse_function(threshold_accuracy, *fit_params) + return threshold + + def inflection_accuracy(self, x_points: np.array, fit_params: np.array) -> float: + """ + A function that finds the accuracy at the inflection point of the fit function. Useful if you do not care + about the specific threshold accuracy, but rather about e.g. the elevation at the inflection point. + """ + max_fit_accuracy = self.fit_function(np.max(x_points), *fit_params) + min_fit_accuracy = self.fit_function(np.min(x_points), *fit_params) + threshold_accuracy = min_fit_accuracy + (max_fit_accuracy - min_fit_accuracy) / 2 + return threshold_accuracy + + @staticmethod + def aggregate_psychometric_fit_data(x_points, y_points): + unique_x = np.unique(x_points) + correct_rate = np.zeros(len(unique_x)) + + for i, x in enumerate(unique_x): + trials = np.sum(x_points == x) + correct_trials = np.sum((x_points == x) & (y_points == 1)) + correct_rate[i] = correct_trials / trials + + return unique_x, correct_rate + + @staticmethod + def scoring_function(source: float, target: Union[list, PropertyAssembly]) -> Score: + """ + Computes the average distance of the source from each of the individual targets in units of the + individual targets. This is generally a more stringent scoring method than pool_score, aimed + to measure the average of the individual target effects. + """ + raw_scores = [] + for target_value in target: + # This score = 0 when the source exceeds target_value by 100% + raw_score = max((1 - ((np.abs(target_value - source)) / target_value)), 0) + raw_scores.append(raw_score) + + scores_mean, scores_std = np.mean(raw_scores), np.std(raw_scores) + score = Score(scores_mean) + score.attrs['raw'] = raw_scores + score.attrs['error'] = scores_std + return score + + @staticmethod + def convert_proba_to_correct(source: BehavioralAssembly) -> np.array: + """Converts the probability values returned by models doing probability tasks to behavioral choices.""" + decisions = np.argmax(source.values, axis=1) + correct = [] + for presentation, decision in enumerate(decisions): + if source['choice'].values[decision] == source['image_label'].values[presentation]: + correct.append(1) + else: + correct.append(0) + return np.array(correct) + + @staticmethod + def remove_data_after_asymptote(x_values, y_values): + """ + A function that removes all data after the point at which all values of the measured variable are 1 standard + deviation less than the maximum. + + This is done to simulate the procedure in which an experimenter fine-tunes the stimuli in a pilot experiment + to the given system (e.g., humans) such that they only measure data in a region within which the psychometric + fit is monotone (as per the function fit assumption). When this assumption is violated, the function fit + is not a valid measure of the underlying performance function. + + There are circumstances in which this behavior is expected (e.g., crowding). When e.g. 
a vernier element's
+        offset is increased enough, the task may paradoxically become more difficult: the offset grows so large
+        that the relevant elements no longer fall within a spatially relevant window, or group more strongly with
+        the flankers than with each other due to the constant target-flanker distance.
+        """
+
+        std_dev = np.std(y_values)
+        max_y_idx = np.argmax(y_values)
+
+        # initialize the index for the first data point after the maximum y_value
+        # that deviates from the maximum by at least 1 standard deviation
+        index_to_remove = None
+
+        # iterate through the y_values after the maximum y_value
+        for idx, y in enumerate(y_values[max_y_idx + 1:], start=max_y_idx + 1):
+            # check if all the remaining y_values deviate by at least 1 standard deviation
+            if all([abs(val - y_values[max_y_idx]) >= std_dev for val in y_values[idx:]]):
+                index_to_remove = idx
+                break
+        pre_remove_length = len(y_values)
+        # if we found an index to remove, remove the data after that index
+        if index_to_remove is not None:
+            x_values = x_values[:index_to_remove]
+            y_values = y_values[:index_to_remove]
+
+        # check if at least a third of the elements remain. This is done so that we have an adequate amount of data
+        # to fit a psychometric threshold on.
+        remaining_fraction = len(y_values) / pre_remove_length
+        is_at_least_third_remaining = remaining_fraction >= 1 / 3
+
+        return x_values, y_values, is_at_least_third_remaining
+
+
+class ThresholdElevation(Threshold):
+    """
+    Computes a threshold elevation from two conditions, a baseline condition and a test condition, by dividing
+    the threshold of the test condition by the threshold of the baseline condition. In other words,
+
+    `threshold_elevation = test_condition_threshold / baseline_condition_threshold`.
+    """
+    def __init__(self,
+                 independent_variable: str,
+                 baseline_condition: str,
+                 test_condition: str,
+                 threshold_accuracy: Union[str, float] = 'inflection',
+                 required_baseline_accuracy: Optional[float] = 0.6,
+                 required_test_accuracy: Optional[float] = 0.6
+                 ):
+        """
+        :param independent_variable: The independent variable in the benchmark that the threshold is computed
+                                     over.
+        :param baseline_condition: The baseline condition against which threshold elevation is measured.
+        :param test_condition: The test condition that is used to measure threshold elevation.
+        :param threshold_accuracy: The accuracy at which the threshold is evaluated. This can be
+                                   either a string Literal['inflection'] or a float. When Literal['inflection']
+                                   is used, the function finds the inflection point of the curve and evaluates
+                                   the threshold at that level. When a float is used, the function evaluates
+                                   the threshold at that level.
+        :param required_baseline_accuracy: The minimum accuracy required for the psychometric function fit to be
+                                           considered for the baseline condition.
+        :param required_test_accuracy: The minimum accuracy required for the psychometric function fit to be
+                                       considered for the test condition.
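+
+        Example (illustrative; arguments mirror the Malania2007 benchmark's `load_metric` call):
+
+            metric = ThresholdElevation(independent_variable='image_label',
+                                        baseline_condition='vernier_only', test_condition='short2',
+                                        threshold_accuracy=0.75)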
+ """ + super(ThresholdElevation, self).__init__(independent_variable) + self.baseline_threshold_metric = Threshold(self._independent_variable, + threshold_accuracy=threshold_accuracy, + required_accuracy=required_baseline_accuracy) + self.test_threshold_metric = Threshold(self._independent_variable, + threshold_accuracy=threshold_accuracy, + required_accuracy=required_test_accuracy) + self.baseline_condition = baseline_condition + self.test_condition = test_condition + self.threshold_accuracy = threshold_accuracy + + def __call__(self, + source: Union[float, Dict[str, BehavioralAssembly]], + target: Union[list, PropertyAssembly, Dict[str, PropertyAssembly]] + ) -> Score: + """ + :param source: Either a dictionary containing the BehavioralAssemblies for the test condition and the + baseline condition, or a pre-computed float threshold elevation. If Dict, Dict + keys should be 'condition_assembly' and 'baseline_assembly' respectively. + :param target: Either a dictionary containing the PropertyAssemblies for the test condition and the + baseline condition, or a list of pre-computed threshold elevations. If Dict, Dict + keys should be 'condition_assembly' and 'baseline_assembly' respectively. + :return: A score containing the evaluated model's ceiling-adjusted distance to target threshold elevations. + """ + # check whether source is a threshold elevation already - if not, compute it. + if isinstance(source, float): + raw_source_threshold_elevation = source + elif isinstance(source, Dict): + source_baseline_threshold = self.baseline_threshold_metric.compute_threshold(source[self.baseline_condition], + self._independent_variable) + + # if using the inflection accuracy, get the inflection point from the baseline condition, and use that + # for the test condition. + if self.threshold_accuracy == 'inflection': + self.test_threshold_metric.threshold_accuracy = self.baseline_threshold_metric.threshold_accuracy + source_test_threshold = self.test_threshold_metric.compute_threshold(source[self.test_condition], + self._independent_variable) + if source_baseline_threshold == 'fit_fail' or source_test_threshold == 'fit_fail': + return Score(0.) # psychometric function could not be fit -- this typically means that the model is at chance throughout + raw_source_threshold_elevation = source_test_threshold / source_baseline_threshold + else: + raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') + + # check whether the targets are threshold elevations already - if not, compute them + if isinstance(target, list): + target_threshold_elevations = target + elif isinstance(target, PropertyAssembly): + target_threshold_elevations = target.values.tolist() + elif isinstance(target, Dict): + target_threshold_elevations = self.compute_threshold_elevations(target) + else: + raise TypeError(f'target is type {type(target)}, but type PropertyAssembly or list is required.') + + # compare threshold elevation to target threshold elevations + return self.scoring_function(raw_source_threshold_elevation, target_threshold_elevations) + + def ceiling(self, assemblies: Dict[str, PropertyAssembly]) -> Score: + """ + Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against + a pool of the other values. + + Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ + jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. 
+ """ + human_threshold_elevations = self.compute_threshold_elevations(assemblies) + scores = [] + for i in range(len(human_threshold_elevations)): + random_state = np.random.RandomState(i) + random_human_score = random_state.choice(human_threshold_elevations, replace=False) + metric = ThresholdElevation(self._independent_variable, self.baseline_condition, self.test_condition, + self.threshold_accuracy) + human_threshold_elevations.remove(random_human_score) + score = metric(random_human_score, human_threshold_elevations) + human_threshold_elevations.append(random_human_score) + scores.append(score.values) + + ceiling, ceiling_error = np.mean(scores), np.std(scores) + ceiling = Score(ceiling) + ceiling.attrs['raw'] = scores + ceiling.attrs['error'] = ceiling_error + return ceiling + + @staticmethod + def compute_threshold_elevations(assemblies: Dict[str, PropertyAssembly]) -> List: + """ + Computes the threshold elevations of a baseline condition and a test condition: + + `threshold_elevation = test_condition_threshold / baseline_condition_threshold`. + """ + baseline_assembly = assemblies['baseline_assembly'] + condition_assembly = assemblies['condition_assembly'] + threshold_elevations = [] + for i, baseline_threshold in enumerate(baseline_assembly.values): + condition_threshold = condition_assembly.values[i] + threshold_elevations.append(condition_threshold / baseline_threshold) + return threshold_elevations diff --git a/brainscore_vision/metrics/threshold/test.py b/brainscore_vision/metrics/threshold/test.py new file mode 100644 index 000000000..84f0c7214 --- /dev/null +++ b/brainscore_vision/metrics/threshold/test.py @@ -0,0 +1,71 @@ +from pytest import approx + +from brainio.assemblies import PropertyAssembly +from brainscore_vision import load_metric + + +def test_threshold_score_from_thresholds(): + assembly = _make_threshold_data() + # independent_variable is not used since we compute from thresholds, and do not need to fit them + metric = load_metric('threshold', independent_variable='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert score == approx(0.5625) + + +def test_threshold_elevation_score_from_threshold_elevations(): + assembly = _make_threshold_elevation_data() + # independent_variable is not used since we compute from thresholds, and do not need to fit them + metric = load_metric('threshold_elevation', + independent_variable='placeholder', + baseline_condition='placeholder', + test_condition='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert score == approx(0.525) + + +def test_threshold_has_error(): + assembly = _make_threshold_data() + metric = load_metric('threshold', independent_variable='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'error') + + +def test_threshold_elevation_has_error(): + assembly = _make_threshold_elevation_data() + metric = load_metric('threshold_elevation', + independent_variable='placeholder', + baseline_condition='placeholder', + test_condition='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'error') + + +def test_threshold_has_raw(): + assembly = _make_threshold_data() + metric = load_metric('threshold', independent_variable='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'raw') + + +def test_threshold_elevation_has_raw(): + assembly = _make_threshold_elevation_data() + metric = 
load_metric('threshold_elevation', + independent_variable='placeholder', + baseline_condition='placeholder', + test_condition='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'raw') + + +def _make_threshold_data(): + # Subjects have thresholds of 10, 20, 40, and 20 respectively. + return PropertyAssembly([10.0, 20.0, 40.0, 20.0], + coords={'subject': ('subject', ['A', 'B', 'C', 'D'])}, + dims=['subject']) + + +def _make_threshold_elevation_data(): + # Subjects have threshold elevations of 3, 2, 1.5, and 5 respectively. + return PropertyAssembly([3.0, 2.0, 1.5, 5.0], + coords={'subject': ('subject', ['A', 'B', 'C', 'D'])}, + dims=['subject'])
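+
+
+def test_threshold_from_synthetic_responses():
+    # Illustrative sketch rather than a packaged-data test: fits a threshold on synthetic
+    # BehavioralAssembly responses whose accuracy grows with 'offset' (a hypothetical
+    # independent variable). The assembly layout mirrors what Threshold.compute_threshold
+    # expects: presentation x choice probabilities with 'image_label' and 'choice' coordinates.
+    import numpy as np
+    from brainio.assemblies import BehavioralAssembly
+    rng = np.random.RandomState(0)
+    offsets = np.repeat(np.linspace(5., 50., 10), 50)           # independent variable per presentation
+    labels = rng.choice(['left', 'right'], size=len(offsets))   # ground-truth labels
+    p_correct = 0.5 + 0.45 * offsets / offsets.max()            # accuracy increases with offset
+    correct = rng.rand(len(offsets)) < p_correct
+    p_left = np.where((labels == 'left') == correct, 0.9, 0.1)  # argmax over choices reproduces `correct`
+    assembly = BehavioralAssembly(np.stack([p_left, 1 - p_left], axis=1),
+                                  coords={'offset': ('presentation', offsets),
+                                          'image_label': ('presentation', labels),
+                                          'choice': ('choice', ['left', 'right'])},
+                                  dims=['presentation', 'choice'])
+    metric = load_metric('threshold', independent_variable='offset', threshold_accuracy=0.75)
+    threshold = metric.compute_threshold(assembly, 'offset')
+    assert threshold != 'fit_fail'  # with this synthetic data, the psychometric fit should succeed
+    assert offsets.min() <= threshold <= offsets.max()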