diff --git a/brainscore_vision/benchmarks/malania2007/__init__.py b/brainscore_vision/benchmarks/malania2007/__init__.py new file mode 100644 index 000000000..7c6a3bbd1 --- /dev/null +++ b/brainscore_vision/benchmarks/malania2007/__init__.py @@ -0,0 +1,13 @@ +from brainscore_vision import benchmark_registry +from . import benchmark + +benchmark_registry['Malania2007.short2-threshold_elevation'] = lambda: benchmark._Malania2007Base('short2') +benchmark_registry['Malania2007.short4-threshold_elevation'] = lambda: benchmark._Malania2007Base('short4') +benchmark_registry['Malania2007.short6-threshold_elevation'] = lambda: benchmark._Malania2007Base('short6') +benchmark_registry['Malania2007.short8-threshold_elevation'] = lambda: benchmark._Malania2007Base('short8') +benchmark_registry['Malania2007.short16-threshold_elevation'] = lambda: benchmark._Malania2007Base('short16') +benchmark_registry['Malania2007.equal2-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal2') +benchmark_registry['Malania2007.long2-threshold_elevation'] = lambda: benchmark._Malania2007Base('long2') +benchmark_registry['Malania2007.equal16-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal16') +benchmark_registry['Malania2007.long16-threshold_elevation'] = lambda: benchmark._Malania2007Base('long16') +benchmark_registry['Malania2007.vernieracuity-threshold'] = lambda: benchmark._Malania2007VernierAcuity() diff --git a/brainscore_vision/benchmarks/malania2007/benchmark.py b/brainscore_vision/benchmarks/malania2007/benchmark.py new file mode 100644 index 000000000..7ad587b4d --- /dev/null +++ b/brainscore_vision/benchmarks/malania2007/benchmark.py @@ -0,0 +1,224 @@ +from typing import Tuple +import numpy as np + +import brainscore_vision +from brainio.assemblies import PropertyAssembly +from brainscore_vision.benchmarks import BenchmarkBase +from brainscore_vision.benchmark_helpers.screen import place_on_screen +from brainscore_vision import load_metric +from brainscore_vision.model_interface import BrainModel +from brainscore_vision.utils import LazyLoad +from brainscore_core.metrics import Score + + +BIBTEX = """@article{malania2007, + author = {Malania, Maka and Herzog, Michael H. and Westheimer, Gerald}, + title = "{Grouping of contextual elements that affect vernier thresholds}", + journal = {Journal of Vision}, + volume = {7}, + number = {2}, + pages = {1-1}, + year = {2007}, + issn = {1534-7362}, + doi = {10.1167/7.2.1}, + url = {https://doi.org/10.1167/7.2.1} + }""" + +BASELINE_CONDITION = 'vernier_only' +DATASETS = ['short2-threshold_elevation', 'short4-threshold_elevation', 'short6-threshold_elevation', + 'short8-threshold_elevation', 'short16-threshold_elevation', 'equal2-threshold_elevation', + 'long2-threshold_elevation', 'equal16-threshold_elevation', 'long16-threshold_elevation', + 'vernieracuity-threshold'] +# Values in NUM_FLANKERS_PER_CONDITION denote the condition (i.e., in this case the number of flankers) to be selected +# This is kept track of simply because the benchmark uses threshold elevation - i.e., a comparison of 2 conditions +NUM_FLANKERS_PER_CONDITION = {'short2': 2, 'short4': 4, 'short6': 6, 'short8': 8, + 'short16': 16, 'equal2': 2, 'long2': 2, 'equal16': 16, + 'long16': 16, 'vernier_only': 0} + + +class _Malania2007Base(BenchmarkBase): + """ + INFORMATION: + + Benchmark DATASETS should be considered as independent. This means that participant-specific across-condition data + should only ever be compared using the 'subject_unique_id'. 
In some conditions (short-2, vernier_only, short-16),
+    an additional observer was added based on the original paper's plots. In these conditions, two
+    experiments were conducted independently, and one additional observer who participated in only one of the
+    two experiments was added to the aggregate benchmark.
+
+    While humans and models perform the same testing task in this benchmark, a number of choices made in this
+    benchmark deviate slightly from the human experiment. These choices are listed below, alongside the reason
+    for each departure and what the 'precisely faithful' alternative would have been.
+
+    Benchmark Choices:
+
+    1) The number and type of fitting stimuli are choices not grounded in the human experiment. Currently, the
+       number of fitting stimuli is chosen to be relatively large, and hopefully sufficient for decoding in the
+       baseline condition in general.
+        - Precisely faithful alternative: Present text instructions to models as they were presented to humans.
+            * Why not this alternative? Since the experiment is about early visual perception, and there are
+              currently few/no models capable of a task like this, it would not be interesting.
+        - Somewhat faithful alternative: Present a smaller number of training stimuli, motivated by work like
+          Lee & DiCarlo (2023), bioRxiv (https://doi.org/10.1101/2022.12.31.522402).
+            * Why not this alternative? Since the experiment is not about perceptual learning but about early
+              visual perception, and there are few/no models capable of a task like this, it would not be
+              interesting.
+        - Importantly, this means the benchmark examines the models' capability to support a task like this,
+          rather than their capability to learn a task like this.
+    2) In the human experiment, stimuli were presented at exactly the foveal position. In the model experiment,
+       testing stimuli are presented at the foveal position ±72 arcsec = 0.02 deg.
+        * Why this alternative? Since most models evaluated are deterministic at test time, we want a more
+          precise estimate of the threshold than a single point estimate. Since small human microsaccades are
+          generally uncontrolled and cannot be controlled for (e.g., up to 360 arcsec = 6 arcmin = 0.1 deg), we
+          believe the tiny jitter of 0.02 deg has no impact on the comparison under study, while improving the
+          precision of threshold estimates.
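+
+    Example (illustrative; identifiers as registered in this plugin's __init__.py):
+
+        benchmark = brainscore_vision.load_benchmark('Malania2007.short2-threshold_elevation')
+        score = benchmark(model)  # `model` is any BrainModel supporting the probabilities task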
+ + """ + def __init__(self, condition: str): + self.baseline_condition = BASELINE_CONDITION + self.condition = condition + + # since this benchmark compares threshold elevation against a baseline, we omit one subject + # in some conditions in which that subject did not perform both the baseline and the test + # condition + baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition)) + condition_assembly = LazyLoad(lambda: load_assembly(self.condition)) + self._assembly, self._baseline_assembly = filter_baseline_subjects(condition_assembly, + baseline_assembly) + + self._assemblies = {'baseline_assembly': self._baseline_assembly, + 'condition_assembly': self._assembly} + self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}'.rstrip('-threshold_elevation')) + self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}'.rstrip('-threshold_elevation')) + self._stimulus_sets = {self.condition: self._stimulus_set, + self.baseline_condition: self._baseline_stimulus_set} + self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}'.rstrip('-threshold_elevation') + '_fit') + + self._metric = load_metric('threshold_elevation', + independent_variable='image_label', + baseline_condition=self.baseline_condition, + test_condition=self.condition, + threshold_accuracy=0.75) + + self._visual_degrees = 2.986667 + self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates + + super(_Malania2007Base, self).__init__( + identifier=f'Malania2007.{condition}', version=1, + ceiling_func=lambda: self._metric.ceiling(self._assemblies), + parent='Malania2007', + bibtex=BIBTEX) + + def __call__(self, candidate: BrainModel): + model_responses = {} + candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli, + number_of_trials=2, require_variance=True) + for condition in (self.baseline_condition, self.condition): + stimulus_set = place_on_screen( + self._stimulus_sets[condition], + target_visual_degrees=candidate.visual_degrees(), + source_visual_degrees=self._visual_degrees + ) + model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials, + require_variance=True) + + raw_score = self._metric(model_responses, self._assemblies) + + # Adjust score to ceiling + ceiling = self.ceiling + score = raw_score / ceiling + + score.attrs['raw'] = raw_score + score.attrs['ceiling'] = ceiling + return score + + +class _Malania2007VernierAcuity(BenchmarkBase): + def __init__(self): + self.baseline_condition = BASELINE_CONDITION + self.conditions = DATASETS.copy() + self.conditions.remove('vernieracuity-threshold') + + self._assemblies = {condition: {'baseline_assembly': self.get_assemblies(condition)['baseline_assembly'], + 'condition_assembly': self.get_assemblies(condition)['condition_assembly']} + for condition in self.conditions} + self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}') + self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(f'Malania2007.{condition}'.rstrip('-threshold_elevation') + '_fit') + for condition in self.conditions} + + self._metric = load_metric('threshold', + independent_variable='image_label', + threshold_accuracy=0.75) + + self._visual_degrees = 2.986667 + self._number_of_trials = 10 # arbitrary choice for microsaccades to improve precision of estimates + + super(_Malania2007VernierAcuity, 
self).__init__( + identifier=f'Malania2007.vernieracuity-threshold', version=1, + ceiling_func=lambda: self.mean_ceiling(), + parent='Malania2007', + bibtex=BIBTEX) + + def __call__(self, candidate: BrainModel): + scores = [] + for condition in self.conditions: + candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli[condition], + number_of_trials=2, require_variance=True) + stimulus_set = place_on_screen( + self._stimulus_set, + target_visual_degrees=candidate.visual_degrees(), + source_visual_degrees=self._visual_degrees + ) + model_response = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials, + require_variance=True) + + raw_score = self._metric(model_response, self._assemblies[condition]) + # Adjust score to ceiling + ceiling = self.ceiling + score = raw_score / ceiling + score.attrs['error'] = raw_score.error + + score.attrs['raw'] = raw_score + score.attrs['ceiling'] = ceiling + scores.append(score) + # average all scores to get 1 average score + mean_score = Score(np.mean(scores)) + mean_score.attrs['error'] = np.mean([score.error for score in scores]) + return mean_score + + def get_assemblies(self, condition: str): + condition = condition.rstrip('-threshold_elevation') + baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition)) + condition_assembly = LazyLoad(lambda: load_assembly(condition)) + assembly, baseline_assembly = filter_baseline_subjects(condition_assembly, + baseline_assembly) + return {'condition_assembly': assembly, + 'baseline_assembly': baseline_assembly} + + def mean_ceiling(self): + ceilings = [] + errors = [] + for assembly_name in self._assemblies.keys(): + this_ceiling = self._metric.ceiling(self._assemblies[assembly_name]['baseline_assembly']) + ceilings.append(this_ceiling.values) + errors.append(this_ceiling.error) + mean_ceiling = Score(np.mean(ceilings)) + mean_ceiling.attrs['error'] = np.mean(errors) + return mean_ceiling + + +def load_assembly(dataset: str) -> PropertyAssembly: + assembly = brainscore_vision.load_dataset(f'Malania2007.{dataset}') + return assembly + + +def filter_baseline_subjects(condition_assembly: PropertyAssembly, + baseline_assembly: PropertyAssembly + ) -> Tuple[PropertyAssembly, PropertyAssembly]: + """A function to select only the unique subjects that exist in the condition_assembly.""" + non_nan_mask = ~np.isnan(condition_assembly.values) + unique_ids = condition_assembly.coords['subject'][non_nan_mask].values.tolist() + + mask = baseline_assembly.coords['subject'].isin(unique_ids) + filtered_baseline_assembly = baseline_assembly.where(mask, drop=True) + filtered_condition_assembly = condition_assembly.where(mask, drop=True) + return filtered_condition_assembly, filtered_baseline_assembly diff --git a/brainscore_vision/benchmarks/malania2007/test.py b/brainscore_vision/benchmarks/malania2007/test.py new file mode 100644 index 000000000..8a8ca75cd --- /dev/null +++ b/brainscore_vision/benchmarks/malania2007/test.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest +from pytest import approx + +from brainscore_vision import benchmark_registry, load_benchmark, load_model +from brainscore_vision.benchmarks.malania2007.benchmark import DATASETS + + +class TestBehavioral: + def test_count(self): + assert len(DATASETS) == 5 + 2 + 2 + 1 + + @pytest.mark.parametrize('dataset', DATASETS) + def test_in_pool(self, dataset): + identifier = f"Malania2007.{dataset}" + assert identifier in benchmark_registry + + @pytest.mark.private_access + def 
test_mean_ceiling(self): + benchmarks = [f"Malania2007.{dataset}" for dataset in DATASETS] + benchmarks = [load_benchmark(benchmark) for benchmark in benchmarks] + ceilings = [benchmark.ceiling for benchmark in benchmarks] + mean_ceiling = np.mean(ceilings) + assert mean_ceiling == approx(0.5757928329186803, abs=0.001) + + # these test values are for the pooled score ceiling + @pytest.mark.private_access + @pytest.mark.parametrize('dataset, expected_ceiling', [ + ('short2-threshold_elevation', approx(0.78719345, abs=0.001)), + ('short4-threshold_elevation', approx(0.49998989, abs=0.001)), + ('short6-threshold_elevation', approx(0.50590051, abs=0.001)), + ('short8-threshold_elevation', approx(0.4426336, abs=0.001)), + ('short16-threshold_elevation', approx(0.8383443, abs=0.001)), + ('equal2-threshold_elevation', approx(0.56664015, abs=0.001)), + ('long2-threshold_elevation', approx(0.46470421, abs=0.001)), + ('equal16-threshold_elevation', approx(0.44087153, abs=0.001)), + ('long16-threshold_elevation', approx(0.50996587, abs=0.001)), + ('vernieracuity-threshold', approx(0.70168481, abs=0.001)) + ]) + def test_dataset_ceiling(self, dataset, expected_ceiling): + benchmark = f"Malania2007.{dataset}" + benchmark = load_benchmark(benchmark) + ceiling = benchmark.ceiling + assert ceiling == expected_ceiling + + @pytest.mark.parametrize('dataset, expected_score', [ + ('short2-threshold_elevation', approx(0.0, abs=0.001)), + ('short4-threshold_elevation', approx(0.0, abs=0.001)), + ('short6-threshold_elevation', approx(0.0, abs=0.001)), + ('short8-threshold_elevation', approx(0.0, abs=0.001)), + ('short16-threshold_elevation', approx(0.0, abs=0.001)), + ('equal2-threshold_elevation', approx(0.0, abs=0.001)), + ('long2-threshold_elevation', approx(0.0, abs=0.001)), + ('equal16-threshold_elevation', approx(0.0, abs=0.001)), + ('long16-threshold_elevation', approx(0.0, abs=0.001)), + ('vernieracuity-threshold', approx(0.0, abs=0.001)) + ]) + def test_model_score(self, dataset, expected_score): + benchmark = f"Malania2007.{dataset}" + benchmark = load_benchmark(benchmark) + model = load_model('alexnet') + model_score = benchmark(model) + assert model_score.values == expected_score diff --git a/brainscore_vision/data/malania2007/__init__.py b/brainscore_vision/data/malania2007/__init__.py new file mode 100644 index 000000000..e6ecbb5cd --- /dev/null +++ b/brainscore_vision/data/malania2007/__init__.py @@ -0,0 +1,254 @@ +from brainio.assemblies import PropertyAssembly + +from brainscore_vision import data_registry, stimulus_set_registry, load_stimulus_set +from brainscore_vision.data_helpers.s3 import load_assembly_from_s3, load_stimulus_set_from_s3 + + +BIBTEX = """@article{malania2007, + author = {Malania, Maka and Herzog, Michael H. 
and Westheimer, Gerald}, + title = "{Grouping of contextual elements that affect vernier thresholds}", + journal = {Journal of Vision}, + volume = {7}, + number = {2}, + pages = {1-1}, + year = {2007}, + issn = {1534-7362}, + doi = {10.1167/7.2.1}, + url = {https://doi.org/10.1167/7.2.1} + }""" + + +data_registry['Malania2007.equal2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_equal-2', + version_id="yFXK8xjGjEmuYTSfS58rGS_ah3.NGg0X", + sha1="277b2fbffed00e16b6a69b488f73eeda5abaaf10", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.equal16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_equal-16', + version_id="SRZ7bs.Ek59GkeS084Pvdy38uTzFs4yw", + sha1="ef49506238e8d2554918b113fbc60c133077186e", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.long2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_long-2', + version_id="2c1lWuXthb3rymB3seTQX1jVqiKUTn1f", + sha1="9076a5b693948c4992b6c8e753f04a7acd2014a1", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.long16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_long-16', + version_id="qshNxhxjgusWyWiXnbfFN6gqjLgRh8fO", + sha1="3106cf1f2fa9e66617ebf231df05d29077fc478f", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short2'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-2', + version_id="8CQ9MupuljAgkkKUXs3hiOliHg8xoDxb", + sha1="85fb65ad76de48033c704b9c5689771e1ea0457d", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short4'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-4', + version_id=".ZUO0upSfQrWLPgd4oGwAaCbN4bz6S6H", + sha1="75506be9a26ec38a223e41510f1a8cb32d5b0bc9", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short6'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-6', + version_id="q4FugpNGkT_FQP..qIVzye83hAQR2xfS", + sha1="2901be6b352e67550da040d79d744819365b8626", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short8'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-8', + version_id="4_lcRl_I7Mp0RHxcfqZ9tkAZjVh.5oMU", + sha1="6daf47b086cb969d75222e320f49453ed8437885", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.short16'] = lambda: load_assembly_from_s3( + identifier='Malania2007_short-16', + version_id="fFqEIyIC9CHzqTEmv0MitjCgpeMX5pxJ", + sha1="8ae0898caad718b747f85fce5888416affc3a569", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) +data_registry['Malania2007.vernier_only'] = lambda: load_assembly_from_s3( + identifier='Malania2007_vernier-only', + version_id="JLWf2pIR_UadQHqwtegJkC6XzWdbSNGi", + sha1="1cf83e8b6141f8b0d67ea46994f342325f62001f", + bucket="brainio-brainscore", + cls=PropertyAssembly, + stimulus_set_loader=None, +) + + +stimulus_set_registry['Malania2007.equal2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-2', + bucket="brainio-brainscore", + csv_sha1="77e94b9b5122a83ebbaffb4a06fcab68ef652751", + zip_sha1="99826d459f6920dafab72eed69eb2a90492ce796", + csv_version_id="MlRpSz.4.jvVRFAZl8tGEum1P0Q0GtyS", + 
zip_version_id="vHbAM_FjTbjp5U12BkAelJu4KW6PLYFn" +) +stimulus_set_registry['Malania2007.equal2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-2_fit', + bucket="brainio-brainscore", + csv_sha1="bafdfc855c164d3e5443d67dcf9eb7762443f964", + zip_sha1="e52fec1a79ac8837e331b180c2a8a140840d6666", + csv_version_id="PIXEW.2vHvjIBP0Q2KHIpnxns7t9o8Cf", + zip_version_id="h7pp84CYFGLKlPhveD0L5ogePqisk_I7" +) +stimulus_set_registry['Malania2007.equal16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-16', + bucket="brainio-brainscore", + csv_sha1="5fedcff56c302339c3451ae2edbcb846c39c3189", + zip_sha1="b30dc2dc90e4f3d88775622e558db963765f38e0", + csv_version_id="VmRGiQkhPALDwq74NpE2VpTiKTGn.30T", + zip_version_id="c.DOlVULXZingRJ9gVY_NbZwRrj_xs_i" +) +stimulus_set_registry['Malania2007.equal16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_equal-16_fit', + bucket="brainio-brainscore", + csv_sha1="3de3e5de19a638767a01ba68cb690dc746c29a77", + zip_sha1="1728920c5ea4fb7b3a3cf3c076165aca65c8b751", + csv_version_id="joAq8JBC_7axZDfLNFgoXFhTCLU_KKr_", + zip_version_id="77JRwdldaHDr6TLW1NnB5HucIrkUCVg." +) +stimulus_set_registry['Malania2007.long2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-2', + bucket="brainio-brainscore", + csv_sha1="ba65316a63dc688d8dfb410219a28fd02850b991", + zip_sha1="7fd431fbbd4a4dc0cd271624d3297c19a28a70b5", + csv_version_id="_0fqObn6k5KvXurHMsuD4IqtrqbNskyo", + zip_version_id="foL92ndVAAAETzMYHdmMtwIwKxXYhAB." +) +stimulus_set_registry['Malania2007.long2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-2_fit', + bucket="brainio-brainscore", + csv_sha1="b91dd9261c1d47bdd37f9b60eb8066b7b719709f", + zip_sha1="5be3e1cd57b59081103715b5d318505166e0045e", + csv_version_id="mATh8lcVisdsDnPnU6ACE23iBPfpkLZA", + zip_version_id="6nEviShTyCYQKrmxyjDyNov9Skc77eXT" +) +stimulus_set_registry['Malania2007.long16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-16', + bucket="brainio-brainscore", + csv_sha1="1f1b03319b81698ba5e7db389dcd4248f94e45ca", + zip_sha1="97c70462a28905b58058c687880188d634d357f0", + csv_version_id="4RtywQ40hfQA4N80g8lxEScAmMXFRg7E", + zip_version_id="lJy2QosABzHtiA6BJaE4OqCn1w1Jhz2k" +) +stimulus_set_registry['Malania2007.long16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_long-16_fit', + bucket="brainio-brainscore", + csv_sha1="d80a02c75b9908301c3c8dc9f7116fecf8e060ec", + zip_sha1="d8819b94d3f502d7a382c8a0db0a34627132e5e2", + csv_version_id="gOxY6tjnT7LO.FDeL1xkRmowl5wYeAia", + zip_version_id="71UAPTnZscIuqdx2dhuW9V0O0DO_TgTM" +) +stimulus_set_registry['Malania2007.short2'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-2', + bucket="brainio-brainscore", + csv_sha1="bf0252056d2084e855646f624700ab03c19cfc3d", + zip_sha1="eee1270feb7443e7e315d8feb7fb0a6b6908f554", + csv_version_id="zcJqM.ZPwJyiMRWa3RBdvv401yPnLQAp", + zip_version_id="C8WZzAAQ0JGHAAKii4JpvlRhcUOhgSj." 
+) +stimulus_set_registry['Malania2007.short2_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-2_fit', + bucket="brainio-brainscore", + csv_sha1="73127d279a2cd254ae4f07b0053580851e84b00c", + zip_sha1="918736349d714a4f784c29bf7e7d218b103e128d", + csv_version_id="iwGRp3_ktAHfJ6r7ktSK9gsthDjKek70", + zip_version_id="6RpplJ9UVXTlvhmFSXla0Qa20b44m8Ds" +) +stimulus_set_registry['Malania2007.short4'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-4', + bucket="brainio-brainscore", + csv_sha1="816326d89d358f6592bd1f789e5c8d429fbca1cd", + zip_sha1="ff57d976ef75ede9148a4097e90d6cf6c8054d34", + csv_version_id="Waikk.bktXIncCUtCIAyB2EqynGk.H.F", + zip_version_id="rl_muxI4UEpwXVaXuhsqroG..COGILvR" +) +stimulus_set_registry['Malania2007.short4_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-4_fit', + bucket="brainio-brainscore", + csv_sha1="3512cfd029f4e4299bc41ede519e691d80cfc3d5", + zip_sha1="301386408dd1fb8556881f9a171be2d43dbfec6e", + csv_version_id="UhisdJqiEmkQ_4zsUtAmaxtle2kMZdcD", + zip_version_id="xt_v0xgCB8YUptyPB0yZFHIUcel5MF_x" +) +stimulus_set_registry['Malania2007.short6'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-6', + bucket="brainio-brainscore", + csv_sha1="3d5dd9b48a56ba0c31de94b6221b97df962b6f8a", + zip_sha1="120d90a143d1577d4745c3f69291d0db6c7e512e", + csv_version_id="GwGHPJkMDdg8N_.boyj8qJ3ChsEx4w._", + zip_version_id="gIN1O4yz.THvK0Ifm5M3AI58ZACE1QFh" +) +stimulus_set_registry['Malania2007.short6_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-6_fit', + bucket="brainio-brainscore", + csv_sha1="27a5be4fca190837fc5b75ed2cdbbffbf6b41338", + zip_sha1="c88e05c6cadec88a2c9475b0735323a2b049bd75", + csv_version_id="oMlj7wV85s00hJFE84ym0AJHLCfYHVA6", + zip_version_id="oS.KrBTlcYAgr_lWyA_bIjVc2js_VeUe" +) +stimulus_set_registry['Malania2007.short8'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-8', + bucket="brainio-brainscore", + csv_sha1="8fc35f607196b4c0cdcebd8102d17e3a637e5988", + zip_sha1="a9215ed0cb0f0333582dda65f6afd7015c506ba5", + csv_version_id="gzys8s7j7euMEl7JJpqBFLFHMpFjwbA7", + zip_version_id="3fYb4Iruh3lRKUwC1APqFH4CNbE5DEuk" +) +stimulus_set_registry['Malania2007.short8_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-8_fit', + bucket="brainio-brainscore", + csv_sha1="aa4133a9fe19a3c9004a9cb5e6eb5a72564e4883", + zip_sha1="beb9f068794708e41750202b78c438538a40a8fb", + csv_version_id="7N1Z.uiagqBknJUSBQ4mVfHKWgocM5aA", + zip_version_id="kcEOPOkvWymO0wX5j_QKxcNPl9sZsjFd" +) +stimulus_set_registry['Malania2007.short16'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-16', + bucket="brainio-brainscore", + csv_sha1="addd260c9959f2f315db03c0a39c6c1b01fef685", + zip_sha1="cba4c2866ec692fb808471df7c2fed446d9fb3fe", + csv_version_id="Peu7WU5vanLoZNOFIAbuPzZNPDRgbCSX", + zip_version_id="wFkJkZMC8Fs_HfPJy32CMKcHJWeQIUDB" +) +stimulus_set_registry['Malania2007.short16_fit'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_short-16_fit', + bucket="brainio-brainscore", + csv_sha1="9b340fe242117482f6992f48a805297215ba9924", + zip_sha1="4a90d511a3ceb3307a672177a3ad6b76521e65e5", + csv_version_id="sYBPEmXDgbWipuepciLirlorQE3L8BLc", + zip_version_id="pYvOkrLxadkQ67K3__wmciNwaCW.hyyN" +) +stimulus_set_registry['Malania2007.vernier_only'] = lambda: load_stimulus_set_from_s3( + identifier='Malania2007_vernier-only', + bucket="brainio-brainscore", + 
csv_sha1="b2cb0f2ed32426b739f90187ae24ad4adf84110d", + zip_sha1="0e177aea523adc320070196fbb777af4cdba2144", + csv_version_id="c8wpZpqoMqdATlqdoq3srPUi_8fYg6a.", + zip_version_id="28lHgxERhw32Ux6IBCxWWTtRwIaRrwo6" +) diff --git a/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py new file mode 100644 index 000000000..091ac3fa6 --- /dev/null +++ b/brainscore_vision/data/malania2007/data_packaging/malania_data_assembly.py @@ -0,0 +1,79 @@ +from pathlib import Path +import numpy as np +import xarray as xr + +from brainio.assemblies import PropertyAssembly +from brainio.packaging import package_data_assembly +import pandas as pd + + +DATASETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only'] +NUM_SUBJECTS = {'short-2': 6, + 'short-4': 5, + 'short-6': 5, + 'short-8': 5, + 'short-16': 6, + 'equal-2': 5, + 'long-2': 5, + 'equal-16': 5, + 'long-16': 5, + 'vernier-only': 6} + + +def collect_malania_data_assembly(root_directory, dataset): + """ + Experiment Information: + - 5-6 observers per condition (for exact value, see NUM_SUBJECTS) + - 2AFC left/right offset discrimination task + - PEST staircase to 75% correct responses + - thresholds measured with a cumulative gaussian psychometric function with a likelihood fit + """ + # construct the assembly + metadata_directory = Path(f'{root_directory}/{dataset}/metadata_human.xlsx') + metadata = pd.read_excel(metadata_directory) + # Since subjects are uniquely held using 'unique_subject_id', drop the rows with a subject + # without measurement + assembly = PropertyAssembly(metadata['threshold'], + coords={ + 'subject': ('subject', metadata['subject_unique_id']), + }, + dims=['subject'] + ) + + # give the assembly an identifier name + assembly.name = f'Malania2007_{dataset}' + + # test subject numbers after removing the NaN subject + metadata = metadata.dropna(subset=['threshold'], axis=0) + assert len(metadata) == NUM_SUBJECTS[dataset] + + return assembly + + +def remove_subjects_with_nans(assembly1, assembly2): + # Find the indices of the subjects with NaN values in the first PropertyAssembly + nan_subjects = np.isnan(assembly1.values) + + # Convert the boolean array to a DataArray with the same coordinates as the input assemblies + nan_subjects_da = xr.DataArray(nan_subjects, coords=assembly1.coords, dims=assembly1.dims) + + # Filter out the subjects with NaN values from both PropertyAssemblies + filtered_assembly1 = assembly1.where(~nan_subjects_da, drop=True) + filtered_assembly2 = assembly2.where(~nan_subjects_da, drop=True) + + return filtered_assembly1, filtered_assembly2 + + +if __name__ == '__main__': + root_directory = Path(r'.') + for dataset in DATASETS: + assembly = collect_malania_data_assembly(root_directory, dataset) + # upload to S3 + prints = package_data_assembly(catalog_identifier=None, + proto_data_assembly=assembly, + assembly_identifier=assembly.name, + stimulus_set_identifier=assembly.name, + assembly_class_name="PropertyAssembly", + bucket_name="brainio-brainscore") + print(prints) diff --git a/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py new file mode 100644 index 000000000..8a9f63fde --- /dev/null +++ b/brainscore_vision/data/malania2007/data_packaging/malania_stimulus_set.py @@ -0,0 +1,79 @@ +import csv +from pathlib import Path +from brainio.stimuli 
import StimulusSet +from brainio.packaging import package_stimulus_set + + +# every stimulus set is separate, incl. baseline condition +STIMULUS_SETS = ['short-2', 'short-4', 'short-6', 'short-8', 'short-16', 'equal-2', + 'long-2', 'equal-16', 'long-16', 'vernier-only', 'short-2_fit', + 'short-4_fit', 'short-6_fit', 'short-8_fit', 'short-16_fit', + 'equal-2_fit', 'long-2_fit', 'equal-16_fit', 'long-16_fit'] +DATASET_LENGTHS = {'test': 50, 'fit': 500} + + +def collect_malania_stimulus_set(root_directory, dataset): + """ + Dataset Meta Info + + Reported in pixels: + - image_size_x; image_size_y + - vernier_position_x; vernier_position_y + + Reported in arcsec: + - vernier_height (height of the vernier elements combined, - middle gap) + - vernier_offset (horizontal offset between flankers) + - flanker_height (height of the flanker elements) + - flanker_spacing (distance between a flanker element and another flanker element) + - line_width (width of all the lines in all elements) + - flanker_distance (distance between a flanker and a vernier) + """ + stimuli = [] + stimulus_paths = {} + + dataset_type = 'fit' if dataset[-3:] == 'fit' else 'test' + metadata_directory = Path(f'{root_directory}/{dataset}/metadata.csv') + image_directory = Path(f'{root_directory}/{dataset}/images') + with open(metadata_directory, 'r') as metadata: + reader = csv.DictReader(metadata) + for row in reader: + stimuli.append({ + 'image_size_x_pix': int(row['image_size_x_pix']), + 'image_size_y_pix': int(row['image_size_y_pix']), + 'image_size_c': int(row['image_size_c']), + 'image_size_degrees': float(row['image_size_degrees']), + 'vernier_height_arcsec': float(row['vernier_height_arcsec']), + 'vernier_offset_arcsec': float(row['vernier_offset_arcsec']), + 'image_label': row['image_label'], + 'flanker_height_arcsec': float(row['flanker_height_arcsec']), + 'flanker_spacing_arcsec': float(row['flanker_spacing_arcsec']), + 'line_width_arcsec': float(row['line_width_arcsec']), + 'flanker_distance_arcsec': float(row['flanker_distance_arcsec']), + 'num_flankers': int(row['num_flankers']), + 'vernier_position_x_pix': int(row['vernier_position_x_pix']), + 'vernier_position_y_pix': int(row['vernier_position_y_pix']), + 'stimulus_id': str(row['stimulus_id']), + }) + stimulus_paths[row['stimulus_id']] = Path(f'{image_directory}/{row["filename"]}') + + stimuli = StimulusSet(stimuli) + stimuli.stimulus_paths = stimulus_paths + stimuli.name = f'Malania2007_{dataset}' # give the StimulusSet an identifier name + stimuli.identifier = f'Malania2007_{dataset}' + + # Ensure expected number of stimuli in datasets + assert len(stimuli) == DATASET_LENGTHS[dataset_type] + return stimuli + + +if __name__ == '__main__': + root_directory = Path(r'../data/malania2007/data_packaging/') + for stimulus_set in STIMULUS_SETS: + stimuli = collect_malania_stimulus_set(root_directory, stimulus_set) + + # upload to S3 + prints = package_stimulus_set(catalog_name=None, + proto_stimulus_set=stimuli, + stimulus_set_identifier=stimuli.name, + bucket_name="brainio-brainscore") + print(prints) diff --git a/brainscore_vision/data/malania2007/test.py b/brainscore_vision/data/malania2007/test.py new file mode 100644 index 000000000..faf216749 --- /dev/null +++ b/brainscore_vision/data/malania2007/test.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest + +from brainscore_vision import load_stimulus_set, load_dataset +from brainscore_vision.benchmarks.malania2007.benchmark import DATASETS + + +@pytest.mark.private_access +class TestAssemblies: + # test the 
number of subjects: + @pytest.mark.parametrize('identifier, num_subjects', [ + ('short2', 6), + ('short4', 5), + ('short6', 5), + ('short8', 5), + ('short16', 6), + ('equal2', 5), + ('long2', 5), + ('equal16', 5), + ('long16', 5), + ('vernier_only', 6) + ]) + def test_num_subjects(self, identifier, num_subjects): + assembly = load_dataset(f"Malania2007.{identifier}") + assembly = assembly.dropna(dim='subject') + assert len(np.unique(assembly['subject'].values)) == num_subjects + + # test assembly coords present in ALL 17 sets: + @pytest.mark.parametrize('identifier', [ + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'vernier_only', + ]) + @pytest.mark.parametrize('field', [ + 'subject' + ]) + def test_fields_present(self, identifier, field): + assembly = load_dataset(f"Malania2007.{identifier}") + assert hasattr(assembly, field) + + +@pytest.mark.private_access +class TestStimulusSets: + # test stimulus_set data: + @pytest.mark.parametrize('identifier', [ + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'short2_fit', + 'short4_fit', + 'short6_fit', + 'short8_fit', + 'short16_fit', + 'equal2_fit', + 'long2_fit', + 'equal16_fit', + 'long16_fit', + 'vernier_only' + ]) + def test_stimulus_set_exist(self, identifier): + full_name = f"Malania2007.{identifier}" + stimulus_set = load_stimulus_set(full_name) + assert stimulus_set is not None + stripped_actual_identifier = stimulus_set.identifier.replace('.', '').replace('_', '').replace('-', '') + stripped_expected_identifier = full_name.replace('.', '').replace('_', '').replace('-', '') + assert stripped_actual_identifier == stripped_expected_identifier + + @pytest.mark.parametrize('identifier, num_images', [ + ('short2', 50), + ('short4', 50), + ('short6', 50), + ('short8', 50), + ('short16', 50), + ('equal2', 50), + ('long2', 50), + ('equal16', 50), + ('long16', 50), + ('short2_fit', 500), + ('short4_fit', 500), + ('short6_fit', 500), + ('short8_fit', 500), + ('short16_fit', 500), + ('equal2_fit', 500), + ('long2_fit', 500), + ('equal16_fit', 500), + ('long16_fit', 500), + ('vernier_only', 50) + ]) + def test_number_of_images(self, identifier, num_images): + stimulus_set = load_stimulus_set(f"Malania2007.{identifier}") + assert len(np.unique(stimulus_set['stimulus_id'].values)) == num_images + + # tests stimulus_set coords for the 14 "normal" sets: + @pytest.mark.parametrize('identifier', [ + 'short2', + 'short4', + 'short6', + 'short8', + 'short16', + 'equal2', + 'long2', + 'equal16', + 'long16', + 'short2_fit', + 'short4_fit', + 'short6_fit', + 'short8_fit', + 'short16_fit', + 'equal2_fit', + 'long2_fit', + 'equal16_fit', + 'long16_fit', + 'vernier_only' + ]) + @pytest.mark.parametrize('field', [ + 'image_size_x_pix', + 'image_size_y_pix', + 'image_size_c', + 'image_size_degrees', + 'vernier_height_arcsec', + 'vernier_offset_arcsec', + 'image_label', + 'flanker_height_arcsec', + 'flanker_spacing_arcsec', + 'line_width_arcsec', + 'flanker_distance_arcsec', + 'num_flankers', + 'vernier_position_x_pix', + 'vernier_position_y_pix', + 'stimulus_id', + ]) + def test_fields_present(self, identifier, field): + stimulus_set = load_stimulus_set(f"Malania2007.{identifier}") + assert hasattr(stimulus_set, field) diff --git a/brainscore_vision/metrics/threshold/__init__.py b/brainscore_vision/metrics/threshold/__init__.py new file mode 100644 index 000000000..69f6102e8 --- /dev/null +++ 
b/brainscore_vision/metrics/threshold/__init__.py
@@ -0,0 +1,5 @@
+from brainscore_vision import metric_registry
+from .metric import Threshold, ThresholdElevation
+
+metric_registry['threshold'] = Threshold
+metric_registry['threshold_elevation'] = ThresholdElevation
diff --git a/brainscore_vision/metrics/threshold/metric.py b/brainscore_vision/metrics/threshold/metric.py
new file mode 100644
index 000000000..2b04270e3
--- /dev/null
+++ b/brainscore_vision/metrics/threshold/metric.py
@@ -0,0 +1,482 @@
+from typing import Dict, Union, Tuple, List, Optional, Callable
+
+import numpy as np
+from scipy.optimize import minimize
+from scipy.stats import norm
+from sklearn.metrics import r2_score
+
+from brainscore_core.metrics import Metric, Score
+from brainio.assemblies import PropertyAssembly, BehavioralAssembly
+
+
+def psychometric_cum_gauss(x: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> np.array:
+    """
+    The classic psychometric function as implemented in Wichmann & Hill (2001), 'The psychometric function: I.
+    Fitting, sampling, and goodness of fit', eq. 1.
+
+    :param x: the independent variables of the data
+    :param alpha: the slope parameter
+    :param beta: the mean of the cdf parameter
+    :param lambda_: the lapse rate
+    :param gamma: the lower bound of the fit
+
+    :return: the psychometric function values for the given parameters evaluated at `x`.
+    """
+    return gamma + (1 - gamma - lambda_) * norm.cdf(alpha * (x - beta))
+
+
+def inverse_psychometric_cum_gauss(y: np.array, alpha: float, beta: float, lambda_: float, gamma: float = 0.5) -> np.array:
+    """The inverse of psychometric_cum_gauss."""
+    return beta + (norm.ppf((y - gamma) / (1 - gamma - lambda_)) / alpha)
+
+
+def cum_gauss_neg_log_likelihood(params: Tuple[float, ...], x: np.array, y: np.array) -> float:
+    """The negative log likelihood function for psychometric_cum_gauss."""
+    alpha, beta, lambda_ = params
+    p = psychometric_cum_gauss(x, alpha, beta, lambda_)
+    log_likelihood = y * np.log(p) + (1 - y) * np.log(1 - p)
+    return -np.sum(log_likelihood)
+
+
+def get_predicted(params: Tuple[float, ...], x: np.array, fit_fn: Callable) -> np.array:
+    """Returns the predicted values based on the model parameters."""
+    return fit_fn(x, *params)
+
+
+def grid_search(x: np.array,
+                y: np.array,
+                alpha_values: np.array = np.logspace(-3, 1, 50),
+                beta_values: np.array = None,
+                fit_fn: Callable = psychometric_cum_gauss,
+                fit_log_likelihood_fn: Callable = cum_gauss_neg_log_likelihood,
+                fit_bounds: Tuple = ((None, None), (None, None), (0.03, 0.5))
+                ) -> Tuple[Tuple[float, ...], float]:
+    """
+    A classic, simplified procedure for running a sparse grid search over the slope and mean parameters of the
+    psychometric function.
+    This function is implemented here instead of using sklearn.GridSearchCV since we would have to write a custom
+    sklearn estimator class to use GridSearchCV with psychometric functions, likely increasing code bloat
+    substantially.
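+
+    Example (illustrative; x holds measured values of the independent variable, y the accuracy at each x):
+
+        (alpha, beta, lambda_), r2 = grid_search(x=np.array([5., 10., 20., 40.]),
+                                                 y=np.array([0.55, 0.65, 0.80, 0.92]))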
+
+    :param x: the independent variables of the data
+    :param y: the measured accuracy rates for the given x-values
+    :param alpha_values: the alpha values for the chosen fit function to grid search over
+    :param beta_values: the beta values for the chosen fit function to grid search over
+    :param fit_fn: the psychometric function that is fit
+    :param fit_log_likelihood_fn: the log likelihood function that computes the log likelihood of its corresponding
+        fit function
+    :param fit_bounds: the bounds assigned to the fit function called by fit_log_likelihood_fn.
+        The default fit_bounds are assigned as:
+            alpha: (None, None), to allow any slope
+            beta: (None, None), any inflection point is allowed, as that is controlled for in the
+                  Threshold class
+            lambda_: (0.03, 0.5), to require at least a small lapse rate, as is regularly done when
+                  fitting human data
+
+    :return: the parameters of the best fit in the grid search
+    """
+    assert len(x) == len(y)
+    # Default the beta_values grid search to the measured x-points.
+    if beta_values is None:
+        beta_values = x
+
+    # initialize best values for a fit
+    best_alpha, best_beta, best_lambda = None, None, None
+    min_neg_log_likelihood = np.inf
+
+    for alpha_guess in alpha_values:
+        for beta_guess in beta_values:
+            initial_guess = np.array([alpha_guess, beta_guess, 1 - np.max(y)])  # lapse rate guess: one minus the maximum accuracy
+
+            # wrap in a try/except block to catch the RuntimeError thrown by scipy.minimize if a fit
+            # entirely fails. The case where all fits fail here is handled by the Threshold metric.
+            try:
+                result = minimize(fit_log_likelihood_fn, initial_guess, args=(x, y),
+                                  method='L-BFGS-B', bounds=fit_bounds)
+                alpha_hat, beta_hat, lambda_hat = result.x
+                neg_log_likelihood_hat = fit_log_likelihood_fn([alpha_hat, beta_hat, lambda_hat], x, y)
+
+                if neg_log_likelihood_hat < min_neg_log_likelihood:
+                    min_neg_log_likelihood = neg_log_likelihood_hat
+                    best_alpha, best_beta, best_lambda = alpha_hat, beta_hat, lambda_hat
+            except RuntimeError:
+                pass
+
+    if best_alpha is None:  # every fit in the grid search failed; signal failure via None params and -inf R^2
+        return (None, None, None), -np.inf
+    y_pred = fit_fn(x, best_alpha, best_beta, best_lambda)
+    r2 = r2_score(y, y_pred)  # R^2 of the fit
+    return (best_alpha, best_beta, best_lambda), r2
+
+
+class Threshold(Metric):
+    """
+    Computes a psychometric threshold function from model responses and compares against human-computed psychometric
+    thresholds.
+
+    The model comparison to human data is currently individual-subject based, i.e., a model (and the ceiling) is
+    scored as the mean distance of its threshold to each individual human threshold.
+    """
+    def __init__(self,
+                 independent_variable: str,
+                 fit_function=psychometric_cum_gauss,
+                 fit_inverse_function=inverse_psychometric_cum_gauss,
+                 threshold_accuracy: Union[str, float] = 'inflection',
+                 required_accuracy: Optional[float] = 0.6
+                 ):
+        """
+        :param independent_variable: The independent variable in the benchmark that the threshold is computed
+                                     over.
+        :param fit_function: The function used to fit the threshold.
+        :param fit_inverse_function: The inverse of fit_function used to find the threshold from the fit.
+        :param threshold_accuracy: The accuracy at which the threshold is evaluated. This can be
+                                   either a string Literal['inflection'] or a float. When Literal['inflection']
+                                   is used, the function finds the inflection point of the curve and evaluates
+                                   the threshold at that level. When a float is used, the function evaluates
+                                   the threshold at that level.
+        :param required_accuracy: The minimum accuracy required for the psychometric function fit to be considered.
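+
+        Example (illustrative; 'image_label' mirrors this metric's usage in the Malania2007 benchmark):
+
+            metric = Threshold(independent_variable='image_label', threshold_accuracy=0.75)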
+ """ + self.fit_function = fit_function + self.fit_inverse_function = fit_inverse_function + self._independent_variable = independent_variable + self.threshold_accuracy = threshold_accuracy + self.required_accuracy = required_accuracy + + def __call__(self, source: Union[BehavioralAssembly, float], target: Union[list, PropertyAssembly]) -> Score: + """ + :param source: Either a BehavioralAssembly containing model responses to individual stimuli, or a pre-computed + threshold as a float. + :param target: Either a list containing human thresholds (for the ceiling function & ThresholdElevation), + or a PropertyAsssembly. + :return: A Score containing the evaluated model's distance to target thresholds. + """ + # compute threshold from measurements if the input is not a threshold already + if isinstance(source, float): + source_threshold = source + elif isinstance(source, BehavioralAssembly): + source_threshold = self.compute_threshold(source, self._independent_variable) + # check whether the psychometric function fit was successful - if not, return a score of 0 + if source_threshold == 'fit_fail': + score = Score(0.) + score.attrs['error'] = 0. + return score + else: + raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') + return self.scoring_function(source_threshold, target) + + def ceiling(self, assembly: Union[PropertyAssembly, Dict[str, PropertyAssembly]]) -> Score: + """ + Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against + a pool of the other values. + + Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ + jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. + + :param assembly: the human PropertyAssembly containing human responses, or a dict containing the + PropertyAssemblies of the ThresholdElevation metric. 
+ :return: Score object with coords center (ceiling) and error (STD) + """ + human_thresholds: list = assembly.values.tolist() + scores = [] + for i in range(len(human_thresholds)): + random_state = np.random.RandomState(i) + random_human_score = random_state.choice(human_thresholds, replace=False) + metric = Threshold(self._independent_variable, self.fit_function, self.fit_inverse_function, + self.threshold_accuracy) + human_thresholds.remove(random_human_score) + score = metric(random_human_score, human_thresholds) + human_thresholds.append(random_human_score) + scores.append(score.values) + + ceiling, ceiling_error = np.mean(scores), np.std(scores) + ceiling = Score(ceiling) + ceiling.attrs['error'] = ceiling_error + return ceiling + + def compute_threshold(self, source: BehavioralAssembly, independent_variable: str) -> Union[float, str]: + """Converts the source BehavioralAssembly to a threshold float value.""" + assert len(source.values) == len(source[independent_variable].values) + + x_points = source[independent_variable].values + accuracies = self.convert_proba_to_correct(source) + if np.mean(accuracies) < self.required_accuracy: + print('Psychometric threshold fit failure due to low accuracy.') + fit_params = 'fit_fail' + else: + fit_params, measurement_max = self.fit_threshold_function(x_points, accuracies) + if (type(fit_params) == str) and (fit_params == 'fit_fail'): + return fit_params + + if self.threshold_accuracy == 'inflection': + self.threshold_accuracy = self.inflection_accuracy(x_points, fit_params) + + threshold = self.find_threshold(self.threshold_accuracy, fit_params) + + # check whether the fit is outside the measured model responses to discard spurious thresholds + if (threshold > measurement_max) or np.isnan(threshold): + print('Fit fail because threshold is outside of the measured range of responses.') + return 'fit_fail' + return threshold + + def fit_threshold_function(self, x_points: np.array, y_points: np.array) -> Union[np.array, str]: + """ + A function that takes the x and y-points of the measured variable and handles the fitting of the + psychometric threshold function. + + Returns either the fit parameters for self.fit_function or a string tag that indicates the failure + of the psychometric curve fit. + """ + x_points, y_points = self.aggregate_psychometric_fit_data(x_points, y_points) + aggregated_x_points, aggregated_y_points, at_least_third_remaining = self.remove_data_after_asymptote(x_points, + y_points) + measurement_max = np.max(aggregated_x_points) + if not at_least_third_remaining: + # This failure indicates that there is too little data to accurately fit the psychometric function. + print('Psychometric curve fit fail because performance is decreasing with the independent variable.') + return 'fit_fail', measurement_max + + params, r2 = grid_search(aggregated_x_points, aggregated_y_points) + + # if all the fits in the grid search failed, there will be a None value in params. In this case, we reject + # the fit. This typically only ever happens when a model outputs one value for all test images. + if None in params: + params = 'fit_fail' + + # remove fits to random data. This choice is preferred over a chi^2 test since chi^2 discards a lot of fits + # that would be acceptable in a human case. 
+ if r2 < 0.4: + print('Fit fail due to low fit R^2.') + params = 'fit_fail' + + return params, measurement_max + + def find_threshold(self, threshold_accuracy: float, fit_params: Tuple[float, ...]) -> float: + """ + A function that uses the inverse fit function to find the value of the threshold in terms of + the independent variable (self._independent_variable). + """ + threshold = self.fit_inverse_function(threshold_accuracy, *fit_params) + return threshold + + def inflection_accuracy(self, x_points: np.array, fit_params: np.array) -> float: + """ + A function that finds the accuracy at the inflection point of the fit function. Useful if you do not care + about the specific threshold accuracy, but rather about e.g. the elevation at the inflection point. + """ + max_fit_accuracy = self.fit_function(np.max(x_points), *fit_params) + min_fit_accuracy = self.fit_function(np.min(x_points), *fit_params) + threshold_accuracy = min_fit_accuracy + (max_fit_accuracy - min_fit_accuracy) / 2 + return threshold_accuracy + + @staticmethod + def aggregate_psychometric_fit_data(x_points, y_points): + unique_x = np.unique(x_points) + correct_rate = np.zeros(len(unique_x)) + + for i, x in enumerate(unique_x): + trials = np.sum(x_points == x) + correct_trials = np.sum((x_points == x) & (y_points == 1)) + correct_rate[i] = correct_trials / trials + + return unique_x, correct_rate + + @staticmethod + def scoring_function(source: float, target: Union[list, PropertyAssembly]) -> Score: + """ + Computes the average distance of the source from each of the individual targets in units of the + individual targets. This is generally a more stringent scoring method than pool_score, aimed + to measure the average of the individual target effects. + """ + raw_scores = [] + for target_value in target: + # This score = 0 when the source exceeds target_value by 100% + raw_score = max((1 - ((np.abs(target_value - source)) / target_value)), 0) + raw_scores.append(raw_score) + + scores_mean, scores_std = np.mean(raw_scores), np.std(raw_scores) + score = Score(scores_mean) + score.attrs['raw'] = raw_scores + score.attrs['error'] = scores_std + return score + + @staticmethod + def convert_proba_to_correct(source: BehavioralAssembly) -> np.array: + """Converts the probability values returned by models doing probability tasks to behavioral choices.""" + decisions = np.argmax(source.values, axis=1) + correct = [] + for presentation, decision in enumerate(decisions): + if source['choice'].values[decision] == source['image_label'].values[presentation]: + correct.append(1) + else: + correct.append(0) + return np.array(correct) + + @staticmethod + def remove_data_after_asymptote(x_values, y_values): + """ + A function that removes all data after the point at which all values of the measured variable are 1 standard + deviation less than the maximum. + + This is done to simulate the procedure in which an experimenter fine-tunes the stimuli in a pilot experiment + to the given system (e.g., humans) such that they only measure data in a region within which the psychometric + fit is monotone (as per the function fit assumption). When this assumption is violated, the function fit + is not a valid measure of the underlying performance function. + + There are circumstances in which this behavior is expected (e.g., crowding). When e.g. 
a vernier element's
+        offset is increased enough, the task may paradoxically become more difficult: the offset grows so large
+        that the relevant elements no longer fall within a spatially relevant window, or group more strongly with
+        the flankers than with each other due to the constant target-flanker distance.
+        """
+
+        std_dev = np.std(y_values)
+        max_y_idx = np.argmax(y_values)
+
+        # initialize the index for the first data point after the maximum y_value
+        # that deviates from the maximum by at least 1 standard deviation
+        index_to_remove = None
+
+        # iterate through the y_values after the maximum y_value
+        for idx, y in enumerate(y_values[max_y_idx + 1:], start=max_y_idx + 1):
+            # check if all the remaining y_values deviate by at least 1 standard deviation
+            if all([abs(val - y_values[max_y_idx]) >= std_dev for val in y_values[idx:]]):
+                index_to_remove = idx
+                break
+        pre_remove_length = len(y_values)
+        # if we found an index to remove, remove the data after that index
+        if index_to_remove is not None:
+            x_values = x_values[:index_to_remove]
+            y_values = y_values[:index_to_remove]
+
+        # check if at least a third of the elements remain. This is done so that we have an adequate amount of data
+        # to fit a psychometric threshold on.
+        remaining_fraction = len(y_values) / pre_remove_length
+        is_at_least_third_remaining = remaining_fraction >= 1 / 3
+
+        return x_values, y_values, is_at_least_third_remaining
+
+
+class ThresholdElevation(Threshold):
+    """
+    Computes a threshold elevation from two conditions, a baseline condition and a test condition, by dividing
+    the threshold of the test condition by the threshold of the baseline condition. In other words,
+
+    `threshold_elevation = test_condition_threshold / baseline_condition_threshold`.
+    """
+    def __init__(self,
+                 independent_variable: str,
+                 baseline_condition: str,
+                 test_condition: str,
+                 threshold_accuracy: Union[str, float] = 'inflection',
+                 required_baseline_accuracy: Optional[float] = 0.6,
+                 required_test_accuracy: Optional[float] = 0.6
+                 ):
+        """
+        :param independent_variable: The independent variable in the benchmark that the threshold is computed
+                                     over.
+        :param baseline_condition: The baseline condition against which threshold elevation is measured.
+        :param test_condition: The test condition that is used to measure threshold elevation.
+        :param threshold_accuracy: The accuracy at which the threshold is evaluated. This can be
+                                   either a string Literal['inflection'] or a float. When Literal['inflection']
+                                   is used, the function finds the inflection point of the curve and evaluates
+                                   the threshold at that level. When a float is used, the function evaluates
+                                   the threshold at that level.
+        :param required_baseline_accuracy: The minimum accuracy required for the psychometric function fit to be
+                                           considered for the baseline condition.
+        :param required_test_accuracy: The minimum accuracy required for the psychometric function fit to be
+                                       considered for the test condition.
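+
+        Example (illustrative; arguments mirror the Malania2007 benchmark's `load_metric` call):
+
+            metric = ThresholdElevation(independent_variable='image_label',
+                                        baseline_condition='vernier_only', test_condition='short2',
+                                        threshold_accuracy=0.75)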
+ """ + super(ThresholdElevation, self).__init__(independent_variable) + self.baseline_threshold_metric = Threshold(self._independent_variable, + threshold_accuracy=threshold_accuracy, + required_accuracy=required_baseline_accuracy) + self.test_threshold_metric = Threshold(self._independent_variable, + threshold_accuracy=threshold_accuracy, + required_accuracy=required_test_accuracy) + self.baseline_condition = baseline_condition + self.test_condition = test_condition + self.threshold_accuracy = threshold_accuracy + + def __call__(self, + source: Union[float, Dict[str, BehavioralAssembly]], + target: Union[list, PropertyAssembly, Dict[str, PropertyAssembly]] + ) -> Score: + """ + :param source: Either a dictionary containing the BehavioralAssemblies for the test condition and the + baseline condition, or a pre-computed float threshold elevation. If Dict, Dict + keys should be 'condition_assembly' and 'baseline_assembly' respectively. + :param target: Either a dictionary containing the PropertyAssemblies for the test condition and the + baseline condition, or a list of pre-computed threshold elevations. If Dict, Dict + keys should be 'condition_assembly' and 'baseline_assembly' respectively. + :return: A score containing the evaluated model's ceiling-adjusted distance to target threshold elevations. + """ + # check whether source is a threshold elevation already - if not, compute it. + if isinstance(source, float): + raw_source_threshold_elevation = source + elif isinstance(source, Dict): + source_baseline_threshold = self.baseline_threshold_metric.compute_threshold(source[self.baseline_condition], + self._independent_variable) + + # if using the inflection accuracy, get the inflection point from the baseline condition, and use that + # for the test condition. + if self.threshold_accuracy == 'inflection': + self.test_threshold_metric.threshold_accuracy = self.baseline_threshold_metric.threshold_accuracy + source_test_threshold = self.test_threshold_metric.compute_threshold(source[self.test_condition], + self._independent_variable) + if source_baseline_threshold == 'fit_fail' or source_test_threshold == 'fit_fail': + return Score(0.) # psychometric function could not be fit -- this typically means that the model is at chance throughout + raw_source_threshold_elevation = source_test_threshold / source_baseline_threshold + else: + raise TypeError(f'source is type {type(source)}, but type BehavioralAssembly or float is required.') + + # check whether the targets are threshold elevations already - if not, compute them + if isinstance(target, list): + target_threshold_elevations = target + elif isinstance(target, PropertyAssembly): + target_threshold_elevations = target.values.tolist() + elif isinstance(target, Dict): + target_threshold_elevations = self.compute_threshold_elevations(target) + else: + raise TypeError(f'target is type {type(target)}, but type PropertyAssembly or list is required.') + + # compare threshold elevation to target threshold elevations + return self.scoring_function(raw_source_threshold_elevation, target_threshold_elevations) + + def ceiling(self, assemblies: Dict[str, PropertyAssembly]) -> Score: + """ + Computed by one-vs all for each of the NUM_TRIALS human indexes. One index is removed, and scored against + a pool of the other values. + + Currently copied with modification from 'https://github.com/brain-score/brain-score/blob/ + jacob2020_occlusion_depth_ordering/brainscore/metrics/data_cloud_comparision.py#L54'. 
+ """ + human_threshold_elevations = self.compute_threshold_elevations(assemblies) + scores = [] + for i in range(len(human_threshold_elevations)): + random_state = np.random.RandomState(i) + random_human_score = random_state.choice(human_threshold_elevations, replace=False) + metric = ThresholdElevation(self._independent_variable, self.baseline_condition, self.test_condition, + self.threshold_accuracy) + human_threshold_elevations.remove(random_human_score) + score = metric(random_human_score, human_threshold_elevations) + human_threshold_elevations.append(random_human_score) + scores.append(score.values) + + ceiling, ceiling_error = np.mean(scores), np.std(scores) + ceiling = Score(ceiling) + ceiling.attrs['raw'] = scores + ceiling.attrs['error'] = ceiling_error + return ceiling + + @staticmethod + def compute_threshold_elevations(assemblies: Dict[str, PropertyAssembly]) -> List: + """ + Computes the threshold elevations of a baseline condition and a test condition: + + `threshold_elevation = test_condition_threshold / baseline_condition_threshold`. + """ + baseline_assembly = assemblies['baseline_assembly'] + condition_assembly = assemblies['condition_assembly'] + threshold_elevations = [] + for i, baseline_threshold in enumerate(baseline_assembly.values): + condition_threshold = condition_assembly.values[i] + threshold_elevations.append(condition_threshold / baseline_threshold) + return threshold_elevations diff --git a/brainscore_vision/metrics/threshold/test.py b/brainscore_vision/metrics/threshold/test.py new file mode 100644 index 000000000..84f0c7214 --- /dev/null +++ b/brainscore_vision/metrics/threshold/test.py @@ -0,0 +1,71 @@ +from pytest import approx + +from brainio.assemblies import PropertyAssembly +from brainscore_vision import load_metric + + +def test_threshold_score_from_thresholds(): + assembly = _make_threshold_data() + # independent_variable is not used since we compute from thresholds, and do not need to fit them + metric = load_metric('threshold', independent_variable='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert score == approx(0.5625) + + +def test_threshold_elevation_score_from_threshold_elevations(): + assembly = _make_threshold_elevation_data() + # independent_variable is not used since we compute from thresholds, and do not need to fit them + metric = load_metric('threshold_elevation', + independent_variable='placeholder', + baseline_condition='placeholder', + test_condition='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert score == approx(0.525) + + +def test_threshold_has_error(): + assembly = _make_threshold_data() + metric = load_metric('threshold', independent_variable='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'error') + + +def test_threshold_elevation_has_error(): + assembly = _make_threshold_elevation_data() + metric = load_metric('threshold_elevation', + independent_variable='placeholder', + baseline_condition='placeholder', + test_condition='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'error') + + +def test_threshold_has_raw(): + assembly = _make_threshold_data() + metric = load_metric('threshold', independent_variable='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'raw') + + +def test_threshold_elevation_has_raw(): + assembly = _make_threshold_elevation_data() + metric = 
load_metric('threshold_elevation', + independent_variable='placeholder', + baseline_condition='placeholder', + test_condition='placeholder') + score = metric(float(assembly.sel(subject='A').values), assembly) + assert hasattr(score, 'raw') + + +def _make_threshold_data(): + # Subjects have thresholds of 10, 20, 40, and 20 respectively. + return PropertyAssembly([10.0, 20.0, 40.0, 20.0], + coords={'subject': ('subject', ['A', 'B', 'C', 'D'])}, + dims=['subject']) + + +def _make_threshold_elevation_data(): + # Subjects have threshold elevations of 3, 2, 1.5, and 5 respectively. + return PropertyAssembly([3.0, 2.0, 1.5, 5.0], + coords={'subject': ('subject', ['A', 'B', 'C', 'D'])}, + dims=['subject'])
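+
+
+def test_threshold_from_synthetic_responses():
+    # Illustrative sketch rather than a packaged-data test: fits a threshold on synthetic
+    # BehavioralAssembly responses whose accuracy grows with 'offset' (a hypothetical
+    # independent variable). The assembly layout mirrors what Threshold.compute_threshold
+    # expects: presentation x choice probabilities with 'image_label' and 'choice' coordinates.
+    import numpy as np
+    from brainio.assemblies import BehavioralAssembly
+    rng = np.random.RandomState(0)
+    offsets = np.repeat(np.linspace(5., 50., 10), 50)           # independent variable per presentation
+    labels = rng.choice(['left', 'right'], size=len(offsets))   # ground-truth labels
+    p_correct = 0.5 + 0.45 * offsets / offsets.max()            # accuracy increases with offset
+    correct = rng.rand(len(offsets)) < p_correct
+    p_left = np.where((labels == 'left') == correct, 0.9, 0.1)  # argmax over choices reproduces `correct`
+    assembly = BehavioralAssembly(np.stack([p_left, 1 - p_left], axis=1),
+                                  coords={'offset': ('presentation', offsets),
+                                          'image_label': ('presentation', labels),
+                                          'choice': ('choice', ['left', 'right'])},
+                                  dims=['presentation', 'choice'])
+    metric = load_metric('threshold', independent_variable='offset', threshold_accuracy=0.75)
+    threshold = metric.compute_threshold(assembly, 'offset')
+    assert threshold != 'fit_fail'  # with this synthetic data, the psychometric fit should succeed
+    assert offsets.min() <= threshold <= offsets.max()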