diff --git a/brainscore_language/benchmarks/futrell2018/benchmark.py b/brainscore_language/benchmarks/futrell2018/benchmark.py
index 6deec995..6c5c0a49 100644
--- a/brainscore_language/benchmarks/futrell2018/benchmark.py
+++ b/brainscore_language/benchmarks/futrell2018/benchmark.py
@@ -25,27 +25,30 @@ class Futrell2018Pearsonr(BenchmarkBase):
     """

     def __init__(self):
-        self.data = load_dataset('Futrell2018')
-        self.metric = load_metric('pearsonr')
-        ceiler = SplitHalvesConsistency(num_splits=10, split_coordinate='subject_id', consistency_metric=self.metric)
+        self.data = load_dataset("Futrell2018")
+        self.metric = load_metric("pearsonr")
+        ceiler = SplitHalvesConsistency(
+            num_splits=10, split_coordinate="subject_id", consistency_metric=self.metric
+        )
         ceiling = ceiler(self.data)
         super(Futrell2018Pearsonr, self).__init__(
-            identifier='Futrell2018-pearsonr',
-            version=1,
-            parent='behavior',
+            identifier="Futrell2018-pearsonr",
             ceiling=ceiling,
-            bibtex=self.data.bibtex)
+            version=1,
+            parent="behavior",
+            bibtex=self.data.bibtex,
+        )

     def __call__(self, candidate: ArtificialSubject) -> Score:
         # run experiment
         candidate.start_behavioral_task(ArtificialSubject.Task.reading_times)
-        stimuli = self.data['word'].values
-        predictions = candidate.digest_text(stimuli)['behavior']
-        attach_presentation_meta(predictions, self.data['presentation'])
+        stimuli = self.data["word"].values
+        predictions = candidate.digest_text(stimuli)["behavior"]
+        attach_presentation_meta(predictions, self.data["presentation"])
         # exclude first words
-        predictions = predictions[predictions['word_within_sentence_id'] != 1]
-        targets = self.data[self.data['word_within_sentence_id'] != 1]
-        targets = targets.mean('subject')  # compare to "average human"
+        predictions = predictions[predictions["word_within_sentence_id"] != 1]
+        targets = self.data[self.data["word_within_sentence_id"] != 1]
+        targets = targets.mean("subject")  # compare to "average human"
         # score
         raw_score = self.metric(predictions, targets)
         score = ceiling_normalize(raw_score, self.ceiling)
@@ -56,7 +59,9 @@ class SplitHalvesConsistency:
     # following
     # https://github.com/brain-score/brain-score/blob/c51b8aa2c94212a9ac56c06c556afad0bb0a3521/brainscore/metrics/ceiling.py#L25-L96

-    def __init__(self, num_splits: int, split_coordinate: str, consistency_metric: Metric):
+    def __init__(
+        self, num_splits: int, split_coordinate: str, consistency_metric: Metric
+    ):
         """
         :param num_splits: how many times to create two halves
         :param split_coordinate: over which coordinate to split the assembly into halves
@@ -73,18 +78,30 @@ def __call__(self, assembly: DataAssembly) -> Score:
         consistencies, uncorrected_consistencies = [], []
         splits = range(self.num_splits)
         for _ in splits:
-            half1_values = random_state.choice(split_values, size=len(split_values) // 2, replace=False)
-            half2_values = set(split_values) - set(half1_values)  # this only works because of `replace=False` above
-            half1 = assembly[{split_dim: [value in half1_values for value in split_values]}].mean(split_dim)
-            half2 = assembly[{split_dim: [value in half2_values for value in split_values]}].mean(split_dim)
+            half1_values = random_state.choice(
+                split_values, size=len(split_values) // 2, replace=False
+            )
+            half2_values = set(split_values) - set(
+                half1_values
+            )  # this only works because of `replace=False` above
+            half1 = assembly[
+                {split_dim: [value in half1_values for value in split_values]}
+            ].mean(split_dim)
+            half2 = assembly[
+                {split_dim: [value in half2_values for value in split_values]}
+            ].mean(split_dim)
             consistency = self.consistency_metric(half1, half2)
             uncorrected_consistencies.append(consistency)
             # Spearman-Brown correction for sub-sampling
             corrected_consistency = 2 * consistency / (1 + (2 - 1) * consistency)
             consistencies.append(corrected_consistency)
-        consistencies = Score(consistencies, coords={'split': splits}, dims=['split'])
-        uncorrected_consistencies = Score(uncorrected_consistencies, coords={'split': splits}, dims=['split'])
-        average_consistency = consistencies.median('split')
-        average_consistency.attrs['raw'] = consistencies
-        average_consistency.attrs['uncorrected_consistencies'] = uncorrected_consistencies
+        consistencies = Score(consistencies, coords={"split": splits}, dims=["split"])
+        uncorrected_consistencies = Score(
+            uncorrected_consistencies, coords={"split": splits}, dims=["split"]
+        )
+        average_consistency = consistencies.median("split")
+        average_consistency.attrs["raw"] = consistencies
+        average_consistency.attrs[
+            "uncorrected_consistencies"
+        ] = uncorrected_consistencies
         return average_consistency
diff --git a/brainscore_language/benchmarks/pereira2018/benchmark.py b/brainscore_language/benchmarks/pereira2018/benchmark.py
index 925f3880..9165cf5f 100644
--- a/brainscore_language/benchmarks/pereira2018/benchmark.py
+++ b/brainscore_language/benchmarks/pereira2018/benchmark.py
@@ -25,33 +25,39 @@
 def Pereira2018_243sentences():
-    return _Pereira2018ExperimentLinear(experiment='243sentences', ceiling_s3_kwargs=dict(
-        version_id='CHl_9aFHIWVnPW_njePfy28yzggKuUPw',
-        sha1='5e23de899883828f9c886aec304bc5aa0f58f66c',
-        raw_kwargs=dict(
-            version_id='uZye03ENmn.vKB5mARUGhcIY_DjShtPD',
-            sha1='525a6ac8c14ad826c63fdd71faeefb8ba542d5ac',
+    return _Pereira2018ExperimentLinear(
+        experiment="243sentences",
+        ceiling_s3_kwargs=dict(
+            version_id="CHl_9aFHIWVnPW_njePfy28yzggKuUPw",
+            sha1="5e23de899883828f9c886aec304bc5aa0f58f66c",
             raw_kwargs=dict(
-                version_id='XVTo58Po5YrNjTuDIWrmfHI0nbN2MVZa',
-                sha1='34ba453dc7e8a19aed18cc9bca160e97b4a80be5'
-            )
-        )
-    ))
+                version_id="uZye03ENmn.vKB5mARUGhcIY_DjShtPD",
+                sha1="525a6ac8c14ad826c63fdd71faeefb8ba542d5ac",
+                raw_kwargs=dict(
+                    version_id="XVTo58Po5YrNjTuDIWrmfHI0nbN2MVZa",
+                    sha1="34ba453dc7e8a19aed18cc9bca160e97b4a80be5",
+                ),
+            ),
+        ),
+    )


 def Pereira2018_384sentences():
-    return _Pereira2018ExperimentLinear(experiment='384sentences', ceiling_s3_kwargs=dict(
-        version_id='sjlnXr5wXUoGv6exoWu06C4kYI0KpZLk',
-        sha1='fc895adc52fd79cea3040961d65d8f736a9d3e29',
-        raw_kwargs=dict(
-            version_id='Hi74r9UKfpK0h0Bjf5DL.JgflGoaknrA',
-            sha1='ce2044a7713426870a44131a99bfc63d8843dae0',
+    return _Pereira2018ExperimentLinear(
+        experiment="384sentences",
+        ceiling_s3_kwargs=dict(
+            version_id="sjlnXr5wXUoGv6exoWu06C4kYI0KpZLk",
+            sha1="fc895adc52fd79cea3040961d65d8f736a9d3e29",
             raw_kwargs=dict(
-                version_id='m4dq_ouKWZkYtdyNPMSP0p6rqb7wcYpi',
-                sha1='fe9fb24b34fd5602e18e34006ac5ccc7d4c825b8'
-            )
-        )
-    ))
+                version_id="Hi74r9UKfpK0h0Bjf5DL.JgflGoaknrA",
+                sha1="ce2044a7713426870a44131a99bfc63d8843dae0",
+                raw_kwargs=dict(
+                    version_id="m4dq_ouKWZkYtdyNPMSP0p6rqb7wcYpi",
+                    sha1="fe9fb24b34fd5602e18e34006ac5ccc7d4c825b8",
+                ),
+            ),
+        ),
+    )


 class _Pereira2018ExperimentLinear(BenchmarkBase):
@@ -73,43 +79,74 @@ class _Pereira2018ExperimentLinear(BenchmarkBase):

     def __init__(self, experiment: str, ceiling_s3_kwargs: dict):
         self.data = self._load_data(experiment)
-        self.metric = load_metric('linear_pearsonr')
-        identifier = f'Pereira2018.{experiment}-linear'
+        self.metric = load_metric("linear_pearsonr")
+        identifier = f"Pereira2018.{experiment}-linear"
         ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs)
         super(_Pereira2018ExperimentLinear, self).__init__(
             identifier=identifier,
-            version=1,
-            parent='Pereira2018-linear',
             ceiling=ceiling,
-            bibtex=BIBTEX)
+            version=1,
+            parent="Pereira2018-linear",
+            bibtex=BIBTEX,
+        )

     def _load_data(self, experiment: str) -> NeuroidAssembly:
-        data = load_dataset('Pereira2018.language')
+        data = load_dataset("Pereira2018.language")
         data = data.sel(experiment=experiment)  # filter experiment
-        data = data.dropna('neuroid')  # not all subjects have done both experiments, drop those that haven't
-        data.attrs['identifier'] = f"{data.identifier}.{experiment}"
+        data = data.dropna(
+            "neuroid"
+        )  # not all subjects have done both experiments, drop those that haven't
+        data.attrs["identifier"] = f"{data.identifier}.{experiment}"
         return data

-    def _load_ceiling(self, identifier: str, version_id: str, sha1: str, assembly_prefix="ceiling_", raw_kwargs=None):
-        ceiling = load_from_s3(identifier, cls=Score, assembly_prefix=assembly_prefix, version_id=version_id, sha1=sha1)
+    def _load_ceiling(
+        self,
+        identifier: str,
+        version_id: str,
+        sha1: str,
+        assembly_prefix="ceiling_",
+        raw_kwargs=None,
+    ):
+        ceiling = load_from_s3(
+            identifier,
+            cls=Score,
+            assembly_prefix=assembly_prefix,
+            version_id=version_id,
+            sha1=sha1,
+        )
         if raw_kwargs:  # recursively load raw attributes
-            raw = self._load_ceiling(identifier=identifier, assembly_prefix=assembly_prefix + "raw_", **raw_kwargs)
-            ceiling.attrs['raw'] = raw
+            raw = self._load_ceiling(
+                identifier=identifier,
+                assembly_prefix=assembly_prefix + "raw_",
+                **raw_kwargs,
+            )
+            ceiling.attrs["raw"] = raw
         return ceiling

     def __call__(self, candidate: ArtificialSubject) -> Score:
-        candidate.start_neural_recording(recording_target=ArtificialSubject.RecordingTarget.language_system,
-                                         recording_type=ArtificialSubject.RecordingType.fMRI)
-        stimuli = self.data['stimulus']
-        passages = self.data['passage_label'].values
+        candidate.start_neural_recording(
+            recording_target=ArtificialSubject.RecordingTarget.language_system,
+            recording_type=ArtificialSubject.RecordingType.fMRI,
+        )
+        stimuli = self.data["stimulus"]
+        passages = self.data["passage_label"].values
         predictions = []
-        for passage in sorted(set(passages)):  # go over individual passages, sorting to keep consistency across runs
-            passage_indexer = [stimulus_passage == passage for stimulus_passage in passages]
+        for passage in sorted(
+            set(passages)
+        ):  # go over individual passages, sorting to keep consistency across runs
+            passage_indexer = [
+                stimulus_passage == passage for stimulus_passage in passages
+            ]
             passage_stimuli = stimuli[passage_indexer]
-            passage_predictions = candidate.digest_text(passage_stimuli.values)['neural']
-            passage_predictions['stimulus_id'] = 'presentation', passage_stimuli['stimulus_id'].values
+            passage_predictions = candidate.digest_text(passage_stimuli.values)[
+                "neural"
+            ]
+            passage_predictions["stimulus_id"] = (
+                "presentation",
+                passage_stimuli["stimulus_id"].values,
+            )
             predictions.append(passage_predictions)
-        predictions = xr.concat(predictions, dim='presentation')
+        predictions = xr.concat(predictions, dim="presentation")
         raw_score = self.metric(predictions, self.data)
         score = ceiling_normalize(raw_score, self.ceiling)
         return score
diff --git a/brainscore_language/benchmarks/pereira2018_v2022/__init__.py b/brainscore_language/benchmarks/pereira2018_v2022/__init__.py
new file mode 100644
index 00000000..aa4556f2
--- /dev/null
+++ b/brainscore_language/benchmarks/pereira2018_v2022/__init__.py
@@ -0,0 +1,5 @@
+from .benchmark import Pereira2018_243sentences, Pereira2018_384sentences
+from brainscore_language import benchmark_registry
+
+benchmark_registry["Pereira2018_v2022.243sentences-linreg_pearsonr"] = Pereira2018_243sentences
+benchmark_registry["Pereira2018_v2022.384sentences-linreg_pearsonr"] = Pereira2018_384sentences
diff --git a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
new file mode 100644
index 00000000..59cef365
--- /dev/null
+++ b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
@@ -0,0 +1,157 @@
+from brainio.assemblies import NeuroidAssembly
+from brainscore_core.benchmarks import BenchmarkBase
+from brainscore_core.metrics import Score
+from brainscore_language.artificial_subject import ArtificialSubject
+from brainscore_language.utils.ceiling import ceiling_normalize
+from brainscore_language.utils.s3 import load_from_s3
+from brainscore_language import load_dataset, load_metric
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+BIBTEX = """@article{pereira2018toward,
+    title={Toward a universal decoder of linguistic meaning from brain activation},
+    author={Pereira, Francisco and Lou, Bin and Pritchett, Brianna and Ritter, Samuel and Gershman, Samuel J
+        and Kanwisher, Nancy and Botvinick, Matthew and Fedorenko, Evelina},
+    journal={Nature communications},
+    volume={9},
+    number={1},
+    pages={1--13},
+    year={2018},
+    publisher={Nature Publishing Group}
+}"""
+
+
+def Pereira2018_243sentences():
+    return _Pereira2018LinregPearsonr(
+        experiment="PereiraE3_72pass",
+        ceiling_s3_kwargs=dict(
+            sha1="f74a68b74568e44257778effc40c56d223cc5219",
+            version_id="g1oQg1J.HBuG2WwD6fwjNsjhMvsbQpjA",
+            raw_kwargs=dict(
+                sha1="e64c0c8e73169340f30f7135d1f2f4f1275cb2ad",
+                version_id="8NMVc498YgYz7TZiocfeueCB5uxBt42G",
+                raw_kwargs=dict(
+                    sha1="5b97b28406e18ea345a254d15562f4d271c6dc59",
+                    version_id="cNV3rCdVCAu2FjbQyoNGtFZNBfkh0q5d",
+                ),
+            ),
+        ),
+    )
+
+
+def Pereira2018_384sentences():
+    return _Pereira2018LinregPearsonr(
+        experiment="PereiraE2_96pass",
+        ceiling_s3_kwargs=dict(
+            sha1="d4b55d2d7b44a27d45c375b938584b6b2c9b2265",
+            version_id="U_kebareV3iUmbUwg4P1MVJJdj_ImpcP",
+            raw_kwargs=dict(
+                sha1="e64c0c8e73169340f30f7135d1f2f4f1275cb2ad",
+                version_id="yKvtiG3W5FUrzRvi7nXdk7tH2x4C0H_F",
+                raw_kwargs=dict(
+                    sha1="5b97b28406e18ea345a254d15562f4d271c6dc59",
+                    version_id="2lHr8sfxTfZ25j88on1Wve2mmPnk3Cfp",
+                ),
+            ),
+        ),
+    )
+
+
+class _Pereira2018LinregPearsonr(BenchmarkBase):
+    """
+    Evaluate model ability to predict neural activity in the human language system in response to natural sentences,
+    recorded by Pereira et al. 2018.
+    Alignment of neural activity between model and human subjects is evaluated via cross-validated linear predictivity.
+
+    This benchmark builds off the behavioral benchmark introduced
+    in Schrimpf et al. 2021 (https://www.pnas.org/doi/10.1073/pnas.2105646118), but:
+
+    * computes neural alignment to each of the two experiments ({243,384}sentences) separately, as well as ceilings
+    * requires the model to have committed to neural readouts (e.g. layer 41 corresponds to the language system),
+      rather than testing every layer separately
+
+    Each of these benchmarks evaluates one of the two experiments, the overall Pereira2018-linear score is the mean of
+    the two ceiling-normalized scores.
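+
+    A minimal usage sketch (illustrative only, not part of the benchmark logic; `model` stands in for any
+    candidate wrapped as an `ArtificialSubject` that supports `perform_neural_recording` and `digest_text`):
+
+        from brainscore_language import load_benchmark
+
+        benchmark = load_benchmark("Pereira2018_v2022.243sentences-linreg_pearsonr")
+        score = benchmark(model)  # Score against the Pereira2018 fMRI data, see __call__ below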
+ """ + + def __init__( + self, + experiment: str, + ceiling_s3_kwargs: dict, + ): + self.data = self._load_data(experiment) + self.metric = load_metric( + "linear_pearsonr", + crossvalidation_kwargs={ + "split_coord": "stimuli", + }, + regression_kwargs={ + "stimulus_coord": "stimuli", + }, + correlation_kwargs={ + "correlation_coord": "stimuli", + "neuroid_coord": "neuroid", + }, + ) + identifier = f"Pereira2018_v2022.{experiment}-linreg_pearsonr" + ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs) + super(_Pereira2018LinregPearsonr, self).__init__( + identifier=identifier, + ceiling=ceiling, + version=1, + parent="Pereira2018-linreg_pearsonr", + bibtex=BIBTEX, + ) + + def _load_data(self, experiment: str) -> NeuroidAssembly: + data = load_dataset("Pereira2018_v2022.language") + # data = data.sel(experiment=experiment) # filter experiment + data = data.loc[data.experiment == experiment, :, :] + data = data.dropna( + "neuroid" + ) # not all subjects have done both experiments, drop those that haven't + data.attrs["identifier"] = f"{data.identifier}.{experiment}" + if "time" in data.dims: + data = data.drop("time").squeeze("time") + return data + + def _load_ceiling( + self, + identifier: str, + version_id: str, + sha1: str, + assembly_prefix="ceiling_", + raw_kwargs=None, + ): + ceiling = load_from_s3( + identifier, + cls=Score, + assembly_prefix=assembly_prefix, + version_id=version_id, + sha1=sha1, + ) + if raw_kwargs: # recursively load raw attributes + raw = self._load_ceiling( + identifier=identifier, + assembly_prefix=assembly_prefix + "raw_", + **raw_kwargs, + ) + ceiling.attrs["raw"] = raw + return ceiling + + def __call__(self, candidate: ArtificialSubject) -> Score: + candidate.perform_neural_recording( + recording_target=ArtificialSubject.RecordingTarget.language_system, + recording_type=ArtificialSubject.RecordingType.fMRI, + ) + stimuli = self.data["stimuli"] + predictions = candidate.digest_text(stimuli.values)["neural"] + predictions["presentation"] = "presentation", stimuli["stimuli"].values + raw_score = self.metric( + predictions, + self.data, + ) + score = raw_score # ceiling_normalize(raw_score, self.ceiling) + return score diff --git a/brainscore_language/benchmarks/pereira2018_v2022/ceiling_packaging.py b/brainscore_language/benchmarks/pereira2018_v2022/ceiling_packaging.py new file mode 100644 index 00000000..27db0421 --- /dev/null +++ b/brainscore_language/benchmarks/pereira2018_v2022/ceiling_packaging.py @@ -0,0 +1,405 @@ +import logging +import sys +import warnings +from collections import defaultdict + +import numpy as np +from numpy import AxisError +from numpy.random import RandomState +from result_caching import store +from scipy.optimize import curve_fit +from tqdm import tqdm, trange + +from brainio.assemblies import ( + DataAssembly, + array_is_element, + walk_coords, + merge_data_arrays, +) +from brainscore_core.metrics import Score +from brainscore_language import load_benchmark +from brainscore_language.utils import fullname +from brainscore_language.utils.s3 import upload_data_assembly +from brainscore_language.utils.transformations import apply_aggregate + +_logger = logging.getLogger(__name__) + +""" +Because ceiling estimations for Pereira2018-linear benchmarks are very costly to run (many hours), we pre-compute them, +and store the resulting ceiling (including per-neuroid estimates) on S3. +The code in this file was run only once, and is kept here for reference. 
+""" + + +def upload_ceiling(experiment): + benchmark = load_benchmark(f"Pereira2018_v2022.{experiment}-linreg_pearsonr") + ceiler = ExtrapolationCeiling() + ceiling = ceiler(benchmark.data, metric=benchmark.metric) + _logger.info(f"Uploading ceiling {ceiling}") + # Note that because we cannot serialize complex objects to netCDF, attributes like 'raw' and 'bootstrapped_params' + # will get lost during upload + upload_data_assembly( + ceiling, assembly_identifier=benchmark.identifier, assembly_prefix="ceiling_" + ) + # also upload raw (and raw.raw) attributes + upload_data_assembly( + ceiling.raw, + assembly_identifier=benchmark.identifier, + assembly_prefix="ceiling_raw_", + ) + upload_data_assembly( + ceiling.raw.raw, + assembly_identifier=benchmark.identifier, + assembly_prefix="ceiling_raw_raw_", + ) + + +def v(x, v0, tau0): # function for ceiling extrapolation + return v0 * (1 - np.exp(-x / tau0)) + + +class HoldoutSubjectCeiling: + def __init__(self, subject_column): + self.subject_column = subject_column + self._logger = logging.getLogger(fullname(self)) + self._rng = RandomState(0) + self._num_bootstraps = 5 + + def __call__(self, assembly, metric): + subjects = set(assembly[self.subject_column].values) + scores = [] + iterate_subjects = self._rng.choice( + list(subjects), size=self._num_bootstraps + ) # use only a subset of subjects + for subject in tqdm(iterate_subjects, desc="heldout subject"): + try: + subject_assembly = assembly[ + { + "neuroid": [ + subject_value == subject + for subject_value in assembly[self.subject_column].values + ] + } + ] + # run subject pool as neural candidate + subject_pool = subjects - {subject} + pool_assembly = assembly[ + { + "neuroid": [ + subject in subject_pool + for subject in assembly[self.subject_column].values + ] + } + ] + score = metric(pool_assembly, subject_assembly) + # store scores + apply_raw = "raw" in score.attrs and not hasattr( + score.raw, self.subject_column + ) # only propagate if column not part of score + score = score.expand_dims(self.subject_column, _apply_raw=apply_raw) + score.__setitem__(self.subject_column, [subject], _apply_raw=apply_raw) + scores.append(score) + except NoOverlapException as e: + self._logger.debug(f"Ignoring no overlap {e}") + continue # ignore + except ValueError as e: + if "Found array with" in str(e): + self._logger.debug(f"Ignoring empty array {e}") + continue + else: + raise e + + scores = Score.merge(*scores) + scores = apply_aggregate( + lambda scores: scores.mean(self.subject_column), scores + ) + return scores + + +class ExtrapolationCeiling: + def __init__( + self, + subject_column="subject", + extrapolation_dimension="neuroid", + num_bootstraps=100, + ): + self._logger = logging.getLogger(fullname(self)) + self.subject_column = subject_column + self.extrapolation_dimension = extrapolation_dimension + self.num_bootstraps = num_bootstraps + self.num_subsamples = 10 + self.holdout_ceiling = HoldoutSubjectCeiling(subject_column=subject_column) + self._rng = RandomState(0) + + def __call__(self, assembly, metric): + scores = self.collect( + identifier=assembly.identifier, assembly=assembly, metric=metric + ) + return self.extrapolate(identifier=assembly.identifier, ceilings=scores) + + @store(identifier_ignore=["assembly", "metric"]) + def collect(self, identifier, assembly, metric): + self._logger.debug("Collecting data for extrapolation") + subjects = set(assembly[self.subject_column].values) + subject_subsamples = tuple(range(2, len(subjects) + 1)) + scores = [] + for num_subjects in 
tqdm(subject_subsamples, desc="num subjects"): + for sub_subjects in self._random_combinations( + subjects=set(assembly[self.subject_column].values), + num_subjects=num_subjects, + choice=self.num_subsamples, + rng=self._rng, + ): + sub_assembly = assembly[ + { + "neuroid": [ + subject in sub_subjects + for subject in assembly[self.subject_column].values + ] + } + ] + selections = {self.subject_column: sub_subjects} + try: + score = self.holdout_ceiling(assembly=sub_assembly, metric=metric) + score = score.expand_dims("num_subjects") + score["num_subjects"] = [num_subjects] + for key, selection in selections.items(): + expand_dim = f"sub_{key}" + score = score.expand_dims(expand_dim) + score[expand_dim] = [str(selection)] + scores.append(score.raw) + except KeyError as e: # nothing to merge + if str(e) == "'z'": + self._logger.debug(f"Ignoring merge error {e}") + continue + else: + raise e + scores = Score.merge(*scores) + return scores + + def _random_combinations(self, subjects, num_subjects, choice, rng): + # following https://stackoverflow.com/a/55929159/2225200. Also see similar method in `behavioral.py`. + subjects = np.array(list(subjects)) + combinations = set() + while len(combinations) < choice: + elements = rng.choice(subjects, size=num_subjects, replace=False) + combinations.add(tuple(elements)) + return combinations + + def extrapolate(self, identifier, ceilings): + neuroid_ceilings = [] + raw_keys = ["bootstrapped_params", "error_low", "error_high", "endpoint_x"] + raw_attrs = defaultdict(list) + for i in trange( + len(ceilings[self.extrapolation_dimension]), + desc=f"{self.extrapolation_dimension} extrapolations", + ): + try: + # extrapolate per-neuroid ceiling + neuroid_ceiling = ceilings.isel(**{self.extrapolation_dimension: [i]}) + extrapolated_ceiling = self.extrapolate_neuroid( + neuroid_ceiling.squeeze() + ) + extrapolated_ceiling = self.add_neuroid_meta( + extrapolated_ceiling, neuroid_ceiling + ) + neuroid_ceilings.append(extrapolated_ceiling) + # keep track of raw attributes + for key in raw_keys: + values = extrapolated_ceiling.attrs[key] + values = self.add_neuroid_meta(values, neuroid_ceiling) + raw_attrs[key].append(values) + except AxisError: # no extrapolation successful (happens for 1 neuroid in Pereira) + _logger.warning( + f"Failed to extrapolate neuroid ceiling {i}", exc_info=True + ) + continue + + # merge and add meta + self._logger.debug("Merging neuroid ceilings") + neuroid_ceilings = manual_merge( + *neuroid_ceilings, on=self.extrapolation_dimension + ) + neuroid_ceilings.attrs["raw"] = ceilings + + for key, values in raw_attrs.items(): + self._logger.debug(f"Merging {key}") + values = manual_merge(*values, on=self.extrapolation_dimension) + neuroid_ceilings.attrs[key] = values + # aggregate + ceiling = self.aggregate_neuroid_ceilings(neuroid_ceilings, raw_keys=raw_keys) + ceiling.attrs["identifier"] = identifier + return ceiling + + def add_neuroid_meta(self, target, source): + target = target.expand_dims(self.extrapolation_dimension) + for coord, dims, values in walk_coords(source): + if array_is_element(dims, self.extrapolation_dimension): + target[coord] = dims, values + return target + + def aggregate_neuroid_ceilings(self, neuroid_ceilings, raw_keys): + ceiling = neuroid_ceilings.median(self.extrapolation_dimension) + ceiling.attrs["raw"] = neuroid_ceilings + for key in raw_keys: + values = neuroid_ceilings.attrs[key] + aggregate = values.median(self.extrapolation_dimension) + if not aggregate.shape: # scalar value, e.g. 
for error_low + aggregate = aggregate.item() + ceiling.attrs[key] = aggregate + return ceiling + + def extrapolate_neuroid(self, ceilings): + # figure out how many extrapolation x points we have. E.g. for Pereira, not all combinations are possible + subject_subsamples = list(sorted(set(ceilings["num_subjects"].values))) + rng = RandomState(0) + bootstrap_params = [] + for bootstrap in range(self.num_bootstraps): + bootstrapped_scores = [] + for num_subjects in subject_subsamples: + num_scores = ceilings.sel(num_subjects=num_subjects) + # the sub_subjects dimension creates nans, get rid of those + num_scores = num_scores.dropna(f"sub_{self.subject_column}") + assert set(num_scores.dims) == { + f"sub_{self.subject_column}", + "split", + } or set(num_scores.dims) == {f"sub_{self.subject_column}"} + # choose from subject subsets and the splits therein, with replacement for variance + choices = num_scores.values.flatten() + bootstrapped_score = rng.choice( + choices, size=len(choices), replace=True + ) + bootstrapped_scores.append(np.mean(bootstrapped_score)) + + try: + params = self.fit(subject_subsamples, bootstrapped_scores) + except RuntimeError: # optimal parameters not found + params = [np.nan, np.nan] + params = DataAssembly( + [params], + coords={"bootstrap": [bootstrap], "param": ["v0", "tau0"]}, + dims=["bootstrap", "param"], + ) + bootstrap_params.append(params) + bootstrap_params = merge_data_arrays(bootstrap_params) + # find endpoint and error + asymptote_threshold = 0.0005 + interpolation_xs = np.arange(1000) + ys = np.array( + [ + v(interpolation_xs, *params) + for params in bootstrap_params.values + if not np.isnan(params).any() + ] + ) + median_ys = np.median(ys, axis=0) + diffs = np.diff(median_ys) + end_x = np.where(diffs < asymptote_threshold)[ + 0 + ].min() # first x where increase smaller than threshold + # put together + center = np.median(np.array(bootstrap_params)[:, 0]) + error_low, error_high = ci_error(ys[:, end_x], center=center) + score = Score(center) + score.attrs["raw"] = ceilings + score.attrs["error_low"] = DataAssembly(error_low) + score.attrs["error_high"] = DataAssembly(error_high) + score.attrs["bootstrapped_params"] = bootstrap_params + score.attrs["endpoint_x"] = DataAssembly(end_x) + return score + + def fit(self, subject_subsamples, bootstrapped_scores): + valid = ~np.isnan(bootstrapped_scores) + if sum(valid) < 1: + raise RuntimeError("No valid scores in sample") + params, pcov = curve_fit( + v, + subject_subsamples, + bootstrapped_scores, + # v (i.e. 
max ceiling) is between 0 and 1, tau0 unconstrained + bounds=([0, -np.inf], [1, np.inf]), + ) + return params + + +def manual_merge(*elements, on="neuroid"): + dims = elements[0].dims + assert all(element.dims == dims for element in elements[1:]) + merge_index = dims.index(on) + # the coordinates in the merge index should have the same keys + assert _coords_match( + elements, dim=on, match_values=False + ), f"coords in {[element[on] for element in elements]} do not match" + # all other dimensions, their coordinates and values should already align + for dim in set(dims) - {on}: + assert _coords_match( + elements, dim=dim, match_values=True + ), f"coords in {[element[dim] for element in elements]} do not match" + # merge values without meta + merged_values = np.concatenate( + [element.values for element in elements], axis=merge_index + ) + # piece together with meta + result = type(elements[0])( + merged_values, + coords={ + **{ + coord: (dims, values) + for coord, dims, values in walk_coords(elements[0]) + if not array_is_element(dims, on) + }, + **{ + coord: ( + dims, + np.concatenate([element[coord].values for element in elements]), + ) + for coord, dims, _ in walk_coords(elements[0]) + if array_is_element(dims, on) + }, + }, + dims=elements[0].dims, + ) + return result + + +def _coords_match(elements, dim, match_values=False): + first_coords = [ + (key, tuple(value)) if match_values else key + for _, key, value in walk_coords(elements[0][dim]) + ] + other_coords = [ + [ + (key, tuple(value)) if match_values else key + for _, key, value in walk_coords(element[dim]) + ] + for element in elements[1:] + ] + return all(tuple(first_coords) == tuple(coords) for coords in other_coords) + + +def ci_error(samples, center, confidence=0.95): + low, high = 100 * ((1 - confidence) / 2), 100 * (1 - ((1 - confidence) / 2)) + confidence_below, confidence_above = np.nanpercentile( + samples, low + ), np.nanpercentile(samples, high) + confidence_below, confidence_above = ( + center - confidence_below, + confidence_above - center, + ) + return confidence_below, confidence_above + + +class NoOverlapException(Exception): + pass + + +if __name__ == "__main__": + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + for shush_logger in ["botocore", "boto3", "s3transfer", "urllib3"]: + logging.getLogger(shush_logger).setLevel(logging.INFO) + warnings.filterwarnings( + "ignore" + ) # suppress RuntimeWarning from extrapolation overflows + + upload_ceiling("243sentences") + upload_ceiling("384sentences") diff --git a/brainscore_language/benchmarks/pereira2018_v2022/test.py b/brainscore_language/benchmarks/pereira2018_v2022/test.py new file mode 100644 index 00000000..8047a222 --- /dev/null +++ b/brainscore_language/benchmarks/pereira2018_v2022/test.py @@ -0,0 +1,114 @@ +import copy + +import numpy as np +import pytest +from numpy.random import RandomState +from pytest import approx + +from brainio.assemblies import NeuroidAssembly +from brainscore_language import load_dataset, ArtificialSubject, load_benchmark + + +class TestBenchmark: + class DummyModel(ArtificialSubject): + def __init__(self, neural_activity): + self.neural_activity = neural_activity + + def digest_text(self, stimuli): + assert len(stimuli) == len(self.neural_activity["presentation"]) + self.neural_activity["sentence"] = "presentation", stimuli # copy over + return {"neural": self.neural_activity} + + def perform_neural_recording( + self, + recording_target: ArtificialSubject.RecordingTarget, + recording_type: 
ArtificialSubject.RecordingType, + ): + assert recording_target == ArtificialSubject.RecordingTarget.language_system + assert recording_type == ArtificialSubject.RecordingType.fMRI + + @pytest.mark.parametrize( + "experiment, expected_score", + [ + (243, approx(0.0017534 / 0.35378928, abs=0.001)), + (384, approx(0.01215216 / 0.36343748, abs=0.001)), + ], + ) + def test_dummy_bad(self, experiment, expected_score): + benchmark = load_benchmark( + f"Pereira2018_v2022.{experiment}sentences-linreg_pearsonr" + ) + neural_activity = RandomState(0).random( + size=(experiment, 25) + ) # presentation x neuroid + neural_activity = NeuroidAssembly( + neural_activity, + coords={ + "stimuli": ("presentation", np.arange(experiment)), + # "stimuli_id": ("presentation", np.arange(experiment)), + "neuroid_id": ("neuroid", np.arange(25)), + "region": ("neuroid", ["some_region"] * 25), + }, + dims=["presentation", "neuroid"], + ) + dummy_model = TestBenchmark.DummyModel(neural_activity=neural_activity) + score = benchmark(dummy_model) + assert score == expected_score + + @pytest.mark.parametrize( + "experiment", + [ + 243, + 384, + ], + ) + def test_exact(self, experiment): + benchmark = load_benchmark( + f"Pereira2018_v2022.{experiment}sentences-linreg_pearsonr" + ) + + # TODO: we are lazy-dropping benchark.data timeid dim, but drop in packaging or accommodate donwstream later? + benchmark.data = benchmark.data.squeeze(dim="time", drop=True) + + exact_data = copy.deepcopy(benchmark.data) # .reset_index("presentation") + # exact_data = exact_data.drop_vars("sentence") + del (exact_data["presentation"],) + # exact_data = exact_data.set_index( + # presentation=list(exact_data["presentation"].coords) + # ) + dummy_model = TestBenchmark.DummyModel(neural_activity=exact_data) + score = benchmark(dummy_model) + print(score) + assert score == approx(1) + + # @pytest.mark.parametrize( + # "experiment, expected_ceiling", + # [ + # (243, 0.35378928), + # (384, 0.36343748), + # ], + # ) + # def test_ceiling(self, experiment, expected_ceiling): + # benchmark = load_benchmark(f"Pereira2018.{experiment}sentences-linear") + # ceiling = benchmark.ceiling + # assert ceiling == approx(expected_ceiling, abs=0.0005) + + # @pytest.mark.parametrize("experiment", [243, 384]) + # def test_ceiling_raw(self, experiment): + # benchmark = load_benchmark(f"Pereira2018.{experiment}sentences-linear") + # ceiling = benchmark.ceiling + # assert hasattr(ceiling, "raw") + # assert set(ceiling.raw.dims) == {"neuroid"} + # assert ceiling.raw.median() == ceiling + # assert hasattr(ceiling.raw, "raw") + # assert set(ceiling.raw.raw.dims) == { + # "sub_subject", + # "num_subjects", + # "split", + # "neuroid", + # } + + +if __name__ == "__main__": + test = TestBenchmark() + test.test_exact(243) diff --git a/brainscore_language/data/pereira2018_v2022/__init__.py b/brainscore_language/data/pereira2018_v2022/__init__.py new file mode 100644 index 00000000..90a34a8f --- /dev/null +++ b/brainscore_language/data/pereira2018_v2022/__init__.py @@ -0,0 +1,29 @@ +import logging + +from brainscore_language import data_registry +from brainscore_language.utils.s3 import load_from_s3 + +# from brainscore_language.utils.local import load_from_disk + +_logger = logging.getLogger(__name__) + +BIBTEX = """@article{pereira2018toward, + title={Toward a universal decoder of linguistic meaning from brain activation}, + author={Pereira, Francisco and Lou, Bin and Pritchett, Brianna and Ritter, Samuel and Gershman, Samuel J + and Kanwisher, Nancy and Botvinick, Matthew and 
Fedorenko, Evelina}, + journal={Nature communications}, + volume={9}, + number={1}, + pages={1--13}, + year={2018}, + publisher={Nature Publishing Group} +}""" + + + +data_registry["Pereira2018_v2022.language"] = lambda: load_from_s3( + identifier="Pereira2018ROI", + version_id="pwsEbmuuEc60F2z0Y8xVw7i2RgtjyEf9", + sha1="63543362c5e8175efb40721016edb9963b4f7e1e", + assembly_prefix="assembly_", +) diff --git a/brainscore_language/data/pereira2018_v2022/data_packaging.py b/brainscore_language/data/pereira2018_v2022/data_packaging.py new file mode 100644 index 00000000..5ee049fb --- /dev/null +++ b/brainscore_language/data/pereira2018_v2022/data_packaging.py @@ -0,0 +1,120 @@ +import logging +import re +import sys +from pathlib import Path + +import numpy as np +import pandas as pd + +from brainio import fetch +from brainio.assemblies import NeuroidAssembly +from brainio.packaging import upload_to_s3, write_netcdf +from brainscore_core.benchmarks import BenchmarkBase +from brainscore_core.metrics import Score +from brainscore_language import benchmark_registry, load_dataset, load_metric +from brainscore_language.artificial_subject import ArtificialSubject +from brainscore_language.utils.ceiling import ceiling_normalize +from brainscore_language.utils.s3 import load_from_s3 +from tqdm import tqdm + +_logger = logging.getLogger(__name__) + +""" +The code in this file is intended to be run only once to initially upload the data, and is kept here +for reference. +""" + + +def package_Pereira2018ROI(): + """ + Load Pereira2018ROI benchmark csv file and package it into an xarray benchmark. + """ + data_path = Path(__file__).parent + data_file = ( + data_path / "Pereira2018_Lang_fROI.csv" + ) # contains both neural data and associated stimuli. + _logger.info(f"Data file: {data_file}.") + + # get data + data = pd.read_csv(data_file) + data["Neuroid"] = data.UID.astype(str) + "_" + data.ROI + + neuroid_id_map = dict(zip(set(data.Neuroid), np.arange(len(set(data.Neuroid))))) + presentation_id_map = dict( + zip(set(data["Sentence"]), np.arange(len(set(data["Sentence"])))), + ) + + presentation_arr = [None] * len(presentation_id_map) + experiment_arr = [None] * len(presentation_id_map) + neuroid_arr = [None] * len(neuroid_id_map) + subject_arr = [None] * len(neuroid_id_map) + + # inv_neuroid_map = {v: k for k, v in neuroid_id_map} + # inv_presentation_map = {v: k for k, v in presentation_id_map} + + effect_sizes = np.array(data["EffectSize"]) + effect_sizes_placeholder = np.array( + [np.nan for _ in neuroid_id_map for _ in presentation_id_map] + ).reshape(len(presentation_id_map), len(neuroid_id_map)) + + for ix, row in tqdm( + data.iterrows(), total=len(data), desc=f"iterating through {data_file}" + ): + neuroid = row["Neuroid"] + presentation = row["Sentence"] + neuroid_id = neuroid_id_map[neuroid] + presentation_id = presentation_id_map[presentation] + + effect_sizes_placeholder[presentation_id, neuroid_id] = row["EffectSize"] + + presentation_arr[presentation_id] = presentation + experiment_arr[presentation_id] = row["Experiment"] + subject_arr[neuroid_id] = row["UID"] + neuroid_arr[neuroid_id] = neuroid + + assembly = NeuroidAssembly( + effect_sizes_placeholder.reshape(*effect_sizes_placeholder.shape, 1), + dims=("presentation", "neuroid", "time"), # ? 
added time + coords={ + "stimuli": ("presentation", presentation_arr), + "experiment": ("presentation", experiment_arr), + "subject": ("neuroid", subject_arr), + "roi": ("neuroid", neuroid_arr), + "time": ("time", [0]), + }, + ) + + # upload + upload_data_assembly( + assembly, + assembly_identifier="Pereira2018ROI", + bucket_name="brainscore-language", + ) + + +def upload_data_assembly(assembly, assembly_identifier, bucket_name): + # adapted from + # https://github.com/mschrimpf/brainio/blob/8a40a3558d0b86072b9e221808f19005c7cb8c17/brainio/packaging.py#L217 + + _logger.debug(f"Uploading {assembly_identifier} to S3") + + # identifiers + assembly_store_identifier = "assembly_" + assembly_identifier.replace(".", "_") + netcdf_file_name = assembly_store_identifier + ".nc" + target_netcdf_path = ( + Path(fetch.get_local_data_path()) / assembly_store_identifier / netcdf_file_name + ) + s3_key = netcdf_file_name + + # write to disk and upload + netcdf_kf_sha1 = write_netcdf(assembly, target_netcdf_path) + _logger.debug(f"Wrote file to {target_netcdf_path}.") + upload_to_s3(target_netcdf_path, bucket_name, s3_key) + _logger.debug( + f"Uploaded assembly {assembly_identifier} to S3: {s3_key} (SHA1 hash {netcdf_kf_sha1})" + ) + + +if __name__ == "__main__": + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + package_Pereira2018ROI() diff --git a/brainscore_language/data/pereira2018_v2022/test.py b/brainscore_language/data/pereira2018_v2022/test.py new file mode 100644 index 00000000..f94c192e --- /dev/null +++ b/brainscore_language/data/pereira2018_v2022/test.py @@ -0,0 +1,44 @@ +import copy +import logging + +import numpy as np +import pytest +from numpy.random import RandomState +from pytest import approx + +from brainio.assemblies import NeuroidAssembly +from brainscore_language import load_dataset, ArtificialSubject, load_benchmark + +_logger = logging.getLogger(__name__) + + +class TestData: + def test_language(self): + assembly = load_dataset("Pereira2018_v2022.language") + assert set(assembly["experiment"].values) == { + "PereiraE2_96pass", + "PereiraE3_72pass", + } + _logger.info("experiment names match up!") + + assert len(assembly["presentation"]) == 243 + 384 + _logger.info(f'no. of presentation IDs == {len(assembly["presentation"])}') + + assert len(set(assembly["stimuli"].values)) == 243 + 384 + _logger.info(f'no. of stimuli == {len(assembly["stimuli"].values)}') + + assert ( + "The concert pianist went blind in adulthood." + in assembly["stimuli"].values + ) + _logger.info( + f'stimulus "The concert pianist went blind in adulthood." is present!' + ) + + assert len(set(assembly["subject"].values)) == 10 + _logger.info(f'no. of subjects == {len(set(assembly["subject"].values))}') + + assert ( + len(set(assembly["neuroid"].values)) == 120 + ) # potential TODO: rename to neuroid_id + _logger.info(f'no. of neuroids == {len(set(assembly["neuroid"].values))}')
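
For reference, the dataset assertions above translate into a short inspection sketch (illustrative only; assumes the
packaged assembly is reachable on S3 via the registry entry added in data/pereira2018_v2022/__init__.py):

    from brainscore_language import load_dataset

    assembly = load_dataset("Pereira2018_v2022.language")  # presentation x neuroid x time ROI assembly
    print(set(assembly["experiment"].values))              # {"PereiraE2_96pass", "PereiraE3_72pass"}
    print(len(assembly["presentation"]))                   # 627 presentations (243 + 384 sentences)
    print(len(set(assembly["subject"].values)))            # 10 subjects
    print(len(set(assembly["neuroid"].values)))            # 120 language fROI neuroids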