diff --git a/.history/main_20250213065956.py b/.history/main_20250213065956.py new file mode 100644 index 0000000..e69de29 diff --git a/.history/main_20250213070010.py b/.history/main_20250213070010.py new file mode 100644 index 0000000..9c9bb14 --- /dev/null +++ b/.history/main_20250213070010.py @@ -0,0 +1,36 @@ +import itertools +import numpy as np +import pandas as pd +from matched_markets.methodology.tbrmmdata import TBRMMData +from matched_markets.methodology.tbrmatchedmarkets import TBRMatchedMarkets +from matched_markets.methodology.tbrmmdiagnostics import TBRMMDiagnostics +from matched_markets.methodology.tbrmmdesignparameters import TBRMMDesignParameters + +n_geos = 5 +n_days = 21 +geos = {str(geo) for geo in range(n_geos)} +dates = pd.date_range('2020-03-01', periods=n_days) +df_data = [{'date': date, 'geo': geo} for geo, date in + itertools.product(geos, dates)] +df = pd.DataFrame(df_data) +response_column = 'sales' + +# Create sales data. +def day_geo_sales(geo, n_days): + # Larger geos have different means and variances. + return [ + 100 * geo + 10 * geo * day + day + np.random.randint(10) + for day in range(n_days) + ] + +df[response_column] = 0.0 +for geo in geos: + sales_time_series = day_geo_sales(int(geo), n_days) + df.loc[df.geo == geo, response_column] = sales_time_series + +parameters = TBRMMDesignParameters(n_test=14, iroas=3.0, + budget_range=(0.1, 300000)) +data = TBRMMData(df, response_column) + +mm = TBRMatchedMarkets(data, parameters) +designs = mm.greedy_search() diff --git a/.history/matched_markets/methodology/geoeligibility_20250213065231.py b/.history/matched_markets/methodology/geoeligibility_20250213065231.py new file mode 100644 index 0000000..124fdc1 --- /dev/null +++ b/.history/matched_markets/methodology/geoeligibility_20250213065231.py @@ -0,0 +1,190 @@ +# Copyright 2020 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""TBR Matched Markets preanalysis. +""" +from typing import List, Optional, Set, Text, Union +import dataclasses + +GeoRef = Union[Text, int] + + +@dataclasses.dataclass +class GeoAssignments: + """Representation of all possible geo assignments. + + All attributes are sets of references to geos, can be geo IDs (strings) or + integers. + + Attributes: + all: All geos. + c: All geos that can be assigned to Control. + t: All geos that can be assigned to Treatment. + x: All geos that can be excluded. + c_fixed: geos that must be assigned only to Control. + t_fixed: geos that must be assigned only to Treatment. + x_fixed: geos that must be excluded. + cx: geos that can be in control or excluded (but not in treatment). + tx: geos that can be in treatment or excluded (but not in control). + ctx: geos that can be in either group or excluded. + ct: geos that must be assigned to either control or treatment, but not + excluded. 
+ """ + all: Set[GeoRef] + c: Set[GeoRef] + t: Set[GeoRef] + x: Set[GeoRef] + t_fixed: Set[GeoRef] + c_fixed: Set[GeoRef] + x_fixed: Set[GeoRef] + ct: Set[GeoRef] + cx: Set[GeoRef] + ctx: Set[GeoRef] + tx: Set[GeoRef] + + def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]): + self.c = c + self.t = t + self.x = x + a = c | t | x + self.all = a + not_c = a - c + not_t = a - t + not_x = a - x + self.c_fixed = c & not_t & not_x + self.t_fixed = not_c & t & not_x + self.x_fixed = not_c & not_t & x + self.ct = c & t & not_x + self.cx = c & not_t & x + self.ctx = c & t & x + self.tx = not_c & t & x + + +class GeoEligibility: + """Validate a Geo Eligibility Matrix. + + A Geo Eligibility Matrix maps each geo to the possible mappings into treatment + groups, or possible exclusion from the design. Used in the TBR Matched Markets + preanalysis. + """ + + def __init__(self, df): + """Initialize and validate a GeoEligibility object. + + Args: + + df: A DataFrame with columns 'geo', 'control' 'treatment', 'exclude'. Each + row specifies to which groups each geo can be assigned to, by using + codes 1 = possible and 0 = not possible. 'geo' can also be the index. + + control treatment exclude + 0 0 1 - geo must be excluded. + 0 1 0 - geo must be assigned to treatment. + 1 0 0 - geo must be assigned to control. + 1 1 1 - geo can be excluded, or included in either + control or treatment. + 0 1 1 - geo can be assigned only to treatment, or + excluded. + 1 0 1 - geo can be assigned only to control, or + excluded. + 1 1 0 - geo must be included in either control or + treatment but never excluded. + 0 0 0 - not allowed. + + Attributes: + data: A copy of the dataframe, indexed by 'geo'. + + Raises: + ValueError: if (a) the DataFrame does not have columns 'geo', 'control', + 'treatment' and 'exclude'; (b) any geo ids are duplicated; (c) if the + values in columns 'control', 'treatment', and 'exclude' are something + else than 0 and 1; (d) if any row in the columns 'control', 'treatment' + and 'exclude' has all zeros in it. + """ + + df = df.copy().reset_index() + + if 'geo' not in df.columns: + raise ValueError('There is no column or index \'geo\'') + + dups = df.columns.duplicated() + if any(dups): + raise ValueError('Duplicate column(s): ' + ', '.join(df.columns[dups])) + + # Ensure that the geo column is a string. + df.geo = df.geo.astype('str') + + value_columns = ['control', 'treatment', 'exclude'] + if not set(value_columns).issubset(set(df.columns)): + missing_columns = [x for x in value_columns if x not in df.columns] + raise ValueError('Missing column(s): ' + ', '.join(missing_columns)) + + all_column_names = ['geo'] + value_columns + + # Ensure the correct column order. 
+ df = df.loc[:, all_column_names] + + dup_geo_ids = set(df['geo'][df['geo'].duplicated()]) + if dup_geo_ids: + raise ValueError('\'geo\' has duplicate values: ' + + ', '.join(str(id) for id in dup_geo_ids)) + + if not all([set(df[col]) <= {0, 1} for col in value_columns]): + raise ValueError('GeoEligibility objects must have only values ' + '0, 1 in columns ' + ', '.join(value_columns)) + + zero_row = df[value_columns].sum(axis=1) == 0 + if any(zero_row): + geos = df['geo'][zero_row] + raise ValueError('Three zeros found for geo(s) ' + ', '.join(geos)) + + df.set_index('geo', inplace=True) + self.data = df + + def __str__(self): + return 'Geo eligibility matrix with %d geos' % self.data.shape[0] + + def get_eligible_assignments(self, geos: Optional[List[GeoRef]] = None, + indices: bool = False) -> GeoAssignments: + """Get an object representing all possible geo assignment groups. + + Args: + geos: A list of geo IDs to include. If None, all geos are included. The + order is important if 'indices' are used. + indices: Instead of generating sets of geo IDs, generate sets of the + positional index numbers of the geo IDs in the list 'geos'. Raises an + error if 'geos' is not specified. + + Returns: + A GeoAssignments object. + + Raises: + ValueError: if geos is not specified but indices is True. + """ + + df = self.data # DataFrame indexed by the geo ID. + + if geos: + df = df.loc[list(geos)] + if indices: + df = df.reset_index() + elif indices: + raise ValueError('\'geos\' is not specified but indices=True') + + # Generate sets of geos (IDs or indices) indicating membership of the group. + c = set(df.index[df['control'] == 1]) + t = set(df.index[df['treatment'] == 1]) + x = set(df.index[df['exclude'] == 1]) + + return GeoAssignments(c, t, x) diff --git a/.history/matched_markets/methodology/geoeligibility_20250213071034.py b/.history/matched_markets/methodology/geoeligibility_20250213071034.py new file mode 100644 index 0000000..4d9956e --- /dev/null +++ b/.history/matched_markets/methodology/geoeligibility_20250213071034.py @@ -0,0 +1,83 @@ +from typing import List, Optional, Set, Text, Union +import dataclasses +import pandas as pd + +GeoRef = Union[Text, int] + +@dataclasses.dataclass +class GeoAssignments: + """Representation of all possible geo assignments.""" + all: Set[GeoRef] + c: Set[GeoRef] + t: Set[GeoRef] + x: Set[GeoRef] + c_fixed: Set[GeoRef] + t_fixed: Set[GeoRef] + x_fixed: Set[GeoRef] + ct: Set[GeoRef] + cx: Set[GeoRef] + ctx: Set[GeoRef] + tx: Set[GeoRef] + + def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]): + self.c = c + self.t = t + self.x = x + self.all = c | t | x + + self.c_fixed = c - (t | x) + self.t_fixed = t - (c | x) + self.x_fixed = x - (c | t) + + self.ct = (c & t) - x + self.cx = (c & x) - t + self.ctx = c & t & x + self.tx = (t & x) - c + +class GeoEligibility: + """Validate a Geo Eligibility Matrix.""" + def __init__(self, df: pd.DataFrame): + df = df.copy() + df.reset_index(drop=True, inplace=True) + + required_columns = {'geo', 'control', 'treatment', 'exclude'} + if not required_columns.issubset(df.columns): + missing = required_columns - set(df.columns) + raise ValueError(f'Missing required column(s): {", ".join(missing)}') + + if df.columns.duplicated().any(): + raise ValueError('Duplicate columns found in DataFrame.') + + df['geo'] = df['geo'].astype(str) + + if df.duplicated(subset=['geo']).any(): + dup_geo_ids = df['geo'][df.duplicated(subset=['geo'])].unique() + raise ValueError(f'Duplicate geo values found: {", 
".join(dup_geo_ids)}') + + if not all(df[col].isin([0, 1]).all() for col in ['control', 'treatment', 'exclude']): + raise ValueError('Columns control, treatment, and exclude must contain only 0 or 1.') + + if (df[['control', 'treatment', 'exclude']].sum(axis=1) == 0).any(): + zero_rows = df['geo'][df[['control', 'treatment', 'exclude']].sum(axis=1) == 0] + raise ValueError(f'Invalid rows with all zeros found for geos: {", ".join(zero_rows)}') + + df.set_index('geo', inplace=True) + self.data = df + + def __str__(self): + return f'Geo eligibility matrix with {self.data.shape[0]} geos' + + def get_eligible_assignments(self, geos: Optional[List[GeoRef]] = None, indices: bool = False) -> GeoAssignments: + if indices and geos is None: + raise ValueError('`geos` must be specified when `indices=True`.') + + df = self.data if geos is None else self.data.loc[geos] + + if indices: + df = df.reset_index() + + c = set(df.index[df['control'] == 1]) + t = set(df.index[df['treatment'] == 1]) + x = set(df.index[df['exclude'] == 1]) + + return GeoAssignments(c, t, x) diff --git a/.history/matched_markets/methodology/heapdict_20250213065231.py b/.history/matched_markets/methodology/heapdict_20250213065231.py new file mode 100644 index 0000000..06463a1 --- /dev/null +++ b/.history/matched_markets/methodology/heapdict_20250213065231.py @@ -0,0 +1,80 @@ +# Copyright 2020 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""TBR Matched Markets: utilities. +""" + +import collections +import heapq + +from typing import Any, Dict, List, TypeVar + +DictKey = TypeVar('DictKey', str, int, float) + + +class HeapDict: + """A dictionary of priority queues of a given limited size. + + Each dictionary key points to a separate queue that has a fixed maximum + size. Upon pushing an item in a queue, the smallest item will be discarded if + the maximum size is exceeded. Hence each queue stores the largest items that + have been pushed in. + + Each item must be sortable; an item of arbitrary class can be used if it + features a custom __lt__ method. + + Example: + h = HeapDict(1) # Keep only the largest item. + h.push(10, 0.5) + h.push(10, 1.0) + h.push(20, 1.0) + h.push(20, 2.0) + h.get_result() # Returns {10: [1.0], 20: [2.0]}. + """ + + def __init__(self, size: int): + """Initialize a HeapDict. + + Args: + size: Maximum size of each heap (priority queue). + """ + self._size = size + self._result = collections.defaultdict(list) + + def push(self, key: DictKey, item: Any): + """Push an item into the queue associated with the key. + + Args: + key: A dictionary key, string, integer, or float. + item: Any object. The queue corresponding to the key will be sorted based + on this object. + """ + queue = self._result[key] + if len(queue) < self._size: + heapq.heappush(queue, item) + else: + # Push the new item, and remove the smallest item. 
+ heapq.heappushpop(queue, item) + self._result[key] = queue + + def get_result(self) -> Dict[DictKey, List[Any]]: + """Return a copy of the dictionary, each queue sorted in descending order. + + Returns: + A dictionary with the sorted lists as values, largest values first. + """ + result = {} + for key, q in self._result.items(): + result[key] = heapq.nlargest(len(q), q) + return result diff --git a/.history/matched_markets/methodology/heapdict_20250213071403.py b/.history/matched_markets/methodology/heapdict_20250213071403.py new file mode 100644 index 0000000..41de674 --- /dev/null +++ b/.history/matched_markets/methodology/heapdict_20250213071403.py @@ -0,0 +1,71 @@ +# Copyright 2020 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""TBR Matched Markets: utilities.""" + +import heapq +import collections +from typing import Any, Dict, List, TypeVar + +DictKey = TypeVar('DictKey', str, int, float) + + +class HeapDict: + """A dictionary of priority queues with a fixed maximum size per queue. + + Each dictionary key maps to a separate priority queue with a fixed maximum + size. When adding a new item, the smallest item is discarded if the maximum + size is exceeded. This ensures each queue retains only the largest items. + + Items must be sortable. Custom classes must implement __lt__ for sorting. + + Example: + h = HeapDict(1) # Keep only the largest item per key. + h.push(10, 0.5) + h.push(10, 1.0) + h.push(20, 1.0) + h.push(20, 2.0) + h.get_result() # Returns {10: [1.0], 20: [2.0]}. + """ + + def __init__(self, size: int): + """Initializes the HeapDict with a fixed queue size. + + Args: + size: Maximum number of elements per priority queue. + """ + self._size = size + self._result: Dict[DictKey, List[Any]] = collections.defaultdict(list) + + def push(self, key: DictKey, item: Any) -> None: + """Pushes an item into the priority queue of the given key. + + Args: + key: The dictionary key (string, integer, or float). + item: The item to be inserted into the priority queue. + """ + queue = self._result[key] + if len(queue) < self._size: + heapq.heappush(queue, item) + else: + heapq.heappushpop(queue, item) + + def get_result(self) -> Dict[DictKey, List[Any]]: + """Returns a dictionary with sorted queues in descending order. + + Returns: + A dictionary where each key maps to a sorted list of values + (largest values first). 
+ """ + return {key: heapq.nlargest(len(q), q) for key, q in self._result.items()} diff --git a/main.py b/main.py new file mode 100644 index 0000000..9c9bb14 --- /dev/null +++ b/main.py @@ -0,0 +1,36 @@ +import itertools +import numpy as np +import pandas as pd +from matched_markets.methodology.tbrmmdata import TBRMMData +from matched_markets.methodology.tbrmatchedmarkets import TBRMatchedMarkets +from matched_markets.methodology.tbrmmdiagnostics import TBRMMDiagnostics +from matched_markets.methodology.tbrmmdesignparameters import TBRMMDesignParameters + +n_geos = 5 +n_days = 21 +geos = {str(geo) for geo in range(n_geos)} +dates = pd.date_range('2020-03-01', periods=n_days) +df_data = [{'date': date, 'geo': geo} for geo, date in + itertools.product(geos, dates)] +df = pd.DataFrame(df_data) +response_column = 'sales' + +# Create sales data. +def day_geo_sales(geo, n_days): + # Larger geos have different means and variances. + return [ + 100 * geo + 10 * geo * day + day + np.random.randint(10) + for day in range(n_days) + ] + +df[response_column] = 0.0 +for geo in geos: + sales_time_series = day_geo_sales(int(geo), n_days) + df.loc[df.geo == geo, response_column] = sales_time_series + +parameters = TBRMMDesignParameters(n_test=14, iroas=3.0, + budget_range=(0.1, 300000)) +data = TBRMMData(df, response_column) + +mm = TBRMatchedMarkets(data, parameters) +designs = mm.greedy_search() diff --git a/matched_markets/__pycache__/__init__.cpython-39.pyc b/matched_markets/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..f473fed Binary files /dev/null and b/matched_markets/__pycache__/__init__.cpython-39.pyc differ diff --git a/matched_markets/examples/salesandcost.py b/matched_markets/examples/salesandcost.py index 671e388..1080336 100644 --- a/matched_markets/examples/salesandcost.py +++ b/matched_markets/examples/salesandcost.py @@ -65,4 +65,3 @@ def example_data_formatted(srcdir): """Summon the data.""" data, geoassign, exdates = example_data(srcdir) return format_example_data(data, geoassign, exdates) - diff --git a/matched_markets/methodology/__pycache__/__init__.cpython-39.pyc b/matched_markets/methodology/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..6d633ee Binary files /dev/null and b/matched_markets/methodology/__pycache__/__init__.cpython-39.pyc differ diff --git a/matched_markets/methodology/__pycache__/geoeligibility.cpython-39.pyc b/matched_markets/methodology/__pycache__/geoeligibility.cpython-39.pyc new file mode 100644 index 0000000..326c4b8 Binary files /dev/null and b/matched_markets/methodology/__pycache__/geoeligibility.cpython-39.pyc differ diff --git a/matched_markets/methodology/__pycache__/heapdict.cpython-39.pyc b/matched_markets/methodology/__pycache__/heapdict.cpython-39.pyc new file mode 100644 index 0000000..4a642a8 Binary files /dev/null and b/matched_markets/methodology/__pycache__/heapdict.cpython-39.pyc differ diff --git a/matched_markets/methodology/__pycache__/tbrmatchedmarkets.cpython-39.pyc b/matched_markets/methodology/__pycache__/tbrmatchedmarkets.cpython-39.pyc new file mode 100644 index 0000000..35d30c1 Binary files /dev/null and b/matched_markets/methodology/__pycache__/tbrmatchedmarkets.cpython-39.pyc differ diff --git a/matched_markets/methodology/__pycache__/tbrmmdata.cpython-39.pyc b/matched_markets/methodology/__pycache__/tbrmmdata.cpython-39.pyc new file mode 100644 index 0000000..3bca365 Binary files /dev/null and b/matched_markets/methodology/__pycache__/tbrmmdata.cpython-39.pyc differ diff --git 
a/matched_markets/methodology/__pycache__/tbrmmdesign.cpython-39.pyc b/matched_markets/methodology/__pycache__/tbrmmdesign.cpython-39.pyc new file mode 100644 index 0000000..7ac4233 Binary files /dev/null and b/matched_markets/methodology/__pycache__/tbrmmdesign.cpython-39.pyc differ diff --git a/matched_markets/methodology/__pycache__/tbrmmdesignparameters.cpython-39.pyc b/matched_markets/methodology/__pycache__/tbrmmdesignparameters.cpython-39.pyc new file mode 100644 index 0000000..bfc5048 Binary files /dev/null and b/matched_markets/methodology/__pycache__/tbrmmdesignparameters.cpython-39.pyc differ diff --git a/matched_markets/methodology/__pycache__/tbrmmdiagnostics.cpython-39.pyc b/matched_markets/methodology/__pycache__/tbrmmdiagnostics.cpython-39.pyc new file mode 100644 index 0000000..e5eab22 Binary files /dev/null and b/matched_markets/methodology/__pycache__/tbrmmdiagnostics.cpython-39.pyc differ diff --git a/matched_markets/methodology/__pycache__/tbrmmscore.cpython-39.pyc b/matched_markets/methodology/__pycache__/tbrmmscore.cpython-39.pyc new file mode 100644 index 0000000..339bc35 Binary files /dev/null and b/matched_markets/methodology/__pycache__/tbrmmscore.cpython-39.pyc differ diff --git a/matched_markets/methodology/geoeligibility.py b/matched_markets/methodology/geoeligibility.py index bc448e8..4d9956e 100644 --- a/matched_markets/methodology/geoeligibility.py +++ b/matched_markets/methodology/geoeligibility.py @@ -1,190 +1,83 @@ -# Copyright 2020 Google LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""TBR Matched Markets preanalysis. -""" from typing import List, Optional, Set, Text, Union import dataclasses +import pandas as pd GeoRef = Union[Text, int] - @dataclasses.dataclass class GeoAssignments: - """Representation of all possible geo assignments. - - All attributes are sets of references to geos, can be geo IDs (strings) or - integers. - - Attributes: - all: All geos. - c: All geos that can be assigned to Control. - t: All geos that can be assigned to Treatment. - x: All geos that can be excluded. - c_fixed: geos that must be assigned only to Control. - t_fixed: geos that must be assigned only to Treatment. - x_fixed: geos that must be excluded. - cx: geos that can be in control or excluded (but not in treatment). - tx: geos that can be in treatment or excluded (but not in control). - ctx: geos that can be in either group or excluded. - ct: geos that must be assigned to either control or treatment, but not - excluded. 
- """ - all: Set[GeoRef] - c: Set[GeoRef] - t: Set[GeoRef] - x: Set[GeoRef] - t_fixed: Set[GeoRef] - c_fixed: Set[GeoRef] - x_fixed: Set[GeoRef] - ct: Set[GeoRef] - cx: Set[GeoRef] - ctx: Set[GeoRef] - tx: Set[GeoRef] - - def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]): - self.c = c - self.t = t - self.x = x - a = c | t | x - self.all = a - not_c = a - c - not_t = a - t - not_x = a - x - self.c_fixed = c & not_t & not_x - self.t_fixed = not_c & t & not_x - self.x_fixed = not_c & not_t & x - self.ct = c & t & not_x - self.cx = c & not_t & x - self.ctx = c & t & x - self.tx = not_c & t & x - + """Representation of all possible geo assignments.""" + all: Set[GeoRef] + c: Set[GeoRef] + t: Set[GeoRef] + x: Set[GeoRef] + c_fixed: Set[GeoRef] + t_fixed: Set[GeoRef] + x_fixed: Set[GeoRef] + ct: Set[GeoRef] + cx: Set[GeoRef] + ctx: Set[GeoRef] + tx: Set[GeoRef] + + def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]): + self.c = c + self.t = t + self.x = x + self.all = c | t | x + + self.c_fixed = c - (t | x) + self.t_fixed = t - (c | x) + self.x_fixed = x - (c | t) + + self.ct = (c & t) - x + self.cx = (c & x) - t + self.ctx = c & t & x + self.tx = (t & x) - c class GeoEligibility: - """Validate a Geo Eligibility Matrix. - - A Geo Eligibility Matrix maps each geo to the possible mappings into treatment - groups, or possible exclusion from the design. Used in the TBR Matched Markets - preanalysis. - """ - - def __init__(self, df): - """Initialize and validate a GeoEligibility object. - - Args: - - df: A DataFrame with columns 'geo', 'control' 'treatment', 'exclude'. Each - row specifies to which groups each geo can be assigned to, by using - codes 1 = possible and 0 = not possible. 'geo' can also be the index. - - control treatment exclude - 0 0 1 - geo must be excluded. - 0 1 0 - geo must be assigned to treatment. - 1 0 0 - geo must be assigned to control. - 1 1 1 - geo can be excluded, or included in either - control or treatment. - 0 1 1 - geo can be assigned only to treatment, or - excluded. - 1 0 1 - geo can be assigned only to control, or - excluded. - 1 1 0 - geo must be included in either control or - treatment but never excluded. - 0 0 0 - not allowed. - - Attributes: - data: A copy of the dataframe, indexed by 'geo'. - - Raises: - ValueError: if (a) the DataFrame does not have columns 'geo', 'control', - 'treatment' and 'exclude'; (b) any geo ids are duplicated; (c) if the - values in columns 'control', 'treatment', and 'exclude' are something - else than 0 and 1; (d) if any row in the columns 'control', 'treatment' - and 'exclude' has all zeros in it. - """ - - df = df.copy().reset_index() - - if 'geo' not in df.columns: - raise ValueError('There is no column or index \'geo\'') - - dups = df.columns.duplicated() - if any(dups): - raise ValueError('Duplicate column(s): ' + ', '.join(df.columns[dups])) - - # Ensure that the geo column is a string. - df.geo = df.geo.astype('str') - - value_columns = ['control', 'treatment', 'exclude'] - if not set(value_columns).issubset(set(df.columns)): - missing_columns = [x for x in value_columns if x not in df.columns] - raise ValueError('Missing column(s): ' + ', '.join(missing_columns)) - - all_column_names = ['geo'] + value_columns - - # Ensure the correct column order. 
- df = df.loc[:, all_column_names] - - dup_geo_ids = set(df['geo'][df['geo'].duplicated()]) - if dup_geo_ids: - raise ValueError('\'geo\' has duplicate values: ' + - ', '.join(str(id) for id in dup_geo_ids)) - - if not all([set(df[col]) <= {0, 1} for col in value_columns]): - raise ValueError('GeoEligibility objects must have only values ' - '0, 1 in columns ' + ', '.join(value_columns)) - - zero_row = df[value_columns].sum(axis=1) == 0 - if any(zero_row): - geos = df['geo'][zero_row] - raise ValueError('Three zeros found for geo(s) ' + ', '.join(geos)) - - df.set_index('geo', inplace=True) - self.data = df - - def __str__(self): - return 'Geo eligibility matrix with %d geos' % self.data.shape[0] - - def get_eligible_assignments(self, geos: Optional[List[GeoRef]] = None, - indices: bool = False) -> GeoAssignments: - """Get an object representing all possible geo assignment groups. - - Args: - geos: A list of geo IDs to include. If None, all geos are included. The - order is important if 'indices' are used. - indices: Instead of generating sets of geo IDs, generate sets of the - positional index numbers of the geo IDs in the list 'geos'. Raises an - error if 'geos' is not specified. - - Returns: - A GeoAssignments object. - - Raises: - ValueError: if geos is not specified but indices is True. - """ - - df = self.data # DataFrame indexed by the geo ID. - - if geos: - df = df.loc[geos] - if indices: - df = df.reset_index() - elif indices: - raise ValueError('\'geos\' is not specified but indices=True') - - # Generate sets of geos (IDs or indices) indicating membership of the group. - c = set(df.index[df['control'] == 1]) - t = set(df.index[df['treatment'] == 1]) - x = set(df.index[df['exclude'] == 1]) - - return GeoAssignments(c, t, x) + """Validate a Geo Eligibility Matrix.""" + def __init__(self, df: pd.DataFrame): + df = df.copy() + df.reset_index(drop=True, inplace=True) + + required_columns = {'geo', 'control', 'treatment', 'exclude'} + if not required_columns.issubset(df.columns): + missing = required_columns - set(df.columns) + raise ValueError(f'Missing required column(s): {", ".join(missing)}') + + if df.columns.duplicated().any(): + raise ValueError('Duplicate columns found in DataFrame.') + + df['geo'] = df['geo'].astype(str) + + if df.duplicated(subset=['geo']).any(): + dup_geo_ids = df['geo'][df.duplicated(subset=['geo'])].unique() + raise ValueError(f'Duplicate geo values found: {", ".join(dup_geo_ids)}') + + if not all(df[col].isin([0, 1]).all() for col in ['control', 'treatment', 'exclude']): + raise ValueError('Columns control, treatment, and exclude must contain only 0 or 1.') + + if (df[['control', 'treatment', 'exclude']].sum(axis=1) == 0).any(): + zero_rows = df['geo'][df[['control', 'treatment', 'exclude']].sum(axis=1) == 0] + raise ValueError(f'Invalid rows with all zeros found for geos: {", ".join(zero_rows)}') + + df.set_index('geo', inplace=True) + self.data = df + + def __str__(self): + return f'Geo eligibility matrix with {self.data.shape[0]} geos' + + def get_eligible_assignments(self, geos: Optional[List[GeoRef]] = None, indices: bool = False) -> GeoAssignments: + if indices and geos is None: + raise ValueError('`geos` must be specified when `indices=True`.') + + df = self.data if geos is None else self.data.loc[geos] + + if indices: + df = df.reset_index() + + c = set(df.index[df['control'] == 1]) + t = set(df.index[df['treatment'] == 1]) + x = set(df.index[df['exclude'] == 1]) + + return GeoAssignments(c, t, x) diff --git 
a/matched_markets/methodology/heapdict.py b/matched_markets/methodology/heapdict.py index 06463a1..41de674 100644 --- a/matched_markets/methodology/heapdict.py +++ b/matched_markets/methodology/heapdict.py @@ -12,69 +12,60 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""TBR Matched Markets: utilities. -""" +"""TBR Matched Markets: utilities.""" -import collections import heapq - +import collections from typing import Any, Dict, List, TypeVar DictKey = TypeVar('DictKey', str, int, float) class HeapDict: - """A dictionary of priority queues of a given limited size. + """A dictionary of priority queues with a fixed maximum size per queue. - Each dictionary key points to a separate queue that has a fixed maximum - size. Upon pushing an item in a queue, the smallest item will be discarded if - the maximum size is exceeded. Hence each queue stores the largest items that - have been pushed in. + Each dictionary key maps to a separate priority queue with a fixed maximum + size. When adding a new item, the smallest item is discarded if the maximum + size is exceeded. This ensures each queue retains only the largest items. - Each item must be sortable; an item of arbitrary class can be used if it - features a custom __lt__ method. + Items must be sortable. Custom classes must implement __lt__ for sorting. - Example: - h = HeapDict(1) # Keep only the largest item. - h.push(10, 0.5) - h.push(10, 1.0) - h.push(20, 1.0) - h.push(20, 2.0) - h.get_result() # Returns {10: [1.0], 20: [2.0]}. - """ + Example: + h = HeapDict(1) # Keep only the largest item per key. + h.push(10, 0.5) + h.push(10, 1.0) + h.push(20, 1.0) + h.push(20, 2.0) + h.get_result() # Returns {10: [1.0], 20: [2.0]}. + """ - def __init__(self, size: int): - """Initialize a HeapDict. + def __init__(self, size: int): + """Initializes the HeapDict with a fixed queue size. - Args: - size: Maximum size of each heap (priority queue). - """ - self._size = size - self._result = collections.defaultdict(list) + Args: + size: Maximum number of elements per priority queue. + """ + self._size = size + self._result: Dict[DictKey, List[Any]] = collections.defaultdict(list) - def push(self, key: DictKey, item: Any): - """Push an item into the queue associated with the key. + def push(self, key: DictKey, item: Any) -> None: + """Pushes an item into the priority queue of the given key. - Args: - key: A dictionary key, string, integer, or float. - item: Any object. The queue corresponding to the key will be sorted based - on this object. - """ - queue = self._result[key] - if len(queue) < self._size: - heapq.heappush(queue, item) - else: - # Push the new item, and remove the smallest item. - heapq.heappushpop(queue, item) - self._result[key] = queue + Args: + key: The dictionary key (string, integer, or float). + item: The item to be inserted into the priority queue. + """ + queue = self._result[key] + if len(queue) < self._size: + heapq.heappush(queue, item) + else: + heapq.heappushpop(queue, item) - def get_result(self) -> Dict[DictKey, List[Any]]: - """Return a copy of the dictionary, each queue sorted in descending order. + def get_result(self) -> Dict[DictKey, List[Any]]: + """Returns a dictionary with sorted queues in descending order. - Returns: - A dictionary with the sorted lists as values, largest values first. 
- """ - result = {} - for key, q in self._result.items(): - result[key] = heapq.nlargest(len(q), q) - return result + Returns: + A dictionary where each key maps to a sorted list of values + (largest values first). + """ + return {key: heapq.nlargest(len(q), q) for key, q in self._result.items()} diff --git a/matched_markets/methodology/utils.py b/matched_markets/methodology/utils.py index a33444e..f893f04 100644 --- a/matched_markets/methodology/utils.py +++ b/matched_markets/methodology/utils.py @@ -48,7 +48,7 @@ def kwarg_subdict(prefix, **kwargs): sub_kwargs = [k for k in kwargs.keys() if rgx.search(k)] # Return the matched kwargs, stripping off prefix. - return {rgx.match(k).group(1): kwargs[k] for k in sub_kwargs} + return {rgx.match(k).group(1): kwargs[k] for k in sub_kwargs} # pytype: disable=attribute-error # re-none def float_order(x): @@ -113,9 +113,9 @@ def brownian_bridge_bounds(n, sd_bound_multiplier): Args: n: (int >= 1) Length of the time series of the cumulative residuals - following a Brownian Bridge process. - sd_bound_multiplier: (numeric > 0) Multiplier for bounds on cumulative - standardized residuals. + following a Brownian Bridge process. + sd_bound_multiplier: (numeric > 0) Multiplier for bounds on cumulative + standardized residuals. Returns: A list of length n, of the Brownian Bridge process bounds in absolute @@ -145,24 +145,23 @@ def credible_interval(simulations, level): Raises: ValueError: if the requested level is too large (< 1/ len(sims)). """ - alpha = (1 - level)/2.0 + alpha = (1 - level) / 2.0 nvals = len(simulations) - if alpha < 1.0/nvals: + if alpha < 1.0 / nvals: raise ValueError('Too few values to provide requested quantiles.') sims_sort = np.sort(np.copy(simulations)) frac = nvals * np.array([alpha, 0.5, 1.0 - alpha]) - 1.0 low = np.floor(frac).astype(np.int64) - return sims_sort[low] + (frac - low)*(sims_sort[low + 1] - sims_sort[low]) + return sims_sort[low] + (frac - low) * (sims_sort[low + 1] - sims_sort[low]) -def find_days_to_exclude( - dates_to_exclude: List[str]) -> List[TimeWindow]: +def find_days_to_exclude(dates_to_exclude: List[str]) -> List[TimeWindow]: """Returns a list of time windows to exclude from a list of days and periods. Args: dates_to_exclude: a List of strings with format indicating a single day as - '2020/01/01' (YYYY/MM/DD) or an entire time period as - '2020/01/01 - 2020/02/01' (indicating start and end date of the time period) + '2020/01/01' (YYYY/MM/DD) or an entire time period as '2020/01/01 - + 2020/02/01' (indicating start and end date of the time period) Returns: days_exclude: a List of TimeWindows obtained from the list in input. @@ -173,19 +172,24 @@ def find_days_to_exclude( if len(tmp) == 1: try: days_exclude.append( - TimeWindow(pd.Timestamp(tmp[0]), pd.Timestamp(tmp[0]))) + TimeWindow(pd.Timestamp(tmp[0]), pd.Timestamp(tmp[0])) + ) except ValueError: raise ValueError(f'Cannot convert the string {tmp[0]} to a valid date.') elif len(tmp) == 2: try: days_exclude.append( - TimeWindow(pd.Timestamp(tmp[0]), pd.Timestamp(tmp[1]))) + TimeWindow(pd.Timestamp(tmp[0]), pd.Timestamp(tmp[1])) + ) except ValueError: raise ValueError( - f'Cannot convert the strings in {tmp} to a valid date.') + f'Cannot convert the strings in {tmp} to a valid date.' 
+ ) else: - raise ValueError(f'The input {tmp} cannot be interpreted as a single' + - ' day or a time window') + raise ValueError( + f'The input {tmp} cannot be interpreted as a single' + ' day or a time window' + ) return days_exclude @@ -202,7 +206,8 @@ def expand_time_windows(periods: List[TimeWindow]) -> List[pd.Timestamp]: days_exclude = [] for window in periods: days_exclude += pd.date_range( - window.first_day, window.last_day, freq='D').to_list() + window.first_day, window.last_day, freq='D' + ).to_list() return list(set(days_exclude)) @@ -223,13 +228,16 @@ def human_readable_number(number: float) -> str: while abs(number) >= 1000 and magnitude < 4: magnitude += 1 number /= 1000.0 - readable_number = '{}{}'.format('{:f}'.format(number).rstrip('0').rstrip('.'), - ['', 'K', 'M', 'B', 'tn'][magnitude]) + readable_number = '{}{}'.format( + '{:f}'.format(number).rstrip('0').rstrip('.'), + ['', 'K', 'M', 'B', 'tn'][magnitude], + ) return readable_number -def default_geo_assignment(geo_level_time_series: pd.DataFrame, - geo_eligibility: pd.DataFrame) -> pd.DataFrame: +def default_geo_assignment( + geo_level_time_series: pd.DataFrame, geo_eligibility: pd.DataFrame +) -> pd.DataFrame: """Set the default assignment eligibility for missing geos. Geos missing in the geo assignment table but present in the geo level time @@ -254,19 +262,25 @@ def default_geo_assignment(geo_level_time_series: pd.DataFrame, geo_eligibility['geo'] = pd.to_numeric(geo_eligibility['geo']) missing_geos = list( - set(geo_level_time_series['geo']) - set(geo_eligibility['geo'])) - - return geo_eligibility.append( - pd.DataFrame({ - 'geo': missing_geos, - 'control': 1, - 'treatment': 1, - 'exclude': 1 - })).sort_values(by='geo').reset_index(drop=True) - - -def plot_iroas_over_time(iroas_df: pd.DataFrame, experiment_dates: pd.DataFrame, - cooldown_date: pd.DataFrame): + set(geo_level_time_series['geo']) - set(geo_eligibility['geo']) + ) + + return pd.concat( + [ + geo_eligibility, + pd.DataFrame( + {'geo': missing_geos, 'control': 1, 'treatment': 1, 'exclude': 1} + ) + ], + ignore_index=True, + ).sort_values(by='geo').reset_index(drop=True) + + +def plot_iroas_over_time( + iroas_df: pd.DataFrame, + experiment_dates: pd.DataFrame, + cooldown_date: pd.DataFrame, +): """Returns a chart of the iROAS estimate over time with confidence bands. This function provides a visualization of the evolution of the iROAS estimate @@ -276,8 +290,8 @@ def plot_iroas_over_time(iroas_df: pd.DataFrame, experiment_dates: pd.DataFrame, Args: iroas_df: a dataframe with columns: date, lower, mean, upper experiment_dates: dataframe with columns (date, color) which contains two - dates for each period (start, end), and the column color is the label - used in the chart to refer to the corresponding period, e.g. "Experiment + dates for each period (start, end), and the column color is the label used + in the chart to refer to the corresponding period, e.g. "Experiment period" or "Pretes period". cooldown_date: dataframe with column (date, color) with only one entry, where date indicates the last day in the cooldown period, and color is the @@ -286,56 +300,83 @@ def plot_iroas_over_time(iroas_df: pd.DataFrame, experiment_dates: pd.DataFrame, Returns: iroas_chart: Chart containing the plot. 
""" - iroas_base = alt.Chart(iroas_df).mark_line().encode( - x=alt.X('date:T', axis=alt.Axis(title='', format=('%b %e')))) + iroas_base = ( + alt.Chart(iroas_df) + .mark_line() + .encode(x=alt.X('date:T', axis=alt.Axis(title='', format='%b %e'))) + ) iroas_selection = alt.selection_single( fields=['date'], nearest=True, on='mouseover', empty='none', - clear='mouseout') + clear='mouseout', + ) iroas_lines = iroas_base.mark_line().encode( - y=alt.Y('mean:Q', axis=alt.Axis(title=' ', format='.3'))) + y=alt.Y('mean:Q', axis=alt.Axis(title=' ', format='.3')) + ) iroas_points = iroas_lines.mark_point().transform_filter(iroas_selection) iroas_rule1 = iroas_base.mark_rule().encode( - tooltip=['date:T', 'mean:Q', 'lower:Q', 'upper:Q']) + tooltip=['date:T', 'mean:Q', 'lower:Q', 'upper:Q'] + ) iroas_rule = iroas_rule1.encode( - opacity=alt.condition(iroas_selection, alt.value(0.3), alt.value( - 0))).add_selection(iroas_selection) - - iroas_ci_bands_rule = alt.Chart(iroas_df).mark_area(color='gray').encode( - alt.X('date:T'), y='lower:Q', y2='upper:Q', opacity=alt.value(0.5)) - - date_rule = alt.Chart( - experiment_dates[experiment_dates['color'] == - 'Experiment period']).mark_rule(strokeWidth=2).encode( - x='date:T', - color=alt.Color( - 'color', - scale=alt.Scale( - domain=[ - 'Experiment period', - 'End of cooldown period', - 'iROAS estimate' - ], - range=['black', 'black', '#1f77b4']))) - cooldown_date_rule = alt.Chart(cooldown_date).mark_rule( - strokeWidth=2, strokeDash=[5, 2], color='black').encode( - x='date:T', color='color:N') + opacity=alt.condition(iroas_selection, alt.value(0.3), alt.value(0)) + ).add_selection(iroas_selection) + + iroas_ci_bands_rule = ( + alt.Chart(iroas_df) + .mark_area(color='gray') + .encode( + alt.X('date:T'), y='lower:Q', y2='upper:Q', opacity=alt.value(0.5) + ) + ) + + date_rule = ( + alt.Chart( + experiment_dates[experiment_dates['color'] == 'Experiment period'] + ) + .mark_rule(strokeWidth=2) + .encode( + x='date:T', + color=alt.Color( + 'color', + scale=alt.Scale( + domain=[ + 'Experiment period', + 'End of cooldown period', + 'iROAS estimate', + ], + range=['black', 'black', '#1f77b4'], + ), + ), + ) + ) + cooldown_date_rule = ( + alt.Chart(cooldown_date) + .mark_rule(strokeWidth=2, strokeDash=[5, 2], color='black') + .encode(x='date:T', color='color:N') + ) # Compile chart - iroas_chart = alt.layer(iroas_lines, iroas_rule, iroas_points, date_rule, - cooldown_date_rule, iroas_ci_bands_rule) + iroas_chart = alt.layer( + iroas_lines, + iroas_rule, + iroas_points, + date_rule, + cooldown_date_rule, + iroas_ci_bands_rule, + ) return iroas_chart -def infer_frequency(data: pd.DataFrame, date_index: str, - series_index: str) -> str: +def infer_frequency( + data: pd.DataFrame, date_index: str, series_index: str +) -> str: """Infers frequency of data from pd.DataFrame with multiple indices. 
Infers frequency of data from pd.DataFrame with two indices, one for the slice @@ -364,34 +405,37 @@ def infer_frequency(data: pd.DataFrame, date_index: str, series_names = data.index.get_level_values(series_index).unique().tolist() series_frequencies = [] for series in series_names: - observed_times = data.iloc[data.index.get_level_values(series_index) == - series].index.get_level_values(date_index) + observed_times = data.iloc[ + data.index.get_level_values(series_index) == series + ].index.get_level_values(date_index) n_steps = len(observed_times) if n_steps > 1: time_diffs = ( observed_times[1:n_steps] - - observed_times[0:(n_steps - 1)]).astype('timedelta64[D]').array + observed_times[0:(n_steps - 1)]).astype('timedelta64[s]').values - min_frequency = np.min(time_diffs) + # Compute the frequency in days + min_frequency = np.min(time_diffs).astype('float') / (60 * 60 * 24) series_frequencies.append(min_frequency) if not series_frequencies: raise ValueError( - 'At least one series with more than one observation must be provided.') + 'At least one series with more than one observation must be provided.' + ) if series_frequencies.count(series_frequencies[0]) != len(series_frequencies): raise ValueError( - 'The provided time series seem to have irregular frequencies.') + 'The provided time series seem to have irregular frequencies.' + ) try: - frequency = { - 1: 'D', - 7: 'W' - }[series_frequencies[0]] + frequency = {1: 'D', 7: 'W'}[series_frequencies[0]] except KeyError: - raise ValueError('Frequency could not be identified. Got %d days.' % - series_frequencies[0]) + raise ValueError( + 'Frequency could not be identified. Got' + f' {int(series_frequencies[0])} days.' + ) return frequency diff --git a/matched_markets/notebook/design_colab_for_tbrmm.ipynb b/matched_markets/notebook/design_colab_for_tbrmm.ipynb index 26ef92a..9f27d87 100644 --- a/matched_markets/notebook/design_colab_for_tbrmm.ipynb +++ b/matched_markets/notebook/design_colab_for_tbrmm.ipynb @@ -14,6 +14,15 @@ "Using this colab, you can create a geoexperiment design for a client using TBR in combination with Matched Markets. In the following we will use the acronyms TBR for Time Based Regression and MM for Matched Markets. For a general introduction to TBR and MM, please refer to the TBR [paper](https://research.google/pubs/pub45950/), the MM [paper](https://research.google/pubs/pub48983/), and this [introduction](http://www.unofficialgoogledatascience.com/2016/06/estimating-causal-effects-using-geo.html) to geo experiments." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pip install git+https://github.com/google/matched_markets.git" + ] + }, { "cell_type": "code", "execution_count": null, @@ -268,7 +277,7 @@ "\n", "max_feasible_number_of_designs = 5 * 10 ** 6\n", "\n", - "if MMclass.count_max_designs() \u003c max_feasible_number_of_designs:\n", + "if MMclass.count_max_designs() < max_feasible_number_of_designs:\n", " matched_designs = MMclass.exhaustive_search()\n", "else:\n", " matched_designs = MMclass.greedy_search()\n", @@ -310,7 +319,7 @@ "first_day = geo_level_time_series[\"date\"].max() - pd.Timedelta(\n", " str(experiment_duration_in_weeks) + \"W\")\n", "most_recent_geo_level_time_series = geo_level_time_series[\n", - " geo_level_time_series['date'] \u003e first_day]\n", + " geo_level_time_series['date'] > first_day]\n", "\n", "total_response = most_recent_geo_level_time_series[\"response\"].sum()\n", "total_spend = most_recent_geo_level_time_series[\"cost\"].sum()\n", @@ -357,7 +366,7 @@ " \"\"\"\n", " if float(row[\"Minimum detectable iROAS\"]) == minimum_detectable_iROAS:\n", " return pd.Series('background-color: lightgreen', row.index)\n", - " elif float(row[\"Minimum detectable iROAS\"]) \u003e minimum_detectable_iROAS:\n", + " elif float(row[\"Minimum detectable iROAS\"]) > minimum_detectable_iROAS:\n", " return pd.Series('background-color: orange', row.index)\n", " else:\n", " return pd.Series('background-color: beige', row.index)\n", @@ -367,7 +376,7 @@ " Color a cell in red if its value is larger than the value\n", " in input\n", " \"\"\"\n", - " color = 'red' if float(val.strip(' %')) \u003e value else 'black'\n", + " color = 'red' if float(val.strip(' %')) > value else 'black'\n", " return 'color: %s' % color\n", "\n", "def flag_warning_revenue(val, value):\n", @@ -375,7 +384,7 @@ " Color a cell in red if its value is smaller than the value\n", " in input\n", " \"\"\"\n", - " color = 'red' if float(val.strip(' %')) \u003c value else 'black'\n", + " color = 'red' if float(val.strip(' %')) < value else 'black'\n", " return 'color: %s' % color\n", "\n", "\n", @@ -399,8 +408,7 @@ " subset=[\"Revenue covered by treatment group\"]).apply(\n", " is_optimal_design, axis=1)\n", "\n", - "designs_table\n", - "" + "designs_table\n" ] }, { @@ -579,8 +587,7 @@ " ).configure_title(\n", " fontSize=title_font_size\n", " ).display()\n", - "\n", - "" + "\n" ] }, { @@ -619,7 +626,7 @@ "\n", "print(f\"The design has Power {100 * power_level:.3}+% with Type-I error \" +\n", " f\"{100 *(1 - confidence_level):.3}% for testing H0: iROAS=0 vs \" +\n", - " f\"H1: iROAS \u003e= {final_design['Minimum detectable iROAS'].values[0]}\")" + " f\"H1: iROAS >= {final_design['Minimum detectable iROAS'].values[0]}\")" ] }, { @@ -699,8 +706,7 @@ }, "outputs": [], "source": [ - "\n", - "" + "\n" ] } ], diff --git a/matched_markets/notebook/post_analysis_colab_for_tbrmm.ipynb b/matched_markets/notebook/post_analysis_colab_for_tbrmm.ipynb index 7795c70..2aabfda 100644 --- a/matched_markets/notebook/post_analysis_colab_for_tbrmm.ipynb +++ b/matched_markets/notebook/post_analysis_colab_for_tbrmm.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pip install git+https://github.com/google/matched_markets.git" + ] + }, { "cell_type": "code", "execution_count": null, @@ -128,7 +137,7 @@ " f'Please correct them.\\n{Style.RESET_ALL}'))\n", "\n", "geox_data = data.copy()\n", - "geox_data 
= geox_data[geox_data[\"date\"]\u003e=pretest_start_date]\n", + "geox_data = geox_data[geox_data[\"date\"]>=pretest_start_date]\n", "geox_data[\"period\"] = geox_data[\"date\"].apply(\n", " lambda row: 0 if row in pd.Interval(\n", " pretest_start_date, pretest_end_date, closed=\"both\") else\n", @@ -139,12 +148,12 @@ "total_cost = geox_data.loc[geox_data[\"period\"]==1, \"cost\"].sum()\n", "print(\"Total cost: {}\".format(human_readable_number(total_cost)))\n", "\n", - "treatment_cost = geox_data.loc[(geox_data[\"period\"] == 1) \u0026\n", + "treatment_cost = geox_data.loc[(geox_data[\"period\"] == 1) &\n", " (geox_data[\"assignment\"] == group_treatment),\n", " \"cost\"].sum()\n", "print(\"Treatment cost: {}\".format(human_readable_number(treatment_cost)))\n", "\n", - "control_cost = geox_data.loc[(geox_data[\"period\"] == 1) \u0026\n", + "control_cost = geox_data.loc[(geox_data[\"period\"] == 1) &\n", " (geox_data[\"assignment\"] == group_control),\n", " \"cost\"].sum()\n", "print(\"Control cost: {}\".format(human_readable_number(control_cost)))\n", @@ -338,8 +347,7 @@ " ).configure_title(\n", " fontSize=title_font_size\n", " ).display()\n", - "\n", - "" + "\n" ] }, { @@ -374,7 +382,7 @@ " results.lower.values[0] * average_order_value,\n", " results.upper.values[0] * average_order_value))\n", "\n", - " print(f'Probability that the iROAS is \u003e= ' +\n", + " print(f'Probability that the iROAS is >= ' +\n", " f'{results.posterior_threshold.values[0]}:' +\n", " f' {results.probability.values[0]}')\n", "\n", @@ -440,14 +448,14 @@ " periods).reset_index().rename(columns={0: \"response\"})\n", " lower = np.diff(delta_response.ppf(alpha), prepend=0)\n", " lower = np.concatenate(\n", - " (pointwise_difference.loc[pointwise_difference[\"date\"] \u003c test_start_date,\n", + " (pointwise_difference.loc[pointwise_difference[\"date\"] < test_start_date,\n", " \"response\"].values, lower))\n", " upper = np.diff(delta_response.ppf(1 - alpha), prepend=0)\n", " upper = np.concatenate(\n", - " (pointwise_difference.loc[pointwise_difference[\"date\"] \u003c test_start_date,\n", + " (pointwise_difference.loc[pointwise_difference[\"date\"] < test_start_date,\n", " \"response\"].values, upper))\n", " ci_bands = pd.DataFrame({\n", - " \"date\": geox_data.loc[geox_data[\"period\"]\u003e=0, \"date\"].unique(),\n", + " \"date\": geox_data.loc[geox_data[\"period\"]>=0, \"date\"].unique(),\n", " \"lower\": np.round(lower, 2),\n", " \"upper\": np.round(upper, 2),\n", " \"pointwise_difference\": np.round(pointwise_difference[\"response\"], 2)\n", @@ -461,7 +469,7 @@ " cumul_effect[\"date\"].between(test_start_date, cooldown_end_date),\n", " \"response\"]\n", " cumulative_df = pd.DataFrame({\n", - " \"date\": geox_data.loc[geox_data[\"period\"]\u003e=1, \"date\"].unique(),\n", + " \"date\": geox_data.loc[geox_data[\"period\"]>=1, \"date\"].unique(),\n", " \"lower\": np.round(delta_response.ppf(alpha), 2),\n", " \"upper\": np.round(delta_response.ppf(1 - alpha), 2),\n", " \"cumulative_effect\": np.round(cumul_effect, 2)\n", @@ -667,7 +675,7 @@ " tbr_results.tbr_cost.causal_effect(\n", " (tbr_results.periods.test, tbr_results.periods.cooldown)))\n", " iroas_df = pd.DataFrame({\n", - " \"date\": geox_data.loc[geox_data[\"period\"]\u003e=1, \"date\"].unique(),\n", + " \"date\": geox_data.loc[geox_data[\"period\"]>=1, \"date\"].unique(),\n", " \"lower\": np.round(iroas_dist.ppf(alpha) / cost, 2),\n", " \"upper\": np.round(iroas_dist.ppf(1 - alpha) / cost, 2),\n", " \"mean\": np.round(iroas_dist.mean() / cost, 2)\n", 
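Note on the infer_frequency() update in matched_markets/methodology/utils.py above: the rewrite measures gaps between observations at second resolution and rescales the smallest gap to days before mapping it to 'D' or 'W'. A minimal sketch of that conversion only, using a hypothetical weekly date index (not data from this change):

import numpy as np
import pandas as pd

# Hypothetical weekly observation dates; consecutive gaps are 7 days.
observed_times = pd.date_range('2020-03-01', periods=4, freq='W')

# Same conversion as the updated infer_frequency: gaps taken at second
# resolution, then the smallest gap rescaled to days.
time_diffs = (observed_times[1:] - observed_times[:-1]).astype('timedelta64[s]').values
min_frequency = np.min(time_diffs).astype('float') / (60 * 60 * 24)

# A 1-day gap maps to 'D' and a 7-day gap to 'W'; here this prints 'W'.
print({1: 'D', 7: 'W'}[min_frequency])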
diff --git a/matched_markets/tests/test_utils.py b/matched_markets/tests/test_utils.py index 5375b94..cef74a2 100644 --- a/matched_markets/tests/test_utils.py +++ b/matched_markets/tests/test_utils.py @@ -15,14 +15,15 @@ """Test utilities. """ +import altair as alt from matched_markets.methodology import common_classes from matched_markets.methodology import utils -import altair as alt import numpy as np import pandas as pd import unittest + TimeWindow = common_classes.TimeWindow @@ -139,8 +140,8 @@ def testExpandTimeWindows(self): days_to_remove = utils.find_days_to_exclude(day_week_exclude) periods = utils.expand_time_windows(days_to_remove) expected = [ - pd.Timestamp('2020-10-10', freq='D'), - pd.Timestamp('2020-08-10', freq='D'), + pd.Timestamp('2020-10-10'), + pd.Timestamp('2020-08-10'), ] expected += pd.date_range( start='2020-11-10', end='2020-12-10', freq='D').to_list()
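Closing note. The test_utils.py change above drops the freq='D' argument from pd.Timestamp, which newer pandas releases deprecate and remove; the expected timestamps compare equal with or without it. In the geoeligibility.py refactor earlier in the diff, the rewritten GeoAssignments expresses the fixed groups as set differences (for example, c_fixed = c - (t | x)), which is equivalent to the earlier intersection-with-complement form. A small usage sketch of the refactored class, with a hypothetical three-geo eligibility table rather than repository data:

import pandas as pd
from matched_markets.methodology.geoeligibility import GeoEligibility

# Hypothetical eligibility table: geo '1' is control-only, geo '2' is
# treatment-only, geo '3' may join either group or be excluded.
df = pd.DataFrame({
    'geo': ['1', '2', '3'],
    'control': [1, 0, 1],
    'treatment': [0, 1, 1],
    'exclude': [0, 0, 1],
})

assignments = GeoEligibility(df).get_eligible_assignments()
print(assignments.c_fixed)  # {'1'}: must be assigned to control.
print(assignments.t_fixed)  # {'2'}: must be assigned to treatment.
print(assignments.ctx)      # {'3'}: control, treatment, or excluded.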