Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
336563d
Fix error cannot convert from timedelta64 to be compatible with new p…
marcolongfils Jun 13, 2023
df4299e
Revert setup.py back to the previous version
a-googler Feb 12, 2025
b582ab1
Revert BUILD file
sophiechengeox Feb 12, 2025
d7c44b7
Revert BUILD to previous version
sophiechengeox Feb 12, 2025
7950cda
Revert salesandcost.py to previous version
sophiechengeox Feb 12, 2025
19ec871
Delete matched_markets/methodology/robust_iroas.py
sophiechengeox Feb 12, 2025
1a9f788
Delete matched_markets/methodology/robust.py
sophiechengeox Feb 12, 2025
45cf1a1
Delete matched_markets/methodology/tbrmatchedmarkets_v2.py
sophiechengeox Feb 12, 2025
4d78987
Delete matched_markets/methodology/base.py
sophiechengeox Feb 12, 2025
ac80436
Revert semantics.py
sophiechengeox Feb 12, 2025
5d16e65
Revert tbr.py
sophiechengeox Feb 12, 2025
3ad0531
Revert tbr_iroas.py
sophiechengeox Feb 12, 2025
ea32030
Revert tbrdiagnostics.py
sophiechengeox Feb 12, 2025
1e98da5
Revert tbrmatchedmarkets.py
sophiechengeox Feb 12, 2025
d2aa979
Revert tbrmmdata.py
sophiechengeox Feb 12, 2025
e3da83b
Revert tbrmmdesignparameters.py
sophiechengeox Feb 12, 2025
019982f
Revert tbrmmdiagnostics.py
sophiechengeox Feb 12, 2025
bfd0b06
Revert tbrmmscore.py
sophiechengeox Feb 12, 2025
ca5c984
Revert post_analysis_colab_for_tbrmm.ipynb
sophiechengeox Feb 12, 2025
8f5a8f0
Delete matched_markets/tests/test_robust.py
sophiechengeox Feb 12, 2025
333d857
Delete matched_markets/tests/test_robust_iroas.py
sophiechengeox Feb 12, 2025
257b487
Revert test_semantics.py
sophiechengeox Feb 12, 2025
24f1136
Revert test_tbr_iroas.py
sophiechengeox Feb 12, 2025
56270df
Revert test_tbrdiagnostics.py
sophiechengeox Feb 12, 2025
40ee389
Revert test_tbrmatchedmarkets3.py
sophiechengeox Feb 12, 2025
a564278
Revert test_tbrmmdata.py
sophiechengeox Feb 12, 2025
61d691e
Revert test_tbrmmdesignparameters.py
sophiechengeox Feb 12, 2025
729c5f6
Revert test_tbrmmdiagnostics.py
sophiechengeox Feb 12, 2025
2da03c6
Improve HeapDict implementation with better docstrings and type hints
GiraldoSN Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added .history/main_20250213065956.py
Empty file.
36 changes: 36 additions & 0 deletions .history/main_20250213070010.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import itertools
import numpy as np
import pandas as pd
from matched_markets.methodology.tbrmmdata import TBRMMData
from matched_markets.methodology.tbrmatchedmarkets import TBRMatchedMarkets
from matched_markets.methodology.tbrmmdiagnostics import TBRMMDiagnostics
from matched_markets.methodology.tbrmmdesignparameters import TBRMMDesignParameters

n_geos = 5
n_days = 21

# Geo identifiers are strings ('0' .. '4'); note geos is a set, so its
# iteration order is not guaranteed.
geos = {str(geo) for geo in range(n_geos)}
dates = pd.date_range('2020-03-01', periods=n_days)

# One record per (geo, date) pair: all dates for one geo, then the next geo.
df_data = [{'date': date, 'geo': geo} for geo in geos for date in dates]
df = pd.DataFrame(df_data)
response_column = 'sales'

# Create sales data.
def day_geo_sales(geo, n_days):
    """Return a synthetic daily sales series of length n_days for one geo.

    Larger geo numbers produce a higher base level (100 * geo) and a steeper
    daily trend (10 * geo + 1 per day), plus uniform random noise in [0, 10).
    """
    series = []
    for day in range(n_days):
        trend_value = 100 * geo + 10 * geo * day + day
        series.append(trend_value + np.random.randint(10))
    return series

# Fill in the synthetic response, one geo at a time; each geo's rows are in
# date order, matching the order of the generated series.
df[response_column] = 0.0
for geo in geos:
    geo_mask = df.geo == geo
    df.loc[geo_mask, response_column] = day_geo_sales(int(geo), n_days)

# Design parameters: n_test=14 (presumably a 14-day test period -- confirm
# against TBRMMDesignParameters), target iROAS of 3.0, and a budget search
# range of 0.1 to 300000.
parameters = TBRMMDesignParameters(n_test=14, iroas=3.0,
                                   budget_range=(0.1, 300000))
# Wrap the geo/date/sales DataFrame for the matched-markets methodology.
data = TBRMMData(df, response_column)

# Search for candidate market-matching designs.
# NOTE(review): greedy_search semantics come from the matched_markets
# package -- confirm behavior against the installed library version.
mm = TBRMatchedMarkets(data, parameters)
designs = mm.greedy_search()
190 changes: 190 additions & 0 deletions .history/matched_markets/methodology/geoeligibility_20250213065231.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright 2020 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""TBR Matched Markets preanalysis.
"""
from typing import List, Optional, Set, Text, Union
import dataclasses

# A geo is referenced either by its string ID or by a positional index.
GeoRef = Union[Text, int]


@dataclasses.dataclass
class GeoAssignments:
  """Representation of all possible geo assignments.

  Every attribute is a set of geo references: string geo IDs or integer
  positional indices.

  Attributes:
    all: Every geo mentioned in any of c, t, x.
    c: Geos that may be assigned to Control.
    t: Geos that may be assigned to Treatment.
    x: Geos that may be excluded.
    c_fixed: Geos whose only option is Control.
    t_fixed: Geos whose only option is Treatment.
    x_fixed: Geos whose only option is exclusion.
    cx: Geos limited to Control or exclusion (never Treatment).
    tx: Geos limited to Treatment or exclusion (never Control).
    ctx: Geos open to either group or exclusion.
    ct: Geos that must join one of the groups and cannot be excluded.
  """
  all: Set[GeoRef]
  c: Set[GeoRef]
  t: Set[GeoRef]
  x: Set[GeoRef]
  t_fixed: Set[GeoRef]
  c_fixed: Set[GeoRef]
  x_fixed: Set[GeoRef]
  ct: Set[GeoRef]
  cx: Set[GeoRef]
  ctx: Set[GeoRef]
  tx: Set[GeoRef]

  def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]):
    self.c = c
    self.t = t
    self.x = x
    self.all = c | t | x
    # Exactly one eligible destination.
    self.c_fixed = c - t - x
    self.t_fixed = t - c - x
    self.x_fixed = x - c - t
    # Exactly two eligible destinations.
    self.ct = (c & t) - x
    self.cx = (c & x) - t
    self.tx = (t & x) - c
    # All three destinations possible.
    self.ctx = c & t & x


class GeoEligibility:
  """Validate a Geo Eligibility Matrix.

  A Geo Eligibility Matrix maps each geo to the set of experiment groups
  (control, treatment) it may join, or marks it as excludable from the
  design. Used in the TBR Matched Markets preanalysis.
  """

  def __init__(self, df):
    """Initialize and validate a GeoEligibility object.

    Args:
      df: A DataFrame with columns 'geo', 'control', 'treatment' and
        'exclude' ('geo' may alternatively be the index). Each of the three
        indicator columns holds 1 if the assignment is possible and 0 if it
        is not. Any combination with at least one 1 is valid; for example
        (0, 1, 0) pins a geo to treatment, (1, 1, 1) leaves all options
        open, and (1, 1, 0) forces inclusion in one of the groups. The
        all-zero row (0, 0, 0) is rejected.

    Attributes:
      data: A validated copy of the DataFrame, indexed by 'geo'.

    Raises:
      ValueError: if (a) a required column is missing; (b) a column or geo
        id is duplicated; (c) an indicator value is not 0 or 1; (d) a row
        of the indicator columns is all zeros.
    """

    df = df.copy().reset_index()

    if 'geo' not in df.columns:
      raise ValueError("There is no column or index 'geo'")

    duplicated_columns = df.columns.duplicated()
    if duplicated_columns.any():
      raise ValueError(
          'Duplicate column(s): ' + ', '.join(df.columns[duplicated_columns]))

    # Geo IDs are handled as strings throughout.
    df.geo = df.geo.astype('str')

    value_columns = ['control', 'treatment', 'exclude']
    missing_columns = [col for col in value_columns if col not in df.columns]
    if missing_columns:
      raise ValueError('Missing column(s): ' + ', '.join(missing_columns))

    # Fix the column order: geo first, then the indicator columns.
    df = df.loc[:, ['geo'] + value_columns]

    duplicate_geo_ids = set(df['geo'][df['geo'].duplicated()])
    if duplicate_geo_ids:
      raise ValueError("'geo' has duplicate values: " +
                       ', '.join(str(geo_id) for geo_id in duplicate_geo_ids))

    if any(not set(df[col]) <= {0, 1} for col in value_columns):
      raise ValueError('GeoEligibility objects must have only values '
                       '0, 1 in columns ' + ', '.join(value_columns))

    all_zeros = df[value_columns].sum(axis=1) == 0
    if all_zeros.any():
      raise ValueError(
          'Three zeros found for geo(s) ' + ', '.join(df['geo'][all_zeros]))

    df.set_index('geo', inplace=True)
    self.data = df

  def __str__(self):
    n_geos = self.data.shape[0]
    return f'Geo eligibility matrix with {n_geos} geos'

  def get_eligible_assignments(self, geos: Optional[List[GeoRef]] = None,
                               indices: bool = False) -> GeoAssignments:
    """Get an object representing all possible geo assignment groups.

    Args:
      geos: Geo IDs to restrict to; all geos are used when unspecified. The
        ordering matters when 'indices' is requested.
      indices: If True, report positional index numbers into 'geos' rather
        than geo IDs; requires 'geos' to be given.

    Returns:
      A GeoAssignments object.

    Raises:
      ValueError: if indices is True but geos was not specified.
    """

    df = self.data  # Indexed by geo ID.

    if not geos:
      if indices:
        raise ValueError("'geos' is not specified but indices=True")
    else:
      df = df.loc[list(geos)]
      if indices:
        # Positional indices replace geo IDs as the index.
        df = df.reset_index()

    # Membership sets: geo IDs, or positions when indices=True.
    return GeoAssignments(
        set(df.index[df['control'] == 1]),
        set(df.index[df['treatment'] == 1]),
        set(df.index[df['exclude'] == 1]))
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from typing import List, Optional, Set, Text, Union
import dataclasses
import pandas as pd

# A geo is referenced either by its string ID or by a positional index.
GeoRef = Union[Text, int]

@dataclasses.dataclass
class GeoAssignments:
    """All eligible geo-to-group assignments, held as sets of geo refs."""
    all: Set[GeoRef]
    c: Set[GeoRef]
    t: Set[GeoRef]
    x: Set[GeoRef]
    c_fixed: Set[GeoRef]
    t_fixed: Set[GeoRef]
    x_fixed: Set[GeoRef]
    ct: Set[GeoRef]
    cx: Set[GeoRef]
    ctx: Set[GeoRef]
    tx: Set[GeoRef]

    def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]):
        self.c = c
        self.t = t
        self.x = x
        universe = c | t | x
        self.all = universe

        # Complements within the universe of all known geos.
        outside_c = universe - c
        outside_t = universe - t
        outside_x = universe - x

        # Exactly one allowed destination.
        self.c_fixed = c & outside_t & outside_x
        self.t_fixed = outside_c & t & outside_x
        self.x_fixed = outside_c & outside_t & x

        # Exactly two allowed destinations.
        self.ct = c & t & outside_x
        self.cx = c & outside_t & x
        self.tx = outside_c & t & x

        # All three destinations allowed.
        self.ctx = c & t & x

class GeoEligibility:
    """Validate a Geo Eligibility Matrix.

    A Geo Eligibility Matrix maps each geo to the experiment groups it may
    join (control, treatment) or marks it as excludable from the design.

    Attributes:
        data: The validated DataFrame, indexed by 'geo', with 0/1 indicator
            columns 'control', 'treatment' and 'exclude'.
    """

    def __init__(self, df: pd.DataFrame):
        """Initialize and validate a GeoEligibility object.

        Args:
            df: A DataFrame with columns 'geo', 'control', 'treatment' and
                'exclude' (codes: 1 = possible, 0 = not possible). 'geo' may
                alternatively be supplied as the index.

        Raises:
            ValueError: if a required column is missing, a column or geo id
                is duplicated, an indicator value is not 0 or 1, or a row of
                the indicator columns is all zeros.
        """
        # Preserve the index: 'geo' is allowed to arrive as the index, so a
        # reset_index(drop=True) would silently discard it.
        df = df.copy().reset_index()

        required_columns = {'geo', 'control', 'treatment', 'exclude'}
        if not required_columns.issubset(df.columns):
            missing = required_columns - set(df.columns)
            # sorted() keeps the error message deterministic.
            raise ValueError(
                f'Missing required column(s): {", ".join(sorted(missing))}')

        if df.columns.duplicated().any():
            raise ValueError('Duplicate columns found in DataFrame.')

        df['geo'] = df['geo'].astype(str)

        # Keep only the canonical columns, dropping e.g. an auxiliary column
        # produced by resetting an unnamed index.
        df = df.loc[:, ['geo', 'control', 'treatment', 'exclude']]

        if df.duplicated(subset=['geo']).any():
            dup_geo_ids = df['geo'][df.duplicated(subset=['geo'])].unique()
            raise ValueError(
                f'Duplicate geo values found: {", ".join(dup_geo_ids)}')

        value_columns = ['control', 'treatment', 'exclude']
        if not all(df[col].isin([0, 1]).all() for col in value_columns):
            raise ValueError(
                'Columns control, treatment, and exclude must contain only 0 or 1.')

        zero_mask = df[value_columns].sum(axis=1) == 0
        if zero_mask.any():
            zero_rows = df['geo'][zero_mask]
            raise ValueError(
                f'Invalid rows with all zeros found for geos: {", ".join(zero_rows)}')

        df.set_index('geo', inplace=True)
        self.data = df

    def __str__(self):
        return f'Geo eligibility matrix with {self.data.shape[0]} geos'

    def get_eligible_assignments(self, geos: Optional[List['GeoRef']] = None,
                                 indices: bool = False) -> 'GeoAssignments':
        """Get an object representing all eligible geo assignments.

        Args:
            geos: Geo IDs to restrict to; all geos when None. The ordering
                matters when indices=True.
            indices: Report positional indices into 'geos' instead of IDs.

        Returns:
            A GeoAssignments object.

        Raises:
            ValueError: if indices=True but geos is None.
        """
        if indices and geos is None:
            raise ValueError('`geos` must be specified when `indices=True`.')

        # list() guards against tuple input, which .loc would interpret as a
        # (possibly MultiIndex) single key rather than a list of labels.
        df = self.data if geos is None else self.data.loc[list(geos)]

        if indices:
            df = df.reset_index()

        c = set(df.index[df['control'] == 1])
        t = set(df.index[df['treatment'] == 1])
        x = set(df.index[df['exclude'] == 1])

        return GeoAssignments(c, t, x)
80 changes: 80 additions & 0 deletions .history/matched_markets/methodology/heapdict_20250213065231.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright 2020 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""TBR Matched Markets: utilities.
"""

import collections
import heapq

from typing import Any, Dict, List, TypeVar

DictKey = TypeVar('DictKey', str, int, float)


class HeapDict:
  """A dictionary of bounded-size priority queues.

  Each key maps to its own min-heap capped at a fixed maximum size. When a
  push would exceed the cap, the smallest item in that heap is discarded, so
  every heap retains the largest items pushed for its key.

  Items must be mutually comparable; instances of an arbitrary class work
  as long as it defines a custom __lt__ method.

  Example:
    h = HeapDict(1)  # Keep only the largest item per key.
    h.push(10, 0.5)
    h.push(10, 1.0)
    h.push(20, 1.0)
    h.push(20, 2.0)
    h.get_result()  # Returns {10: [1.0], 20: [2.0]}.
  """

  def __init__(self, size: int):
    """Initialize a HeapDict.

    Args:
      size: Maximum number of items retained per key.
    """
    self._size = size
    self._result = collections.defaultdict(list)

  def push(self, key: DictKey, item: Any):
    """Add an item to the heap for `key`, evicting the smallest when full.

    Args:
      key: A dictionary key: string, integer, or float.
      item: Any comparable object; heap order is based on it.
    """
    heap = self._result[key]
    if len(heap) >= self._size:
      # At capacity: push the item and drop the smallest in one step.
      heapq.heappushpop(heap, item)
    else:
      heapq.heappush(heap, item)

  def get_result(self) -> Dict[DictKey, List[Any]]:
    """Return a snapshot mapping each key to its items, largest first.

    Returns:
      A new dictionary whose values are freshly sorted descending lists.
    """
    return {key: sorted(heap, reverse=True)
            for key, heap in self._result.items()}
Loading