Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
336563d
Fix error cannot convert from timedelta64 to be compatible with new p…
marcolongfils Jun 13, 2023
df4299e
Revert setup.py back to the previous version
a-googler Feb 12, 2025
b582ab1
Revert BUILD file
sophiechengeox Feb 12, 2025
d7c44b7
Revert BUILD to previous version
sophiechengeox Feb 12, 2025
7950cda
Revert salesandcost.py to previous version
sophiechengeox Feb 12, 2025
19ec871
Delete matched_markets/methodology/robust_iroas.py
sophiechengeox Feb 12, 2025
1a9f788
Delete matched_markets/methodology/robust.py
sophiechengeox Feb 12, 2025
45cf1a1
Delete matched_markets/methodology/tbrmatchedmarkets_v2.py
sophiechengeox Feb 12, 2025
4d78987
Delete matched_markets/methodology/base.py
sophiechengeox Feb 12, 2025
ac80436
Revert semantics.py
sophiechengeox Feb 12, 2025
5d16e65
Revert tbr.py
sophiechengeox Feb 12, 2025
3ad0531
Revert tbr_iroas.py
sophiechengeox Feb 12, 2025
ea32030
Revert tbrdiagnostics.py
sophiechengeox Feb 12, 2025
1e98da5
Revert tbrmatchedmarkets.py
sophiechengeox Feb 12, 2025
d2aa979
Revert tbrmmdata.py
sophiechengeox Feb 12, 2025
e3da83b
Revert tbrmmdesignparameters.py
sophiechengeox Feb 12, 2025
019982f
Revert tbrmmdiagnostics.py
sophiechengeox Feb 12, 2025
bfd0b06
Revert tbrmmscore.py
sophiechengeox Feb 12, 2025
ca5c984
Revert post_analysis_colab_for_tbrmm.ipynb
sophiechengeox Feb 12, 2025
8f5a8f0
Delete matched_markets/tests/test_robust.py
sophiechengeox Feb 12, 2025
333d857
Delete matched_markets/tests/test_robust_iroas.py
sophiechengeox Feb 12, 2025
257b487
Revert test_semantics.py
sophiechengeox Feb 12, 2025
24f1136
Revert test_tbr_iroas.py
sophiechengeox Feb 12, 2025
56270df
Revert test_tbrdiagnostics.py
sophiechengeox Feb 12, 2025
40ee389
Revert test_tbrmatchedmarkets3.py
sophiechengeox Feb 12, 2025
a564278
Revert test_tbrmmdata.py
sophiechengeox Feb 12, 2025
61d691e
Revert test_tbrmmdesignparameters.py
sophiechengeox Feb 12, 2025
729c5f6
Revert test_tbrmmdiagnostics.py
sophiechengeox Feb 12, 2025
2da03c6
Improve HeapDict implementation with better docstrings and type hints
GiraldoSN Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added .history/main_20250213065956.py
Empty file.
36 changes: 36 additions & 0 deletions .history/main_20250213070010.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import itertools
import numpy as np
import pandas as pd
from matched_markets.methodology.tbrmmdata import TBRMMData
from matched_markets.methodology.tbrmatchedmarkets import TBRMatchedMarkets
from matched_markets.methodology.tbrmmdiagnostics import TBRMMDiagnostics
from matched_markets.methodology.tbrmmdesignparameters import TBRMMDesignParameters

n_geos = 5
n_days = 21

# Geo identifiers are strings ('0' .. '4'); note geos is a set, so its
# iteration order is not guaranteed.
geos = {str(geo) for geo in range(n_geos)}
dates = pd.date_range('2020-03-01', periods=n_days)

# One record per (geo, date) pair: all dates for one geo, then the next geo.
df_data = [{'date': date, 'geo': geo} for geo in geos for date in dates]
df = pd.DataFrame(df_data)
response_column = 'sales'

# Create sales data.
def day_geo_sales(geo, n_days):
    """Return a synthetic daily sales series of length n_days for one geo.

    Larger geo numbers produce a higher base level (100 * geo) and a steeper
    daily trend (10 * geo + 1 per day), plus uniform random noise in [0, 10).
    """
    series = []
    for day in range(n_days):
        trend_value = 100 * geo + 10 * geo * day + day
        series.append(trend_value + np.random.randint(10))
    return series

# Fill in the synthetic response, one geo at a time; each geo's rows are in
# date order, matching the order of the generated series.
df[response_column] = 0.0
for geo in geos:
    geo_mask = df.geo == geo
    df.loc[geo_mask, response_column] = day_geo_sales(int(geo), n_days)

# Design parameters: n_test=14 (presumably a 14-day test period -- confirm
# against TBRMMDesignParameters), target iROAS of 3.0, and a budget search
# range of 0.1 to 300000.
parameters = TBRMMDesignParameters(n_test=14, iroas=3.0,
                                   budget_range=(0.1, 300000))
# Wrap the geo/date/sales DataFrame for the matched-markets methodology.
data = TBRMMData(df, response_column)

# Search for candidate market-matching designs.
# NOTE(review): greedy_search semantics come from the matched_markets
# package -- confirm behavior against the installed library version.
mm = TBRMatchedMarkets(data, parameters)
designs = mm.greedy_search()
190 changes: 190 additions & 0 deletions .history/matched_markets/methodology/geoeligibility_20250213065231.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright 2020 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""TBR Matched Markets preanalysis.
"""
from typing import List, Optional, Set, Text, Union
import dataclasses

# A geo is referenced either by its string ID or by a positional index.
GeoRef = Union[Text, int]


@dataclasses.dataclass
class GeoAssignments:
  """Representation of all possible geo assignments.

  Every attribute is a set of geo references: string geo IDs or integer
  positional indices.

  Attributes:
    all: Every geo mentioned in any of c, t, x.
    c: Geos that may be assigned to Control.
    t: Geos that may be assigned to Treatment.
    x: Geos that may be excluded.
    c_fixed: Geos whose only option is Control.
    t_fixed: Geos whose only option is Treatment.
    x_fixed: Geos whose only option is exclusion.
    cx: Geos limited to Control or exclusion (never Treatment).
    tx: Geos limited to Treatment or exclusion (never Control).
    ctx: Geos open to either group or exclusion.
    ct: Geos that must join one of the groups and cannot be excluded.
  """
  all: Set[GeoRef]
  c: Set[GeoRef]
  t: Set[GeoRef]
  x: Set[GeoRef]
  t_fixed: Set[GeoRef]
  c_fixed: Set[GeoRef]
  x_fixed: Set[GeoRef]
  ct: Set[GeoRef]
  cx: Set[GeoRef]
  ctx: Set[GeoRef]
  tx: Set[GeoRef]

  def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]):
    self.c = c
    self.t = t
    self.x = x
    self.all = c | t | x
    # Exactly one eligible destination.
    self.c_fixed = c - t - x
    self.t_fixed = t - c - x
    self.x_fixed = x - c - t
    # Exactly two eligible destinations.
    self.ct = (c & t) - x
    self.cx = (c & x) - t
    self.tx = (t & x) - c
    # All three destinations possible.
    self.ctx = c & t & x


class GeoEligibility:
  """Validate a Geo Eligibility Matrix.

  A Geo Eligibility Matrix maps each geo to the set of experiment groups
  (control, treatment) it may join, or marks it as excludable from the
  design. Used in the TBR Matched Markets preanalysis.
  """

  def __init__(self, df):
    """Initialize and validate a GeoEligibility object.

    Args:
      df: A DataFrame with columns 'geo', 'control', 'treatment' and
        'exclude' ('geo' may alternatively be the index). Each of the three
        indicator columns holds 1 if the assignment is possible and 0 if it
        is not. Any combination with at least one 1 is valid; for example
        (0, 1, 0) pins a geo to treatment, (1, 1, 1) leaves all options
        open, and (1, 1, 0) forces inclusion in one of the groups. The
        all-zero row (0, 0, 0) is rejected.

    Attributes:
      data: A validated copy of the DataFrame, indexed by 'geo'.

    Raises:
      ValueError: if (a) a required column is missing; (b) a column or geo
        id is duplicated; (c) an indicator value is not 0 or 1; (d) a row
        of the indicator columns is all zeros.
    """

    df = df.copy().reset_index()

    if 'geo' not in df.columns:
      raise ValueError("There is no column or index 'geo'")

    duplicated_columns = df.columns.duplicated()
    if duplicated_columns.any():
      raise ValueError(
          'Duplicate column(s): ' + ', '.join(df.columns[duplicated_columns]))

    # Geo IDs are handled as strings throughout.
    df.geo = df.geo.astype('str')

    value_columns = ['control', 'treatment', 'exclude']
    missing_columns = [col for col in value_columns if col not in df.columns]
    if missing_columns:
      raise ValueError('Missing column(s): ' + ', '.join(missing_columns))

    # Fix the column order: geo first, then the indicator columns.
    df = df.loc[:, ['geo'] + value_columns]

    duplicate_geo_ids = set(df['geo'][df['geo'].duplicated()])
    if duplicate_geo_ids:
      raise ValueError("'geo' has duplicate values: " +
                       ', '.join(str(geo_id) for geo_id in duplicate_geo_ids))

    if any(not set(df[col]) <= {0, 1} for col in value_columns):
      raise ValueError('GeoEligibility objects must have only values '
                       '0, 1 in columns ' + ', '.join(value_columns))

    all_zeros = df[value_columns].sum(axis=1) == 0
    if all_zeros.any():
      raise ValueError(
          'Three zeros found for geo(s) ' + ', '.join(df['geo'][all_zeros]))

    df.set_index('geo', inplace=True)
    self.data = df

  def __str__(self):
    n_geos = self.data.shape[0]
    return f'Geo eligibility matrix with {n_geos} geos'

  def get_eligible_assignments(self, geos: Optional[List[GeoRef]] = None,
                               indices: bool = False) -> GeoAssignments:
    """Get an object representing all possible geo assignment groups.

    Args:
      geos: Geo IDs to restrict to; all geos are used when unspecified. The
        ordering matters when 'indices' is requested.
      indices: If True, report positional index numbers into 'geos' rather
        than geo IDs; requires 'geos' to be given.

    Returns:
      A GeoAssignments object.

    Raises:
      ValueError: if indices is True but geos was not specified.
    """

    df = self.data  # Indexed by geo ID.

    if not geos:
      if indices:
        raise ValueError("'geos' is not specified but indices=True")
    else:
      df = df.loc[list(geos)]
      if indices:
        # Positional indices replace geo IDs as the index.
        df = df.reset_index()

    # Membership sets: geo IDs, or positions when indices=True.
    return GeoAssignments(
        set(df.index[df['control'] == 1]),
        set(df.index[df['treatment'] == 1]),
        set(df.index[df['exclude'] == 1]))
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from typing import List, Optional, Set, Text, Union
import dataclasses
import pandas as pd

# A geo is referenced either by its string ID or by a positional index.
GeoRef = Union[Text, int]

@dataclasses.dataclass
class GeoAssignments:
    """All eligible geo-to-group assignments, held as sets of geo refs."""
    all: Set[GeoRef]
    c: Set[GeoRef]
    t: Set[GeoRef]
    x: Set[GeoRef]
    c_fixed: Set[GeoRef]
    t_fixed: Set[GeoRef]
    x_fixed: Set[GeoRef]
    ct: Set[GeoRef]
    cx: Set[GeoRef]
    ctx: Set[GeoRef]
    tx: Set[GeoRef]

    def __init__(self, c: Set[GeoRef], t: Set[GeoRef], x: Set[GeoRef]):
        self.c = c
        self.t = t
        self.x = x
        universe = c | t | x
        self.all = universe

        # Complements within the universe of all known geos.
        outside_c = universe - c
        outside_t = universe - t
        outside_x = universe - x

        # Exactly one allowed destination.
        self.c_fixed = c & outside_t & outside_x
        self.t_fixed = outside_c & t & outside_x
        self.x_fixed = outside_c & outside_t & x

        # Exactly two allowed destinations.
        self.ct = c & t & outside_x
        self.cx = c & outside_t & x
        self.tx = outside_c & t & x

        # All three destinations allowed.
        self.ctx = c & t & x

class GeoEligibility:
    """Validate a Geo Eligibility Matrix.

    A Geo Eligibility Matrix maps each geo to the experiment groups it may
    join (control, treatment) or marks it as excludable from the design.

    Attributes:
        data: The validated DataFrame, indexed by 'geo', with 0/1 indicator
            columns 'control', 'treatment' and 'exclude'.
    """

    def __init__(self, df: pd.DataFrame):
        """Initialize and validate a GeoEligibility object.

        Args:
            df: A DataFrame with columns 'geo', 'control', 'treatment' and
                'exclude' (codes: 1 = possible, 0 = not possible). 'geo' may
                alternatively be supplied as the index.

        Raises:
            ValueError: if a required column is missing, a column or geo id
                is duplicated, an indicator value is not 0 or 1, or a row of
                the indicator columns is all zeros.
        """
        # Preserve the index: 'geo' is allowed to arrive as the index, so a
        # reset_index(drop=True) would silently discard it.
        df = df.copy().reset_index()

        required_columns = {'geo', 'control', 'treatment', 'exclude'}
        if not required_columns.issubset(df.columns):
            missing = required_columns - set(df.columns)
            # sorted() keeps the error message deterministic.
            raise ValueError(
                f'Missing required column(s): {", ".join(sorted(missing))}')

        if df.columns.duplicated().any():
            raise ValueError('Duplicate columns found in DataFrame.')

        df['geo'] = df['geo'].astype(str)

        # Keep only the canonical columns, dropping e.g. an auxiliary column
        # produced by resetting an unnamed index.
        df = df.loc[:, ['geo', 'control', 'treatment', 'exclude']]

        if df.duplicated(subset=['geo']).any():
            dup_geo_ids = df['geo'][df.duplicated(subset=['geo'])].unique()
            raise ValueError(
                f'Duplicate geo values found: {", ".join(dup_geo_ids)}')

        value_columns = ['control', 'treatment', 'exclude']
        if not all(df[col].isin([0, 1]).all() for col in value_columns):
            raise ValueError(
                'Columns control, treatment, and exclude must contain only 0 or 1.')

        zero_mask = df[value_columns].sum(axis=1) == 0
        if zero_mask.any():
            zero_rows = df['geo'][zero_mask]
            raise ValueError(
                f'Invalid rows with all zeros found for geos: {", ".join(zero_rows)}')

        df.set_index('geo', inplace=True)
        self.data = df

    def __str__(self):
        return f'Geo eligibility matrix with {self.data.shape[0]} geos'

    def get_eligible_assignments(self, geos: Optional[List['GeoRef']] = None,
                                 indices: bool = False) -> 'GeoAssignments':
        """Get an object representing all eligible geo assignments.

        Args:
            geos: Geo IDs to restrict to; all geos when None. The ordering
                matters when indices=True.
            indices: Report positional indices into 'geos' instead of IDs.

        Returns:
            A GeoAssignments object.

        Raises:
            ValueError: if indices=True but geos is None.
        """
        if indices and geos is None:
            raise ValueError('`geos` must be specified when `indices=True`.')

        # list() guards against tuple input, which .loc would interpret as a
        # (possibly MultiIndex) single key rather than a list of labels.
        df = self.data if geos is None else self.data.loc[list(geos)]

        if indices:
            df = df.reset_index()

        c = set(df.index[df['control'] == 1])
        t = set(df.index[df['treatment'] == 1])
        x = set(df.index[df['exclude'] == 1])

        return GeoAssignments(c, t, x)
80 changes: 80 additions & 0 deletions .history/matched_markets/methodology/heapdict_20250213065231.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright 2020 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""TBR Matched Markets: utilities.
"""

import collections
import heapq

from typing import Any, Dict, List, TypeVar

DictKey = TypeVar('DictKey', str, int, float)


class HeapDict:
  """A dictionary of bounded-size priority queues.

  Each key maps to its own min-heap capped at a fixed maximum size. When a
  push would exceed the cap, the smallest item in that heap is discarded, so
  every heap retains the largest items pushed for its key.

  Items must be mutually comparable; instances of an arbitrary class work
  as long as it defines a custom __lt__ method.

  Example:
    h = HeapDict(1)  # Keep only the largest item per key.
    h.push(10, 0.5)
    h.push(10, 1.0)
    h.push(20, 1.0)
    h.push(20, 2.0)
    h.get_result()  # Returns {10: [1.0], 20: [2.0]}.
  """

  def __init__(self, size: int):
    """Initialize a HeapDict.

    Args:
      size: Maximum number of items retained per key.
    """
    self._size = size
    self._result = collections.defaultdict(list)

  def push(self, key: DictKey, item: Any):
    """Add an item to the heap for `key`, evicting the smallest when full.

    Args:
      key: A dictionary key: string, integer, or float.
      item: Any comparable object; heap order is based on it.
    """
    heap = self._result[key]
    if len(heap) >= self._size:
      # At capacity: push the item and drop the smallest in one step.
      heapq.heappushpop(heap, item)
    else:
      heapq.heappush(heap, item)

  def get_result(self) -> Dict[DictKey, List[Any]]:
    """Return a snapshot mapping each key to its items, largest first.

    Returns:
      A new dictionary whose values are freshly sorted descending lists.
    """
    return {key: sorted(heap, reverse=True)
            for key, heap in self._result.items()}
Loading