|
3 | 3 | Functions to help generate sensor for different geographical levels |
4 | 4 | """ |
5 | 5 | import pandas as pd |
6 | | -from .data_tools import fill_dates, raw_positive_prop, smoothed_positive_prop |
| 6 | +from .data_tools import (fill_dates, raw_positive_prop, |
| 7 | + smoothed_positive_prop, |
| 8 | + smoothed_tests_per_device, |
| 9 | + raw_tests_per_device) |
7 | 10 |
|
# Minimum number of observations required in order to compute a proportion.
MIN_OBS = 50
# Value handed to the smoothed estimators as their ``pool_days`` argument.
POOL_DAYS = 7
10 | 13 |
|
def generate_sensor_for_states(state_groups, smooth, device, first_date, last_date):
    """
    Fit the sensor over states.

    Args:
        state_groups: pd.core.groupby.generic.DataFrameGroupBy
            Data grouped by "state_id". Each group must carry a "timestamp"
            column and a "totalTest" column, plus "numUniqueDevices" when
            ``device`` is true or "positiveTest" when it is false.
        smooth: bool
            True to use the smoothed estimators, False to use the raw ones.
        device: bool
            True to report tests per device, False to report percent positive.
        first_date, last_date:
            Forwarded unchanged to ``fill_dates`` for every state.
            NOTE(review): presumably the date range to pad each series to --
            confirm against ``fill_dates``.
    Returns:
        df: pd.DataFrame
            Columns "geo_id", "val", "se", "sample_size", "timestamp"; one
            run of rows per state, each keeping its own 0-based index.
    """
    columns = ["geo_id", "val", "se", "sample_size", "timestamp"]
    frames = []
    for state in state_groups.groups:
        # Non-inplace chain: get_group may hand back a slice of the parent
        # frame, so avoid mutating it in place.
        state_group = (state_groups.get_group(state)
                       .drop(columns=["state_id"])
                       .set_index("timestamp"))
        state_group = fill_dates(state_group, first_date, last_date)
        tests = state_group["totalTest"].values

        if device:
            devices = state_group["numUniqueDevices"].values
            if smooth:
                # smoothed tests per device
                stat, se, sample_size = smoothed_tests_per_device(
                    devices=devices, tests=tests,
                    min_obs=MIN_OBS, pool_days=POOL_DAYS)
            else:
                # raw tests per device
                stat, se, sample_size = raw_tests_per_device(
                    devices=devices, tests=tests,
                    min_obs=MIN_OBS)
        else:
            positives = state_group["positiveTest"].values
            if smooth:
                # smoothed pct positive
                stat, se, sample_size = smoothed_positive_prop(
                    tests=tests, positives=positives,
                    min_obs=MIN_OBS, pool_days=POOL_DAYS)
            else:
                # raw pct positive
                stat, se, sample_size = raw_positive_prop(
                    tests=tests, positives=positives,
                    min_obs=MIN_OBS)
            # Proportion -> percentage.
            stat = stat * 100

        # NOTE(review): se is scaled by 100 for every signal, including
        # tests-per-device where stat is left unscaled -- confirm intended.
        se = se * 100
        frames.append(pd.DataFrame({"geo_id": state,
                                    "timestamp": state_group.index,
                                    "val": stat,
                                    "se": se,
                                    "sample_size": sample_size}))

    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat instead of DataFrame.append per state (append is quadratic
    # and no longer exists in pandas >= 2.0); reindex to the documented
    # column order.
    return pd.concat(frames)[columns]
46 | 69 |
|
47 | | -def generate_sensor_for_other_geores(state_groups, data, res_key, smooth, first_date, last_date): |
| 70 | +def generate_sensor_for_other_geores(state_groups, data, res_key, smooth, |
| 71 | + device, first_date, last_date): |
48 | 72 | """ |
49 | 73 | fit over counties/HRRs/MSAs |
50 | 74 | Args: |
51 | 75 | data: pd.DataFrame |
52 | 76 | res_key: "fips", "cbsa_id" or "hrrnum" |
53 | 77 | smooth: bool |
| 78 | + Consider raw or smooth |
| 79 | + device: bool |
| 80 | + Consider test_per_device or pct_positive |
54 | 81 | Returns: |
55 | 82 | df: pd.DataFrame |
56 | 83 | """ |
| 84 | + has_parent = True |
57 | 85 | res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"]) |
58 | 86 | res_groups = data.groupby(res_key) |
59 | 87 | loc_list = list(res_groups.groups.keys()) |
60 | 88 | for loc in loc_list: |
61 | 89 | res_group = res_groups.get_group(loc) |
62 | 90 | parent_state = res_group['state_id'].values[0] |
63 | | - parent_group = state_groups.get_group(parent_state) |
64 | | - res_group = res_group.merge(parent_group, how="left", |
65 | | - on="timestamp", suffixes=('', '_parent')) |
66 | | - res_group = res_group.drop(columns=[res_key, "state_id", "state_id" + '_parent']) |
| 91 | + try: |
| 92 | + parent_group = state_groups.get_group(parent_state) |
| 93 | + res_group = res_group.merge(parent_group, how="left", |
| 94 | + on="timestamp", suffixes=('', '_parent')) |
| 95 | + res_group = res_group.drop(columns=[res_key, "state_id", "state_id" + '_parent']) |
| 96 | + except: |
| 97 | + has_parent = False |
| 98 | + res_group = res_group.drop(columns=[res_key, "state_id"]) |
67 | 99 | res_group.set_index("timestamp", inplace=True) |
68 | 100 | res_group = fill_dates(res_group, first_date, last_date) |
69 | 101 |
|
70 | 102 | if smooth: |
71 | | - stat, se, sample_size = smoothed_positive_prop( |
72 | | - tests=res_group['totalTest'].values, |
73 | | - positives=res_group['positiveTest'].values, |
74 | | - min_obs=MIN_OBS, pool_days=POOL_DAYS, |
75 | | - parent_tests=res_group["totalTest_parent"].values, |
76 | | - parent_positives=res_group['positiveTest_parent'].values) |
| 103 | + if has_parent: |
| 104 | + if device: |
| 105 | + stat, se, sample_size = smoothed_tests_per_device( |
| 106 | + devices=res_group["numUniqueDevices"].values, |
| 107 | + tests=res_group['totalTest'].values, |
| 108 | + min_obs=MIN_OBS, pool_days=POOL_DAYS, |
| 109 | + parent_devices=res_group["numUniqueDevices_parent"].values, |
| 110 | + parent_tests=res_group["totalTest_parent"].values) |
| 111 | + else: |
| 112 | + stat, se, sample_size = smoothed_positive_prop( |
| 113 | + tests=res_group['totalTest'].values, |
| 114 | + positives=res_group['positiveTest'].values, |
| 115 | + min_obs=MIN_OBS, pool_days=POOL_DAYS, |
| 116 | + parent_tests=res_group["totalTest_parent"].values, |
| 117 | + parent_positives=res_group['positiveTest_parent'].values) |
| 118 | + stat = stat * 100 |
| 119 | + else: |
| 120 | + if device: |
| 121 | + stat, se, sample_size = smoothed_tests_per_device( |
| 122 | + devices=res_group["numUniqueDevices"].values, |
| 123 | + tests=res_group['totalTest'].values, |
| 124 | + min_obs=MIN_OBS, pool_days=POOL_DAYS) |
| 125 | + else: |
| 126 | + stat, se, sample_size = smoothed_positive_prop( |
| 127 | + tests=res_group['totalTest'].values, |
| 128 | + positives=res_group['positiveTest'].values, |
| 129 | + min_obs=MIN_OBS, pool_days=POOL_DAYS) |
| 130 | + stat = stat * 100 |
77 | 131 | else: |
78 | | - stat, se, sample_size = raw_positive_prop( |
79 | | - tests=res_group['totalTest'].values, |
80 | | - positives=res_group['positiveTest'].values, |
81 | | - min_obs=MIN_OBS) |
82 | | - stat = stat * 100 |
83 | | - se = se * 100 |
| 132 | + if device: |
| 133 | + stat, se, sample_size = raw_tests_per_device( |
| 134 | + devices=res_group["numUniqueDevices"].values, |
| 135 | + tests=res_group['totalTest'].values, |
| 136 | + min_obs=MIN_OBS) |
| 137 | + else: |
| 138 | + stat, se, sample_size = raw_positive_prop( |
| 139 | + tests=res_group['totalTest'].values, |
| 140 | + positives=res_group['positiveTest'].values, |
| 141 | + min_obs=MIN_OBS) |
| 142 | + stat = stat * 100 |
84 | 143 |
|
| 144 | + se = se * 100 |
85 | 145 | res_df = res_df.append(pd.DataFrame({"geo_id": loc, |
86 | 146 | "timestamp": res_group.index, |
87 | 147 | "val": stat, |
|
0 commit comments