4040from git import Repo
4141from git .refs .head import Head
4242import pandas as pd
43+ import numpy as np
4344
4445from .utils import read_params
4546from .logger import get_structured_logger
47+ from .nancodes import Nans
4648
4749Files = List [str ]
4850FileDiffMap = Dict [str , Optional [str ]]
@@ -73,8 +75,10 @@ def diff_export_csv(
7375 changed_df is the pd.DataFrame of common rows from after_csv with changed values.
7476 added_df is the pd.DataFrame of added rows from after_csv.
7577 """
76- export_csv_dtypes = {"geo_id" : str , "val" : float ,
77- "se" : float , "sample_size" : float }
78+ export_csv_dtypes = {
79+ "geo_id" : str , "val" : float , "se" : float , "sample_size" : float ,
80+ "missing_val" : int , "missing_se" : int , "missing_sample_size" : int
81+ }
7882
7983 before_df = pd .read_csv (before_csv , dtype = export_csv_dtypes )
8084 before_df .set_index ("geo_id" , inplace = True )
@@ -89,12 +93,22 @@ def diff_export_csv(
8993 before_df_cmn = before_df .reindex (common_idx )
9094 after_df_cmn = after_df .reindex (common_idx )
9195
92- # Exact comparisons, treating NA == NA as True
93- same_mask = before_df_cmn == after_df_cmn
94- same_mask |= pd .isna (before_df_cmn ) & pd .isna (after_df_cmn )
96+ # If CSVs have different columns (no missingness), mark all values as new
97+ if ("missing_val" in before_df_cmn .columns ) ^ ("missing_val" in after_df_cmn .columns ):
98+ same_mask = after_df_cmn .copy ()
99+ same_mask .loc [:] = False
100+ else :
101+ # Exact comparisons, treating NA == NA as True
102+ same_mask = before_df_cmn == after_df_cmn
103+ same_mask |= pd .isna (before_df_cmn ) & pd .isna (after_df_cmn )
104+
105+ # Code deleted entries as nans with the deleted missing code
106+ deleted_df = before_df .loc [deleted_idx , :].copy ()
107+ deleted_df [["val" , "se" , "sample_size" ]] = np .nan
108+ deleted_df [["missing_val" , "missing_se" , "missing_sample_size" ]] = Nans .DELETED
95109
96110 return (
97- before_df . loc [ deleted_idx , :] ,
111+ deleted_df ,
98112 after_df_cmn .loc [~ (same_mask .all (axis = 1 )), :],
99113 after_df .loc [added_idx , :])
100114
@@ -227,11 +241,11 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]:
227241
228242 deleted_df , changed_df , added_df = diff_export_csv (
229243 before_file , after_file )
230- new_issues_df = pd .concat ([changed_df , added_df ], axis = 0 )
244+ new_issues_df = pd .concat ([deleted_df , changed_df , added_df ], axis = 0 )
231245
232246 if len (deleted_df ) > 0 :
233247 print (
234- f"Warning, diff has deleted indices in { after_file } that will be ignored " )
248+ f"Diff has deleted indices in { after_file } that have been coded as nans. " )
235249
236250 # Write the diffs to diff_file, if applicable
237251 if len (new_issues_df ) > 0 :
@@ -414,6 +428,9 @@ def archive_exports(self, # pylint: disable=arguments-differ
414428 archive_success .append (exported_file )
415429 except FileNotFoundError :
416430 archive_fail .append (exported_file )
431+ except shutil .SameFileError :
432+ # no need to copy if the cached file is the same
433+ archive_success .append (exported_file )
417434
418435 self ._exports_archived = True
419436
0 commit comments