 
 CSV_DTYPES = {
     "geo_id": str, "val": float, "se": float, "sample_size": float,
-    "missing_val": int, "missing_se":int, "missing_sample_size": int
+    "missing_val": int, "missing_se": int, "missing_sample_size": int
 }
 
 CSVS_BEFORE = {
-    # Common
+    # All rows unchanged
     "csv0": pd.DataFrame({
         "geo_id": ["1", "2", "3"],
         "val": [1.000000001, 2.00000002, 3.00000003],
         "se": [0.1, 0.2, 0.3],
         "sample_size": [10.0, 20.0, 30.0],
         "missing_val": [Nans.NOT_MISSING] * 3,
         "missing_se": [Nans.NOT_MISSING] * 3,
         "missing_sample_size": [Nans.NOT_MISSING] * 3,
     }),
 
+    # One row deleted and one row added
     "csv1": pd.DataFrame({
         "geo_id": ["1", "2", "3"],
         "val": [1.0, 2.0, 3.0],
         "se": [0.1, 0.2, 0.3],
         "sample_size": [10.0, 20.0, 30.0],
         "missing_val": [Nans.NOT_MISSING] * 3,
         "missing_se": [Nans.NOT_MISSING] * 3,
         "missing_sample_size": [Nans.NOT_MISSING] * 3,
     }),
 
-    # Deleted
+    # File deleted
     "csv2": pd.DataFrame({
         "geo_id": ["1"],
         "val": [1.0],
         "se": [0.1],
         "sample_size": [10.0],
         "missing_val": [Nans.NOT_MISSING],
         "missing_se": [Nans.NOT_MISSING],
         "missing_sample_size": [Nans.NOT_MISSING],
     }),
 
-    # Common, but updated with missing columns
+    # All rows common, but missing columns added
     "csv4": pd.DataFrame({
         "geo_id": ["1"],
         "val": [1.0],
         "se": [0.1],
         "sample_size": [10.0]
     }),
 
-    # Common, but missing columns removed
+    # All rows common, but missing columns removed
     "csv5": pd.DataFrame({
         "geo_id": ["1"],
         "val": [1.0],
         "se": [0.1],
         "sample_size": [10.0],
         "missing_val": [Nans.NOT_MISSING],
         "missing_se": [Nans.NOT_MISSING],
         "missing_sample_size": [Nans.NOT_MISSING],
     }),
+
+    # All rows common, but no missing columns
+    "csv6": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted and row added, but no missing columns (will not be uploaded)
+    "csv7": pd.DataFrame({
+        "geo_id": ["1", "2"],
+        "val": [1.0, 2.0],
+        "se": [0.1, 0.2],
+        "sample_size": [10.0, 20.0]
+    }),
+
+    # Row deleted and row added, but no missing columns
+    "csv8": pd.DataFrame({
+        "geo_id": ["1", "2"],
+        "val": [1.0, 2.0],
+        "se": [0.1, 0.2],
+        "sample_size": [10.0, 20.0]
+    }),
 }
 
 CSVS_AFTER = {
-    # Common
+    # All rows unchanged
     "csv0": pd.DataFrame({
         "geo_id": ["1", "2", "3"],
         "val": [1.0, 2.0, 3.0],
         "se": [0.10000001, 0.20000002, 0.30000003],
         "sample_size": [10.0, 20.0, 30.0],
         "missing_val": [Nans.NOT_MISSING] * 3,
         "missing_se": [Nans.NOT_MISSING] * 3,
         "missing_sample_size": [Nans.NOT_MISSING] * 3,
     }),
 
+    # One row deleted and one row added
     "csv1": pd.DataFrame({
         "geo_id": ["1", "2", "4"],
         "val": [1.0, 2.1, 4.0],
         "se": [0.1, 0.21, 0.4],
         "sample_size": [10.0, 21.0, 40.0],
         "missing_val": [Nans.NOT_MISSING] * 3,
         "missing_se": [Nans.NOT_MISSING] * 3,
         "missing_sample_size": [Nans.NOT_MISSING] * 3,
     }),
 
-    # Added
+    # File added
     "csv3": pd.DataFrame({
         "geo_id": ["2"],
         "val": [2.0000002],
         "se": [0.2],
         "sample_size": [20.0],
         "missing_val": [Nans.NOT_MISSING],
         "missing_se": [Nans.NOT_MISSING],
         "missing_sample_size": [Nans.NOT_MISSING],
     }),
 
-    # Common, but updated with missing columns
+    # All rows common, but missing columns added
     "csv4": pd.DataFrame({
         "geo_id": ["1"],
         "val": [1.0],
         "se": [0.1],
         "sample_size": [10.0],
         "missing_val": [Nans.NOT_MISSING],
         "missing_se": [Nans.NOT_MISSING],
         "missing_sample_size": [Nans.NOT_MISSING],
     }),
 
-    # Common, but missing columns removed
+    # All rows common, but missing columns removed
     "csv5": pd.DataFrame({
         "geo_id": ["1"],
         "val": [1.0],
         "se": [0.1],
         "sample_size": [10.0]
     }),
+
+    # All rows common, but no missing columns
+    "csv6": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted and row added, but no missing columns (will not be uploaded)
+    "csv7": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted and row added, but no missing columns
+    "csv8": pd.DataFrame({
+        "geo_id": ["1", "3"],
+        "val": [1.0, 3.0],
+        "se": [0.1, 0.3],
+        "sample_size": [10.0, 30.0]
+    }),
 }
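
For orientation, the row-level semantics these fixtures exercise can be sketched in a few lines of pandas. This is a hypothetical illustration (the helper name diff_frames is invented), not the delphi_utils implementation: it assumes both frames share one schema keyed on geo_id, and it compares values exactly, ignoring the float tolerance that lets csv0's 1.000000001 match 1.0.

    import numpy as np
    import pandas as pd

    def diff_frames(before: pd.DataFrame, after: pd.DataFrame, key: str = "geo_id") -> pd.DataFrame:
        """Row-level diff: deleted keys keep only NaN values; changed and added keys carry their new values."""
        before_i, after_i = before.set_index(key), after.set_index(key)

        # Keys that vanished: keep the key, blank out every value column.
        deleted = pd.DataFrame(np.nan, index=before_i.index.difference(after_i.index),
                               columns=after_i.columns)

        # Keys present in both frames where any column changed (exact comparison).
        common = after_i.index.intersection(before_i.index)
        changed = common[(after_i.loc[common] != before_i.loc[common]).any(axis=1).to_numpy()]

        # Keys that are new in `after`.
        added = after_i.index.difference(before_i.index)

        return pd.concat([deleted, after_i.loc[changed.union(added)]]).reset_index()

Applied to CSVS_BEFORE["csv1"] and CSVS_AFTER["csv1"], this yields rows for geo_id 3 (deleted, all values NaN), 2 (changed) and 4 (added), which is exactly the shape of the csv1_diff frames asserted in the tests below.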
 
 class TestArchiveDiffer:
@@ -175,17 +225,22 @@ def test_diff_and_filter_exports(self, tmp_path):
         # Check return values
         assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
         assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv"]}
+            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv", "csv8.csv"]}
         assert set(new_files) == {join(export_dir, "csv3.csv")}
         assert common_diffs[join(export_dir, "csv0.csv")] is None
         assert common_diffs[join(export_dir, "csv1.csv")] == join(
             export_dir, "csv1.csv.diff")
 
         # Check filesystem for actual files
         assert set(listdir(export_dir)) == {
-            "csv0.csv", "csv1.csv", "csv1.csv.diff",
-            "csv3.csv", "csv4.csv", "csv4.csv.diff",
-            "csv5.csv", "csv5.csv.diff"
+            "csv0.csv",
+            "csv1.csv", "csv1.csv.diff",
+            "csv3.csv",
+            "csv4.csv", "csv4.csv.diff",
+            "csv5.csv", "csv5.csv.diff",
+            "csv6.csv",
+            "csv7.csv", "csv7.csv.diff",
+            "csv8.csv", "csv8.csv.diff"
         }
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
@@ -204,7 +259,7 @@ def test_diff_and_filter_exports(self, tmp_path):
         arch_diff.filter_exports(common_diffs)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
             csv1_diff)
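
One plausible reading of filter_exports, consistent with the assertions in this hunk: common_diffs maps each common export path to its diff path, or None when nothing changed, and filtering replaces full exports with their diffs. The body below is a sketch under that assumption, not necessarily the delphi_utils source.

    from os import remove, replace

    def filter_exports(common_diffs):
        # Drop the full export of every common file; where a .diff exists,
        # move it into the export's place, so the export directory is left
        # holding only incremental changes.
        for export_path, diff_path in common_diffs.items():
            remove(export_path)
            if diff_path is not None:
                replace(diff_path, export_path)

Under this sketch csv0.csv and csv6.csv (no diff) simply disappear, while csv1.csv, csv4.csv, csv5.csv, csv7.csv and csv8.csv end up holding only their diffed rows, matching the directory listing asserted above.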
@@ -325,13 +380,11 @@ def test_run(self, tmp_path, s3_client):
 
         # Check that the buckets now contain the exported files.
         for csv_name, df in CSVS_AFTER.items():
-            body = s3_client.get_object(
-                Bucket=self.bucket_name,
-                Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
+            body = s3_client.get_object(Bucket=self.bucket_name, Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
             assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
         csv1_diff = pd.DataFrame({
             "geo_id": ["3", "2", "4"],
             "val": [np.nan, 2.1, 4.0],
@@ -539,12 +592,11 @@ def test_run(self, tmp_path):
         arch_diff.get_branch(branch_name).checkout()
         for csv_name, df in CSVS_AFTER.items():
             assert_frame_equal(
-                pd.read_csv(
-                    join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), df)
+                pd.read_csv(join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), df)
         original_branch.checkout()
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
         csv1_diff = pd.DataFrame({
             "geo_id": ["3", "2", "4"],
             "val": [np.nan, 2.1, 4.0],