@@ -34,7 +34,10 @@ use itertools::Itertools;
34
34
use parquet:: {
35
35
arrow:: ArrowWriter ,
36
36
basic:: Encoding ,
37
- file:: { FOOTER_SIZE , properties:: WriterProperties } ,
37
+ file:: {
38
+ FOOTER_SIZE , properties:: WriterProperties , reader:: FileReader ,
39
+ serialized_reader:: SerializedFileReader ,
40
+ } ,
38
41
format:: SortingColumn ,
39
42
schema:: types:: ColumnPath ,
40
43
} ;
@@ -409,7 +412,7 @@ impl Stream {
409
412
. map ( |file| file. path ( ) )
410
413
. filter ( |file| {
411
414
file. extension ( ) . is_some_and ( |ext| ext. eq ( "parquet" ) )
412
- && std :: fs :: metadata ( file) . is_ok_and ( |meta| meta . len ( ) > FOOTER_SIZE as u64 )
415
+ && Self :: is_valid_parquet_file ( file, & self . stream_name )
413
416
} )
414
417
. collect ( )
415
418
}
@@ -649,7 +652,7 @@ impl Stream {
649
652
continue ;
650
653
}
651
654
652
- if let Err ( e) = self . finalize_parquet_file ( & part_path, & parquet_path) {
655
+ if let Err ( e) = std :: fs :: rename ( & part_path, & parquet_path) {
653
656
error ! ( "Couldn't rename part file: {part_path:?} -> {parquet_path:?}, error = {e}" ) ;
654
657
} else {
655
658
self . cleanup_arrow_files_and_dir ( & arrow_files) ;
@@ -682,12 +685,10 @@ impl Stream {
682
685
}
683
686
writer. close ( ) ?;
684
687
685
- if part_file. metadata ( ) . expect ( "File was just created" ) . len ( )
686
- < parquet:: file:: FOOTER_SIZE as u64
687
- {
688
+ if !Self :: is_valid_parquet_file ( part_path, & self . stream_name ) {
688
689
error ! (
689
- "Invalid parquet file {part_path:?} detected for stream {}, removing it" ,
690
- & self . stream_name
690
+ "Invalid parquet file {part_path:?} detected for stream {stream_name }, removing it" ,
691
+ stream_name = & self . stream_name
691
692
) ;
692
693
remove_file ( part_path) . expect ( "File should be removable if it is invalid" ) ;
693
694
return Ok ( false ) ;
@@ -696,8 +697,47 @@ impl Stream {
696
697
Ok ( true )
697
698
}
698
699
699
- fn finalize_parquet_file ( & self , part_path : & Path , parquet_path : & Path ) -> std:: io:: Result < ( ) > {
700
- std:: fs:: rename ( part_path, parquet_path)
700
+ /// function to validate parquet files
701
+ fn is_valid_parquet_file ( path : & Path , stream_name : & str ) -> bool {
702
+ // First check file size as a quick validation
703
+ match path. metadata ( ) {
704
+ Ok ( meta) if meta. len ( ) < FOOTER_SIZE as u64 => {
705
+ error ! (
706
+ "Invalid parquet file {path:?} detected for stream {stream_name}, size: {} bytes" ,
707
+ meta. len( )
708
+ ) ;
709
+ return false ;
710
+ }
711
+ Err ( e) => {
712
+ error ! (
713
+ "Cannot read metadata for parquet file {path:?} for stream {stream_name}: {e}"
714
+ ) ;
715
+ return false ;
716
+ }
717
+ _ => { } // File size is adequate, continue validation
718
+ }
719
+
720
+ // Try to open and read the parquet file metadata to verify it's valid
721
+ match std:: fs:: File :: open ( path) {
722
+ Ok ( file) => match SerializedFileReader :: new ( file) {
723
+ Ok ( reader) => {
724
+ if reader. metadata ( ) . file_metadata ( ) . num_rows ( ) == 0 {
725
+ error ! ( "Invalid parquet file {path:?} for stream {stream_name}" ) ;
726
+ false
727
+ } else {
728
+ true
729
+ }
730
+ }
731
+ Err ( e) => {
732
+ error ! ( "Failed to read parquet file {path:?} for stream {stream_name}: {e}" ) ;
733
+ false
734
+ }
735
+ } ,
736
+ Err ( e) => {
737
+ error ! ( "Failed to open parquet file {path:?} for stream {stream_name}: {e}" ) ;
738
+ false
739
+ }
740
+ }
701
741
}
702
742
703
743
fn cleanup_arrow_files_and_dir ( & self , arrow_files : & [ PathBuf ] ) {
@@ -951,7 +991,10 @@ impl Stream {
951
991
shutdown_signal : bool ,
952
992
) -> Result < ( ) , StagingError > {
953
993
let start_flush = Instant :: now ( ) ;
954
- self . flush ( shutdown_signal) ;
994
+ // Force flush for init or shutdown signals to convert all .part files to .arrows
995
+ // For regular cycles, use false to only flush non-current writers
996
+ let forced = init_signal || shutdown_signal;
997
+ self . flush ( forced) ;
955
998
trace ! (
956
999
"Flushing stream ({}) took: {}s" ,
957
1000
self . stream_name,
0 commit comments