From 3ca35f102243bda3d7e0bfa881f5147eef4b536a Mon Sep 17 00:00:00 2001 From: Maximilian Simmoteit Date: Thu, 21 Mar 2024 10:14:33 +0100 Subject: [PATCH] Implement reading qvd files from Python IO --- qvd/qvd_reader.py | 30 ++++++++++++++++----- qvd/test_qvd_reader.py | 8 ++++++ src/lib.rs | 61 +++++++++++++++++++++++++++++++++++------- 3 files changed, 82 insertions(+), 17 deletions(-) diff --git a/qvd/qvd_reader.py b/qvd/qvd_reader.py index e0bb775..0ab5a2f 100644 --- a/qvd/qvd_reader.py +++ b/qvd/qvd_reader.py @@ -1,13 +1,29 @@ -from .qvd import read_qvd +from .qvd import read_qvd, read_qvd_from_buffer import pandas as pd +import io -def read(file_name): - data = read_qvd(file_name) - df = pd.DataFrame.from_dict(data) +def read(file): + data_dict = read_to_dict(file) + df = pd.DataFrame.from_dict(data_dict) return df -def read_to_dict(file_name): - data = read_qvd(file_name) - return data +def read_to_dict(file): + if (isinstance(file, io.TextIOBase) + or isinstance(file, io.BufferedIOBase) + or isinstance(file, io.RawIOBase) + or isinstance(file, io.IOBase)): + try: + unpacked_data = file.read() + except UnicodeDecodeError as e: + raise Exception("Supply a raw file access. 
Use mode \"rb\" instead of mode \"r\"") + elif isinstance(file, bytes): + unpacked_data = file + elif isinstance(file, str): + return read_qvd(file) + else: + raise Exception("Please supply a raw string or a file") + result_data = read_qvd_from_buffer(unpacked_data) + return result_data + diff --git a/qvd/test_qvd_reader.py b/qvd/test_qvd_reader.py index 2449191..b0e2e23 100644 --- a/qvd/test_qvd_reader.py +++ b/qvd/test_qvd_reader.py @@ -14,3 +14,11 @@ def test_read_size(self): qvd = qvd_reader.read(f'{os.path.dirname(__file__)}/test_files/AAPL.qvd') csv = pd.read_csv(f'{os.path.dirname(__file__)}/test_files/AAPL.csv', float_precision='round_trip') assert np.array_equal(np.sort(qvd.columns, axis=0), np.sort(csv.columns, axis=0)) + + def test_qvd_from_in_memory(self): + with open(f'{os.path.dirname(__file__)}/test_files/AAPL.qvd', 'rb') as fin: + qvd = qvd_reader.read(fin) + csv = pd.read_csv(f'{os.path.dirname(__file__)}/test_files/AAPL.csv', float_precision='round_trip') + assert qvd.shape == csv.shape + assert np.array_equal(np.sort(qvd.columns, axis=0), np.sort(csv.columns, axis=0)) + diff --git a/src/lib.rs b/src/lib.rs index 76ec001..21c34f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ use pyo3::{prelude::*, types::PyDict}; use quick_xml::de::from_str; use qvd_structure::{QvdFieldHeader, QvdTableHeader}; use std::io::SeekFrom; -use std::io::{self, Read}; +use std::io::{self, Read, Cursor}; use std::path::Path; use std::str; use std::{collections::HashMap, fs::File}; @@ -14,6 +14,7 @@ pub mod qvd_structure; #[pymodule] fn qvd(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(read_qvd, m)?)?; + m.add_function(wrap_pyfunction!(read_qvd_from_buffer, m)?)?; Ok(()) } @@ -49,6 +50,36 @@ fn read_qvd(py: Python, file_name: String) -> PyResult<Py<PyDict>> { Ok(dict.into()) } +#[pyfunction] +fn read_qvd_from_buffer(py: Python, input_buffer: Vec<u8>) -> PyResult<Py<PyDict>> { + let xml: String = get_xml_data_from_raw_data(&input_buffer).expect("Error reading 
qvd data"); + let dict = PyDict::new(py); + let binary_section_offset = xml.as_bytes().len(); + + let qvd_structure: QvdTableHeader = from_str(&xml).unwrap(); + let mut symbol_map: HashMap<String, Vec<Option<String>>> = HashMap::new(); + + // Seek to the end of the XML section + let buf = &input_buffer[binary_section_offset..]; + let rows_start = qvd_structure.offset; + let rows_end = buf.len(); + let rows_section = &buf[rows_start..rows_end]; + let record_byte_size = qvd_structure.record_byte_size; + + for field in qvd_structure.fields.headers { + symbol_map.insert( + field.field_name.clone(), + get_symbols_as_strings(&buf, &field), + ); + let symbol_indexes = get_row_indexes(&rows_section, &field, record_byte_size); + let column_values = + match_symbols_with_indexes(&symbol_map[&field.field_name], &symbol_indexes); + dict.set_item(field.field_name, column_values).unwrap(); + } + Ok(dict.into()) +} + + fn read_qvd_to_buf(mut f: File, binary_section_offset: usize) -> Vec<u8> { f.seek(SeekFrom::Start(binary_section_offset as u64)) .unwrap(); @@ -177,23 +208,33 @@ fn bitslice_to_vec(bitslice: &BitSlice) -> Vec { v } +fn extract_xml_data(reader: &mut dyn io::BufRead) -> Result<String, io::Error> { + let mut buffer = Vec::new(); + // There is a line break, carriage return and a null terminator between the XML and data + // Find the null terminator + reader + .read_until(0, &mut buffer) + .expect("Failed to find null terminator in QVD"); + let xml_string = + str::from_utf8(&buffer[..]).expect("xml section contains invalid UTF-8 chars"); + Ok(xml_string.to_owned()) +} + fn get_xml_data(file_name: &str) -> Result<String, io::Error> { match read_file(file_name) { Ok(mut reader) => { - let mut buffer = Vec::new(); - // There is a line break, carriage return and a null terminator between the XMl and data - // Find the null terminator - reader - .read_until(0, &mut buffer) - .expect("Failed to read file"); - let xml_string = - str::from_utf8(&buffer[..]).expect("xml section contains invalid UTF-8 chars"); - Ok(xml_string.to_owned()) + 
extract_xml_data(&mut reader) } Err(e) => Err(e), } } +fn get_xml_data_from_raw_data(raw_data: &Vec<u8>) -> Result<String, io::Error> { + let mut cursor = Cursor::new(raw_data); + extract_xml_data(&mut cursor) +} + + fn read_file<P>(filename: P) -> io::Result<io::BufReader<File>> where P: AsRef<Path>,