diff --git a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt index b6f706ed..43f2eb28 100644 --- a/.cspell/custom-dictionary.txt +++ b/.cspell/custom-dictionary.txt @@ -43,6 +43,8 @@ cdeform cdeformfield cdisp centroidnn +cfel +CFEL chessy clim cmap @@ -64,6 +66,7 @@ cryo cstart cstep csvfile +cumsum custom-dictionary cval cvdist @@ -176,6 +179,7 @@ joblib jpars jupyterlab kernelspec +kmic kmodem KTOF kwds @@ -208,6 +212,8 @@ mdist meshgrid microbunch microbunches +millis +millisec mirrorutil mnpos modindex diff --git a/.sed-dev/bin/Activate.ps1 b/.sed-dev/bin/Activate.ps1 new file mode 100644 index 00000000..b49d77ba --- /dev/null +++ b/.sed-dev/bin/Activate.ps1 @@ -0,0 +1,247 @@ +<# +.Synopsis +Activate a Python virtual environment for the current PowerShell session. + +.Description +Pushes the python executable for a virtual environment to the front of the +$Env:PATH environment variable and sets the prompt to signify that you are +in a Python virtual environment. Makes use of the command line switches as +well as the `pyvenv.cfg` file values present in the virtual environment. + +.Parameter VenvDir +Path to the directory that contains the virtual environment to activate. The +default value for this is the parent of the directory that the Activate.ps1 +script is located within. + +.Parameter Prompt +The prompt prefix to display when this virtual environment is activated. By +default, this prompt is the name of the virtual environment folder (VenvDir) +surrounded by parentheses and followed by a single space (ie. '(.venv) '). + +.Example +Activate.ps1 +Activates the Python virtual environment that contains the Activate.ps1 script. + +.Example +Activate.ps1 -Verbose +Activates the Python virtual environment that contains the Activate.ps1 script, +and shows extra information about the activation as it executes. + +.Example +Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv +Activates the Python virtual environment located in the specified location. + +.Example +Activate.ps1 -Prompt "MyPython" +Activates the Python virtual environment that contains the Activate.ps1 script, +and prefixes the current prompt with the specified string (surrounded in +parentheses) while the virtual environment is active. + +.Notes +On Windows, it may be required to enable this Activate.ps1 script by setting the +execution policy for the user. You can do this by issuing the following PowerShell +command: + +PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser + +For more information on Execution Policies: +https://go.microsoft.com/fwlink/?LinkID=135170 + +#> +Param( + [Parameter(Mandatory = $false)] + [String] + $VenvDir, + [Parameter(Mandatory = $false)] + [String] + $Prompt +) + +<# Function declarations --------------------------------------------------- #> + +<# +.Synopsis +Remove all shell session elements added by the Activate script, including the +addition of the virtual environment's Python executable from the beginning of +the PATH variable. + +.Parameter NonDestructive +If present, do not remove this function from the global namespace for the +session. 
+ +#> +function global:deactivate ([switch]$NonDestructive) { + # Revert to original values + + # The prior prompt: + if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) { + Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt + Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT + } + + # The prior PYTHONHOME: + if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) { + Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME + Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME + } + + # The prior PATH: + if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) { + Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH + Remove-Item -Path Env:_OLD_VIRTUAL_PATH + } + + # Just remove the VIRTUAL_ENV altogether: + if (Test-Path -Path Env:VIRTUAL_ENV) { + Remove-Item -Path env:VIRTUAL_ENV + } + + # Just remove VIRTUAL_ENV_PROMPT altogether. + if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) { + Remove-Item -Path env:VIRTUAL_ENV_PROMPT + } + + # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether: + if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) { + Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force + } + + # Leave deactivate function in the global namespace if requested: + if (-not $NonDestructive) { + Remove-Item -Path function:deactivate + } +} + +<# +.Description +Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the +given folder, and returns them in a map. + +For each line in the pyvenv.cfg file, if that line can be parsed into exactly +two strings separated by `=` (with any amount of whitespace surrounding the =) +then it is considered a `key = value` line. The left hand string is the key, +the right hand is the value. + +If the value starts with a `'` or a `"` then the first and last character is +stripped from the value before being captured. + +.Parameter ConfigDir +Path to the directory that contains the `pyvenv.cfg` file. +#> +function Get-PyVenvConfig( + [String] + $ConfigDir +) { + Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg" + + # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue). + $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue + + # An empty map will be returned if no config file is found. + $pyvenvConfig = @{ } + + if ($pyvenvConfigPath) { + + Write-Verbose "File exists, parse `key = value` lines" + $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath + + $pyvenvConfigContent | ForEach-Object { + $keyval = $PSItem -split "\s*=\s*", 2 + if ($keyval[0] -and $keyval[1]) { + $val = $keyval[1] + + # Remove extraneous quotations around a string value. 
+ if ("'""".Contains($val.Substring(0, 1))) { + $val = $val.Substring(1, $val.Length - 2) + } + + $pyvenvConfig[$keyval[0]] = $val + Write-Verbose "Adding Key: '$($keyval[0])'='$val'" + } + } + } + return $pyvenvConfig +} + + +<# Begin Activate script --------------------------------------------------- #> + +# Determine the containing directory of this script +$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition +$VenvExecDir = Get-Item -Path $VenvExecPath + +Write-Verbose "Activation script is located in path: '$VenvExecPath'" +Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)" +Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)" + +# Set values required in priority: CmdLine, ConfigFile, Default +# First, get the location of the virtual environment, it might not be +# VenvExecDir if specified on the command line. +if ($VenvDir) { + Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values" +} +else { + Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir." + $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/") + Write-Verbose "VenvDir=$VenvDir" +} + +# Next, read the `pyvenv.cfg` file to determine any required value such +# as `prompt`. +$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir + +# Next, set the prompt from the command line, or the config file, or +# just use the name of the virtual environment folder. +if ($Prompt) { + Write-Verbose "Prompt specified as argument, using '$Prompt'" +} +else { + Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value" + if ($pyvenvCfg -and $pyvenvCfg['prompt']) { + Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'" + $Prompt = $pyvenvCfg['prompt']; + } + else { + Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)" + Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'" + $Prompt = Split-Path -Path $venvDir -Leaf + } +} + +Write-Verbose "Prompt = '$Prompt'" +Write-Verbose "VenvDir='$VenvDir'" + +# Deactivate any currently active virtual environment, but leave the +# deactivate function in place. +deactivate -nondestructive + +# Now set the environment variable VIRTUAL_ENV, used by many tools to determine +# that there is an activated venv. 
+$env:VIRTUAL_ENV = $VenvDir + +if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) { + + Write-Verbose "Setting prompt to '$Prompt'" + + # Set the prompt to include the env name + # Make sure _OLD_VIRTUAL_PROMPT is global + function global:_OLD_VIRTUAL_PROMPT { "" } + Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT + New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt + + function global:prompt { + Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) " + _OLD_VIRTUAL_PROMPT + } + $env:VIRTUAL_ENV_PROMPT = $Prompt +} + +# Clear PYTHONHOME +if (Test-Path -Path Env:PYTHONHOME) { + Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME + Remove-Item -Path Env:PYTHONHOME +} + +# Add the venv to the PATH +Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH +$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH" diff --git a/.sed-dev/bin/activate b/.sed-dev/bin/activate new file mode 100644 index 00000000..44ec4b76 --- /dev/null +++ b/.sed-dev/bin/activate @@ -0,0 +1,63 @@ +# This file must be used with "source bin/activate" *from bash* +# you cannot run it directly + +deactivate () { + # reset old environment variables + if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then + PATH="${_OLD_VIRTUAL_PATH:-}" + export PATH + unset _OLD_VIRTUAL_PATH + fi + if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then + PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}" + export PYTHONHOME + unset _OLD_VIRTUAL_PYTHONHOME + fi + + # Call hash to forget past commands. Without forgetting + # past commands the $PATH changes we made may not be respected + hash -r 2> /dev/null + + if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then + PS1="${_OLD_VIRTUAL_PS1:-}" + export PS1 + unset _OLD_VIRTUAL_PS1 + fi + + unset VIRTUAL_ENV + unset VIRTUAL_ENV_PROMPT + if [ ! "${1:-}" = "nondestructive" ] ; then + # Self destruct! + unset -f deactivate + fi +} + +# unset irrelevant variables +deactivate nondestructive + +VIRTUAL_ENV="/home/abdelhak/sed/.sed-dev" +export VIRTUAL_ENV + +_OLD_VIRTUAL_PATH="$PATH" +PATH="$VIRTUAL_ENV/bin:$PATH" +export PATH + +# unset PYTHONHOME if set +# this will fail if PYTHONHOME is set to the empty string (which is bad anyway) +# could use `if (set -u; : $PYTHONHOME) ;` in bash +if [ -n "${PYTHONHOME:-}" ] ; then + _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}" + unset PYTHONHOME +fi + +if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then + _OLD_VIRTUAL_PS1="${PS1:-}" + PS1="(.sed-dev) ${PS1:-}" + export PS1 + VIRTUAL_ENV_PROMPT="(.sed-dev) " + export VIRTUAL_ENV_PROMPT +fi + +# Call hash to forget past commands. Without forgetting +# past commands the $PATH changes we made may not be respected +hash -r 2> /dev/null diff --git a/.sed-dev/bin/activate.csh b/.sed-dev/bin/activate.csh new file mode 100644 index 00000000..4495a1f3 --- /dev/null +++ b/.sed-dev/bin/activate.csh @@ -0,0 +1,26 @@ +# This file must be used with "source bin/activate.csh" *from csh*. +# You cannot run it directly. +# Created by Davide Di Blasi . +# Ported to Python 3.3 venv by Andrew Svetlov + +alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate' + +# Unset irrelevant variables. 
+deactivate nondestructive + +setenv VIRTUAL_ENV "/home/abdelhak/sed/.sed-dev" + +set _OLD_VIRTUAL_PATH="$PATH" +setenv PATH "$VIRTUAL_ENV/bin:$PATH" + + +set _OLD_VIRTUAL_PROMPT="$prompt" + +if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then + set prompt = "(.sed-dev) $prompt" + setenv VIRTUAL_ENV_PROMPT "(.sed-dev) " +endif + +alias pydoc python -m pydoc + +rehash diff --git a/.sed-dev/bin/activate.fish b/.sed-dev/bin/activate.fish new file mode 100644 index 00000000..5f2d1693 --- /dev/null +++ b/.sed-dev/bin/activate.fish @@ -0,0 +1,69 @@ +# This file must be used with "source /bin/activate.fish" *from fish* +# (https://fishshell.com/); you cannot run it directly. + +function deactivate -d "Exit virtual environment and return to normal shell environment" + # reset old environment variables + if test -n "$_OLD_VIRTUAL_PATH" + set -gx PATH $_OLD_VIRTUAL_PATH + set -e _OLD_VIRTUAL_PATH + end + if test -n "$_OLD_VIRTUAL_PYTHONHOME" + set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME + set -e _OLD_VIRTUAL_PYTHONHOME + end + + if test -n "$_OLD_FISH_PROMPT_OVERRIDE" + set -e _OLD_FISH_PROMPT_OVERRIDE + # prevents error when using nested fish instances (Issue #93858) + if functions -q _old_fish_prompt + functions -e fish_prompt + functions -c _old_fish_prompt fish_prompt + functions -e _old_fish_prompt + end + end + + set -e VIRTUAL_ENV + set -e VIRTUAL_ENV_PROMPT + if test "$argv[1]" != "nondestructive" + # Self-destruct! + functions -e deactivate + end +end + +# Unset irrelevant variables. +deactivate nondestructive + +set -gx VIRTUAL_ENV "/home/abdelhak/sed/.sed-dev" + +set -gx _OLD_VIRTUAL_PATH $PATH +set -gx PATH "$VIRTUAL_ENV/bin" $PATH + +# Unset PYTHONHOME if set. +if set -q PYTHONHOME + set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME + set -e PYTHONHOME +end + +if test -z "$VIRTUAL_ENV_DISABLE_PROMPT" + # fish uses a function instead of an env var to generate the prompt. + + # Save the current fish_prompt function as the function _old_fish_prompt. + functions -c fish_prompt _old_fish_prompt + + # With the original prompt function renamed, we can override with our own. + function fish_prompt + # Save the return status of the last command. + set -l old_status $status + + # Output the venv prompt; color taken from the blue of the Python logo. + printf "%s%s%s" (set_color 4B8BBE) "(.sed-dev) " (set_color normal) + + # Restore the return status of the previous command. + echo "exit $old_status" | . + # Output the original/"old" prompt. 
+ _old_fish_prompt + end + + set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV" + set -gx VIRTUAL_ENV_PROMPT "(.sed-dev) " +end diff --git a/.sed-dev/bin/python b/.sed-dev/bin/python new file mode 120000 index 00000000..cccf4709 --- /dev/null +++ b/.sed-dev/bin/python @@ -0,0 +1 @@ +/software/mamba/2024.01/bin/python \ No newline at end of file diff --git a/.sed-dev/bin/python3 b/.sed-dev/bin/python3 new file mode 120000 index 00000000..d8654aa0 --- /dev/null +++ b/.sed-dev/bin/python3 @@ -0,0 +1 @@ +python \ No newline at end of file diff --git a/.sed-dev/bin/python3.11 b/.sed-dev/bin/python3.11 new file mode 120000 index 00000000..d8654aa0 --- /dev/null +++ b/.sed-dev/bin/python3.11 @@ -0,0 +1 @@ +python \ No newline at end of file diff --git a/.sed-dev/lib64 b/.sed-dev/lib64 new file mode 120000 index 00000000..7951405f --- /dev/null +++ b/.sed-dev/lib64 @@ -0,0 +1 @@ +lib \ No newline at end of file diff --git a/.sed-dev/pyvenv.cfg b/.sed-dev/pyvenv.cfg new file mode 100644 index 00000000..685910b6 --- /dev/null +++ b/.sed-dev/pyvenv.cfg @@ -0,0 +1,5 @@ +home = /software/mamba/2024.01/bin +include-system-site-packages = false +version = 3.11.7 +executable = /software/mamba/2024.01/bin/python3.11 +command = /software/mamba/2024.01/bin/python -m venv /home/abdelhak/sed/.sed-dev diff --git a/src/sed/config/flash_example_config.yaml b/src/sed/config/flash_example_config.yaml index 9fa598c1..21abe6b9 100644 --- a/src/sed/config/flash_example_config.yaml +++ b/src/sed/config/flash_example_config.yaml @@ -10,8 +10,6 @@ core: beamtime_id: 11019101 # the year of the beamtime year: 2023 - # the instrument used - instrument: hextof # hextof, wespe, etc # The paths to the raw and parquet data directories. If these are not # provided, the loader will try to find the data based on year beamtimeID etc # paths: @@ -32,6 +30,7 @@ core: # (Not to be changed by user) beamtime_dir: pg2: "/asap3/flash/gpfs/pg2/" + cfel: "/asap3/fs-flash-o/gpfs/hextof/" binning: # Histogram computation mode to use. @@ -60,6 +59,11 @@ dataframe: # Columns used for jitter correction jitter_cols: [dldPosX, dldPosY, dldTimeSteps] + # The index and formats of the data + index: [trainId, pulseId, electronId] + formats: [per_train, per_pulse, per_electron] + fill_formats: [per_train, per_pulse] # Channels with this format will be forward filled + # Column settings columns: x: dldPosX @@ -212,8 +216,7 @@ dataframe: # metadata collection from scicat # metadata: -# scicat_url: -# scicat_token: +# archiver_url: # The nexus collection routine shall be finalized soon for both instruments nexus: diff --git a/src/sed/config/lab_example_config.yaml b/src/sed/config/lab_example_config.yaml new file mode 100644 index 00000000..42d591e9 --- /dev/null +++ b/src/sed/config/lab_example_config.yaml @@ -0,0 +1,161 @@ +# This file contains the default configuration for the flash loader. + +core: + # defines the loader + loader: cfel + # Since this will run on maxwell most probably, we have a lot of cores at our disposal + num_cores: 10 + # the ID number of the beamtime + beamtime_id: 11021732 + # the year of the beamtime + year: 2025 + + # The paths to the raw and parquet data directories. If these are not + # provided, the loader will try to find the data based on year beamtimeID etc + paths: + # location of the raw data. + raw: "/asap3/fs-flash-o/gpfs/hextof/2025/data/11021732/raw/" + # location of the intermediate parquet files. + processed: "." + + # The beamtime directories for different DAQ systems. 
+ # (Not to be changed by user) + beamtime_dir: + pg2: "/asap3/flash/gpfs/pg2/" + cfel: "/asap3/fs-flash-o/gpfs/hextof/" + + +dataframe: + daq: fl1user3 # DAQ system name to resolve filenames/paths + ubid_offset: 5 # Offset correction to the pulseId + forward_fill_iterations: 0 # Number of iterations to fill the pulseId forward + split_sector_id_from_dld_time: True # Remove reserved bits for dldSectorID from dldTimeSteps column + sector_id_reserved_bits: 3 # Bits reserved for dldSectorID in the dldTimeSteps column + sector_delays: [0., 0., 0., 0., 0., 0., 0., 0.] # Sector delays + + first_event_time_stamp_key: /ScanParam/StartTime + ms_markers_key: /SlowData/exposure_time + millis_counter_key: /DLD/millisecCounter + + # Time and binning settings + tof_binwidth: 2.0576131995767355E-11 # Base time-of-flight bin width in seconds + tof_binning: 8 # Binning parameter for time-of-flight data + + # Columns used for jitter correction + index: [countId] + jitter_cols: [dldPosX, dldPosY, dldTimeSteps] + formats: [per_file, per_train, per_electron] + fill_formats: [per_train] # Channels with this format will be forward filled + + # Column settings + columns: + x: dldPosX + corrected_x: X + kx: kx + y: dldPosY + corrected_y: Y + ky: ky + tof: dldTimeSteps + tof_ns: dldTime + corrected_tof: tm + timestamp: timeStamp + auxiliary: dldAux + sector_id: dldSectorID + delay: delayStage + corrected_delay: pumpProbeTime + + units: + # These are the units of the columns + dldPosX: 'step' + dldPosY: 'step' + dldTimeSteps: 'step' + tof_voltage: 'V' + extractorVoltage: 'V' + extractorCurrent: 'A' + cryoTemperature: 'K' + sampleTemperature: 'K' + dldTime: 'ns' + delay: 'ps' + timeStamp: 's' + energy: 'eV' + E: 'eV' + kx: '1/A' + ky: '1/A' + + # The channels to load. + # channels have the following structure: + # : + # format: per_pulse/per_electron/per_train + # index_key: the hdf5 index key + # dataset_key: the hdf5 dataset key + # slice: int to slice a multidimensional data along axis=1. If not defined, there is no slicing + # dtype: the datatype of the data + # subChannels: further aliases for if the data is multidimensional and needs to be split in different cols + # used currently for the auxiliary channel + # : + # slice: int to slice a multidimensional data along axis=1. 
Must be defined + # dtype: the datatype of the data + + channels: + # event key + countId: + format: per_file + dataset_key: /DLD/NumOfEvents + # detector x position + dldPosX: + format: per_electron + dataset_key: /DLD/DLD/xPos + # dtype: uint32 + + # detector y position + dldPosY: + format: per_electron + dataset_key: /DLD/DLD/yPos + # dtype: uint32 + + # Detector time-of-flight channel + # if split_sector_id_from_dld_time is set to True, This this will generate + # also the dldSectorID channel + dldTimeSteps: + format: per_electron + dataset_key: /DLD/DLD/times + # dtype: uint32 + + # The auxiliary channel has a special structure where the group further contains + # a multidimensional structure so further aliases are defined below + dldAux: + format: per_train + dataset_key: "/SlowData/hextof/dld/info/Aux" + sub_channels: + sampleBias: + slice: 0 + dtype: float32 + tofVoltage: + slice: 1 + dtype: float64 + extractorVoltage: + slice: 2 + extractorCurrent: + slice: 3 + cryoTemperature: + slice: 4 + sampleTemperature: + slice: 5 + dldTimeBinSize: + slice: 15 + + vuRead: + format: per_train + dataset_key: /SlowData/hextof/logic/kmic1/Sample_VURead + + + +# metadata collection from scicat +# metadata: +# archiver_url: + +# The nexus collection routine shall be finalized soon for both instruments +# nexus: +# reader: "mpes" +# definition: "NXmpes" +# input_files: ["NXmpes_config-HEXTOF.json"] diff --git a/src/sed/core/config.py b/src/sed/core/config.py index d9c7b551..ae6b3ca7 100644 --- a/src/sed/core/config.py +++ b/src/sed/core/config.py @@ -18,7 +18,8 @@ package_dir = os.path.dirname(find_spec("sed").origin) -USER_CONFIG_PATH = user_config_path(appname="sed", appauthor="OpenCOMPES", ensure_exists=True) +USER_CONFIG_PATH = user_config_path(appname="sed", appauthor="OpenCOMPES") +USER_CONFIG_PATH.mkdir(parents=True, exist_ok=True) SYSTEM_CONFIG_PATH = ( Path(os.environ["ALLUSERSPROFILE"]).joinpath("sed") if platform.system() == "Windows" diff --git a/src/sed/core/config_model.py b/src/sed/core/config_model.py index bca9f959..738617f9 100644 --- a/src/sed/core/config_model.py +++ b/src/sed/core/config_model.py @@ -26,6 +26,7 @@ class PathsModel(BaseModel): raw: DirectoryPath processed: Optional[Union[DirectoryPath, NewPath]] = None + meta: Optional[Union[DirectoryPath, NewPath]] = None class CopyToolModel(BaseModel): @@ -58,7 +59,6 @@ class CoreModel(BaseModel): num_cores: Optional[PositiveInt] = None year: Optional[int] = None beamtime_id: Optional[Union[int, str]] = None - instrument: Optional[str] = None beamline: Optional[str] = None copy_tool: Optional[CopyToolModel] = None stream_name_prefixes: Optional[dict] = None @@ -134,6 +134,8 @@ class DataframeModel(BaseModel): # mpes specific settings first_event_time_stamp_key: Optional[str] = None ms_markers_key: Optional[str] = None + # cfel specific settings + millis_counter_key: Optional[str] = None # flash specific settings forward_fill_iterations: Optional[int] = None ubid_offset: Optional[int] = None @@ -141,6 +143,9 @@ class DataframeModel(BaseModel): sector_id_reserved_bits: Optional[int] = None sector_delays: Optional[Sequence[float]] = None daq: Optional[str] = None + index: Optional[Sequence[str]] = None + formats: Optional[Union[Sequence[str], str]] = None + fill_formats: Optional[Union[Sequence[str], str]] = None # SXP specific settings num_trains: Optional[PositiveInt] = None num_pulses: Optional[PositiveInt] = None diff --git a/src/sed/loader/cfel/__init__.py b/src/sed/loader/cfel/__init__.py new file mode 100644 index 
00000000..e69de29b diff --git a/src/sed/loader/cfel/buffer_handler.py b/src/sed/loader/cfel/buffer_handler.py new file mode 100644 index 00000000..47b47004 --- /dev/null +++ b/src/sed/loader/cfel/buffer_handler.py @@ -0,0 +1,340 @@ +from __future__ import annotations + +import time +from pathlib import Path + +import h5py +import numpy as np +import dask.dataframe as dd +from joblib import delayed +from joblib import Parallel + +from sed.core.logging import setup_logging +from sed.loader.cfel.dataframe import DataFrameCreator +from sed.loader.flash.buffer_handler import BufferFilePaths +from sed.loader.flash.buffer_handler import BufferHandler as BaseBufferHandler +from sed.loader.flash.utils import InvalidFileError +from sed.loader.flash.utils import get_channels +from sed.loader.flash.utils import get_dtypes + +logger = setup_logging("cfel_buffer_handler") + + +class BufferHandler(BaseBufferHandler): + """ + A class for handling the creation and manipulation of buffer files using DataFrameCreator. + """ + + def __init__( + self, + config: dict, + ) -> None: + """ + Initializes the BufferHandler. + + Args: + config (dict): The configuration dictionary. + """ + super().__init__(config) + + def _validate_h5_files(self, config, h5_paths: list[Path]) -> list[Path]: + valid_h5_paths = [] + for h5_path in h5_paths: + try: + dfc = DataFrameCreator(config_dataframe=config, h5_path=h5_path) + dfc.validate_channel_keys() + valid_h5_paths.append(h5_path) + except InvalidFileError as e: + logger.info(f"Skipping invalid file: {h5_path.stem}\n{e}") + + return valid_h5_paths + + # def _save_buffer_files(self, force_recreate: bool, debug: bool) -> None: + # """ + # Creates the buffer files that are missing, handling multi-file runs properly. + + # Args: + # force_recreate (bool): Flag to force recreation of buffer files. + # debug (bool): Flag to enable debug mode, which serializes the creation. + # """ + # file_sets = self.fp.file_sets_to_process(force_recreate) + # logger.info(f"Reading files: {len(file_sets)} new files of {len(self.fp)} total.") + + # if len(file_sets) == 0: + # return + + # # Sort file sets by filename to ensure proper order + # file_sets = sorted(file_sets, key=lambda x: x['raw'].name) + + # # Get base timestamp from the first file if we have multiple files + # base_timestamp = None + # if len(file_sets) > 1: + # try: + # # Find the first file (ends with _0000) + # first_file_set = None + # for file_set in file_sets: + # if file_set['raw'].stem.endswith('_0000'): + # first_file_set = file_set + # break + + # if first_file_set: + # # Create a temporary DataFrameCreator to extract base timestamp + # first_dfc = DataFrameCreator( + # config_dataframe=self._config, + # h5_path=first_file_set['raw'], + # is_first_file=True + # ) + # base_timestamp = first_dfc.get_base_timestamp() + # first_dfc.h5_file.close() # Clean up + # logger.info(f"Multi-file run detected. Base timestamp: {base_timestamp}") + # except Exception as e: + # logger.warning(f"Could not extract base timestamp: {e}. 
Processing files independently.") + # base_timestamp = None + + # n_cores = min(len(file_sets), self.n_cores) + # if n_cores > 0: + # if debug: + # for file_set in file_sets: + # is_first_file = file_set['raw'].stem.endswith('_0000') + # self._save_buffer_file(file_set, is_first_file, base_timestamp) + # else: + # # For parallel processing, we need to be careful about the order + # # Process all files in parallel with the correct parameters + # from joblib import delayed, Parallel + + # Parallel(n_jobs=n_cores, verbose=10)( + # delayed(self._save_buffer_file)( + # file_set, + # file_set['raw'].stem.endswith('_0000'), + # base_timestamp + # ) + # for file_set in file_sets + # ) + def _save_buffer_files(self, force_recreate: bool, debug: bool) -> None: + """ + Creates the buffer files that are missing, handling multi-file and single-file runs properly. + + Args: + force_recreate (bool): Flag to force recreation of buffer files. + debug (bool): Flag to enable debug mode, which serializes the creation. + """ + file_sets = self.fp.file_sets_to_process(force_recreate) + logger.info(f"Reading files: {len(file_sets)} new files of {len(self.fp)} total.") + + if not file_sets: + return + + # Sort file sets by filename to ensure deterministic order + file_sets = sorted(file_sets, key=lambda x: x["raw"].name) + + base_timestamp = None + + try: + if len(file_sets) == 1: + # Single-file run → that file IS the first file + first_file_set = file_sets[0] + logger.info( + f"Single-file run detected: {first_file_set['raw'].name}. " + "Extracting base timestamp from this file." + ) + + else: + # Multi-file run → look for _0000 + first_file_set = next( + fs for fs in file_sets + if fs["raw"].stem.endswith("_0000") + ) + logger.info( + f"Multi-file run detected. " + f"Extracting base timestamp from {first_file_set['raw'].name}" + ) + + # Create a temporary DataFrameCreator to extract base timestamp + first_dfc = DataFrameCreator( + config_dataframe=self._config, + h5_path=first_file_set["raw"], + is_first_file=True, + ) + base_timestamp = first_dfc.get_base_timestamp() + first_dfc.h5_file.close() + + logger.info(f"Base timestamp extracted: {base_timestamp}") + + except StopIteration: + logger.warning( + "Multi-file run detected but no '_0000' file found. " + "Base timestamp will not be extracted." + ) + except Exception as e: + logger.warning( + f"Could not extract base timestamp: {e}. " + "Processing files independently." + ) + + # ------------------------------------------------------- + # Calculate index offsets + # We need to read the 'index' channel (usually countId/NumOfEvents) to know the count. + # This requires a quick scan of files. + # ------------------------------------------------------- + index_offsets = {} + current_offset = 0 + + index_alias = self._config.get("index", ["countId"])[0] + try: + channel_config = self._config["channels"][index_alias] + dataset_key = channel_config["dataset_key"] + + # Prefer serial scan for safety and simplicity, though could be parallelized + # For 200 files it might take a few seconds. 
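+            # Illustrative example (hypothetical event counts): if file "..._0000"
+            # holds 1200 events and "..._0001" holds 800, the resulting offsets are
+            # {"..._0000": 0, "..._0001": 1200}, so the countId index stays globally
+            # monotonic across the per-file buffer files.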
+ logger.info("Calculating index offsets...") + for file_set in file_sets: + try: + with h5py.File(file_set["raw"], "r") as h5_file: + if dataset_key in h5_file: + + dset = h5_file[dataset_key] + # sum of all events in this file + # Use simple read if small enough + n_events = np.sum(dset) + + index_offsets[file_set["raw"].name] = int(current_offset) + current_offset += int(n_events) + else: + index_offsets[file_set["raw"].name] = int(current_offset) + except Exception as e: + logger.warning(f"Failed to read index offset from {file_set['raw'].name}: {e}") + index_offsets[file_set["raw"].name] = int(current_offset) + + logger.debug(f"Total events calculated: {current_offset}") + + except Exception as e: + logger.warning(f"Failed to calculate index offsets: {e}. Indices may reset.") + for fs in file_sets: + index_offsets[fs["raw"].name] = 0 + + # ------------------------------------------------------- + + n_cores = min(len(file_sets), self.n_cores) + if n_cores <= 0: + return + + def is_first_file(file_set) -> bool: + return ( + len(file_sets) == 1 + or file_set["raw"].stem.endswith("_0000") + ) + + if debug: + for file_set in file_sets: + self._save_buffer_file( + file_set, + is_first_file(file_set), + base_timestamp, + index_offset=index_offsets.get(file_set["raw"].name, 0), + ) + else: + # For parallel processing, we need to be careful about the order + # Process all files in parallel with the correct parameters + from joblib import Parallel, delayed + + Parallel(n_jobs=n_cores, verbose=10)( + delayed(self._save_buffer_file)( + file_set, + is_first_file(file_set), + base_timestamp, + index_offset=index_offsets.get(file_set["raw"].name, 0), + ) + for file_set in file_sets + ) + + def _save_buffer_file(self, file_set, is_first_file=True, base_timestamp=None, index_offset=0): + """ + Saves an HDF5 file to a Parquet file using the DataFrameCreator class. + + Args: + file_set: Dictionary containing file paths + is_first_file: Whether this is the first file in a multi-file run + base_timestamp: Base timestamp from the first file (for subsequent files) + index_offset: Offset to apply to the index + """ + start_time = time.time() # Add this line + paths = file_set + + dfc = DataFrameCreator( + config_dataframe=self._config, + h5_path=paths["raw"], + is_first_file=is_first_file, + base_timestamp=base_timestamp, + index_offset=index_offset + ) + df = dfc.df + + df_timed = dfc.df_timed + + # Save electron resolved dataframe + electron_channels = get_channels(self._config, "per_electron") + dtypes = get_dtypes(self._config, df.columns.values) + electron_df = df.dropna(subset=electron_channels).astype(dtypes).reset_index() + logger.debug(f"Saving electron buffer with shape: {electron_df.shape}") + electron_df.to_parquet(paths["electron"]) + + # Create and save timed dataframe + dtypes = get_dtypes(self._config, df_timed.columns.values) + timed_df = df_timed.astype(dtypes) + logger.debug(f"Saving timed buffer with shape: {timed_df.shape}") + timed_df.to_parquet(paths["timed"]) + + logger.debug(f"Processed {paths['raw'].stem} in {time.time() - start_time:.2f}s") + + def process_and_load_dataframe( + self, + h5_paths: list[Path], + folder: Path, + force_recreate: bool = False, + suffix: str = "", + debug: bool = False, + remove_invalid_files: bool = False, + filter_timed_by_electron: bool = True, + ) -> tuple[dd.DataFrame, dd.DataFrame]: + """ + Runs the buffer file creation process. + Does a schema check on the buffer files and creates them if they are missing. 
+        Performs forward filling and splits the sector ID from the DLD time lazily.
+
+        Args:
+            h5_paths (List[Path]): List of paths to H5 files.
+            folder (Path): Path to the folder for processed files.
+            force_recreate (bool): Flag to force recreation of buffer files.
+            suffix (str): Suffix for buffer file names.
+            debug (bool): Flag to enable debug mode, which serializes the creation.
+            remove_invalid_files (bool): Flag to remove invalid files.
+            filter_timed_by_electron (bool): Flag to filter timed data by valid electron events.
+
+        Returns:
+            Tuple[dd.DataFrame, dd.DataFrame]: The electron and timed dataframes.
+        """
+        self.filter_timed_by_electron = filter_timed_by_electron
+        if remove_invalid_files:
+            h5_paths = self._validate_h5_files(self._config, h5_paths)
+
+        self.fp = BufferFilePaths(h5_paths, folder, suffix)
+
+        if not force_recreate:
+            schema_set = set(
+                get_channels(self._config, formats="all", index=True, extend_aux=True)
+                + [self._config["columns"].get("timestamp")],
+            )
+            self._schema_check(self.fp["timed"], schema_set)
+
+            self._schema_check(self.fp["electron"], schema_set)
+
+        self._save_buffer_files(force_recreate, debug)
+
+        # All files were invalid and skipped: return empty (None) dataframes
+        if remove_invalid_files and not self.fp:
+            self.df = {"electron": None, "timed": None}
+            return self.df["electron"], self.df["timed"]
+
+        self._get_dataframes()
+
+        return self.df["electron"], self.df["timed"]
diff --git a/src/sed/loader/cfel/dataframe.py b/src/sed/loader/cfel/dataframe.py
new file mode 100644
index 00000000..8c6fd560
--- /dev/null
+++ b/src/sed/loader/cfel/dataframe.py
@@ -0,0 +1,354 @@
+"""
+This module creates pandas DataFrames from HDF5 files for different levels of data granularity
+[per electron, per pulse, and per train]. It efficiently handles concatenation of data from
+various channels within the HDF5 file, making use of the structured nature of the data to
+optimize join operations. This approach significantly enhances performance compared to earlier
+implementations.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import h5py
+import numpy as np
+import pandas as pd
+
+from sed.core.logging import setup_logging
+from sed.loader.flash.utils import get_channels
+from sed.loader.flash.utils import InvalidFileError
+
+logger = setup_logging("cfel_dataframe_creator")
+
+
+class DataFrameCreator:
+    """
+    A class for creating pandas DataFrames from an HDF5 file for HEXTOF lab data at CFEL.
+
+    Attributes:
+        h5_file (h5py.File): The HDF5 file object.
+        multi_index (pd.MultiIndex): The multi-index structure for the DataFrame.
+        _config (dict): The configuration dictionary for the DataFrame.
+    """
+
+    def __init__(self, config_dataframe: dict, h5_path: Path,
+                 is_first_file: bool = True, base_timestamp: pd.Timestamp = None,
+                 index_offset: int = 0) -> None:
+        """
+        Initializes the DataFrameCreator class.
+
+        Args:
+            config_dataframe (dict): The configuration dictionary with only the dataframe key.
+            h5_path (Path): Path to the h5 file.
+            is_first_file (bool): Whether this is the first file in a multi-file run.
+            base_timestamp (pd.Timestamp): Base timestamp from the first file (for subsequent files).
+            index_offset (int): Offset to apply to the index (countId) for multi-file runs.
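+
+        Illustrative example (hypothetical values): for per-point event counts
+        [3, 2, 4] read from the index channel (NumOfEvents) and index_offset=9,
+        the constructed slow-data index is [9, 12, 14], i.e. the cumulative event
+        count per acquisition point, shifted by the offset and with the last edge
+        dropped.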
+ """ + self.h5_file = h5py.File(h5_path, "r") + self._config = config_dataframe + self.is_first_file = is_first_file + self.base_timestamp = base_timestamp + self.index_offset = index_offset + + index_alias = self._config.get("index", ["countId"])[0] + + # get cumulative counts, but drop last because slow data only covers N-1 intervals + # Add index_offset + self.index = np.cumsum([0, *self.get_dataset_array(index_alias)])[:-1] + index_offset + + + def get_dataset_key(self, channel: str) -> str: + """ + Checks if 'dataset_key' exists and returns that. + + Args: + channel (str): The name of the channel. + + Returns: + str: The 'dataset_key'. + + Raises: + ValueError: If 'dataset_key' is not provided. + """ + channel_config = self._config["channels"][channel] + if "dataset_key" in channel_config: + return channel_config["dataset_key"] + error = f"For channel: {channel}, provide 'dataset_key'." + raise ValueError(error) + + def get_dataset_array( + self, + channel: str, + ) -> h5py.Dataset: + """ + Returns a numpy array for a given channel name. + + Args: + channel (str): The name of the channel. + slice_ (bool): Applies slicing on the dataset. Default is True. + + Returns: + h5py.Dataset: The channel's data as a h5py.Dataset object. + """ + # Get the data from the necessary h5 file and channel + dataset_key = self.get_dataset_key(channel) + dataset = self.h5_file[dataset_key] + + return dataset + + def get_base_timestamp(self) -> pd.Timestamp: + """ + Extracts the base timestamp from the first file to be used for subsequent files. + + Returns: + pd.Timestamp: The base timestamp from the first file. + """ + if not self.is_first_file: + raise ValueError("get_base_timestamp() should only be called on the first file") + + first_timestamp = self.h5_file[self._config.get("first_event_time_stamp_key")][0] + return pd.to_datetime(first_timestamp.decode()) + + @property + def df_electron(self) -> pd.DataFrame: + """ + Returns a pandas DataFrame for channel names of type [per electron]. + + Returns: + pd.DataFrame: The pandas DataFrame for the 'per_electron' channel's data. + """ + # Get the relevant channels and their slice index + channels = get_channels(self._config, "per_electron") + if channels == []: + return pd.DataFrame() + + series = { + channel: pd.Series( + self.get_dataset_array(channel), + index=pd.RangeIndex( + self.index_offset, + self.index_offset + len(self.get_dataset_array(channel)), + ), + ) + for channel in channels + } + dataframe = pd.concat(series, axis=1) + return dataframe.dropna() + + @property + def df_train(self) -> pd.DataFrame: + """ + Returns a pandas DataFrame for given channel names of type [per train]. + + Returns: + pd.DataFrame: The pandas DataFrame for the 'per_train' channel's data. 
+ """ + series = [] + # Get the relevant channel names + channels = get_channels(self._config, "per_train") + # auxiliary dataset (which is stored in the same dataset as other DLD channels) + aux_alias = self._config.get("aux_alias", "dldAux") + + # For each channel, a pd.Series is created and appended to the list + for channel in channels: + dataset = self.get_dataset_array(channel) + + if channel == aux_alias: + try: + sub_channels = self._config["channels"][aux_alias]["sub_channels"] + except KeyError: + raise KeyError( + f"Provide 'sub_channels' for auxiliary channel '{aux_alias}'.", + ) + for name, values in sub_channels.items(): + series.append( + pd.Series( + dataset[:, values["slice"]], + self.index,# changed together with __init__ line 52 + # works together with __init__ line 50, but has different len of TimeStamps and Index + # self.index[:-1], + name=name, + ), + ) + else: + series.append(pd.Series(dataset, self.index, name=channel))# changed together with __init__ line 52 + # works together with __init__ line 50, but has different len of TimeStamps and Index + # series.append(pd.Series(dataset, self.index[:-1], name=channel)) + # All the channels are concatenated to a single DataFrame + return pd.concat(series, axis=1) + + @property + def df_timestamp(self) -> pd.DataFrame: + """ + Generates a DataFrame of timestamps for each acquisition point. + + - Uses `first_event_time_stamp_key` from the first file as the global StartTime. + - Uses `millisecCounter` (if available) as a monotonic global time across all files. + - If `millisecCounter` is not available, uses cumulative exposure times from `ms_markers_key` + to approximate acquisition times. + - Returns timestamps as seconds since the UNIX epoch (1970-01-01). + + Returns + ------- + pd.DataFrame + DataFrame with a single column containing the computed timestamps. 
+ """ + # ------------------------------------------------------------ + # 1) Establish global StartTime (absolute origin) + # ------------------------------------------------------------ + start_time_key = self._config.get("first_event_time_stamp_key")#"/ScanParam/StartTime" + + if self.is_first_file: + if start_time_key not in self.h5_file: + raise KeyError("StartTime not found in first file") + + start_time_raw = self.h5_file[start_time_key][0] + base_timestamp = pd.to_datetime(start_time_raw.decode()) + logger.warning(f"DEBUG: Taking first file with ScanStart as a timestamp: {base_timestamp}") + + # Persist base timestamp for subsequent files + self.base_timestamp = base_timestamp + else: + if self.base_timestamp is None: + raise RuntimeError("base_timestamp not initialized (first file missing?)") + base_timestamp = self.base_timestamp + + # ------------------------------------------------------------ + # 2) Determine timing offsets + # ------------------------------------------------------------ + millis_key = self._config.get("millis_counter_key", "/DLD/millisecCounter") + exposure_key = self._config.get("ms_markers_key") + + if millis_key in self.h5_file and len(self.h5_file[millis_key]) > 0: + # Preferred: global millisecond counter + offsets = pd.to_timedelta( + np.asarray(self.h5_file[millis_key], dtype=np.float64), + unit="ms", + ) + logger.warning(f"DEBUG: MillisecCounter available, offsets: {offsets}") + + elif exposure_key in self.h5_file: + # Fallback: cumulative exposure time (seconds) + exposure = np.asarray(self.h5_file[exposure_key], dtype=np.float64) + offsets = pd.to_timedelta(np.cumsum(exposure), unit="s") + logger.warning(f"DEBUG: Using cumulative exposure, offsets: {offsets}") + + else: + raise ValueError( + "Cannot construct timestamps: neither millisecCounter nor exposure times available" + ) + + # ------------------------------------------------------------ + # 3) Construct absolute timestamps + # ------------------------------------------------------------ + timestamps = base_timestamp + offsets + + # Convert to UNIX seconds (float) + unix_seconds = (timestamps - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s") + + # ------------------------------------------------------------ + # 4) Build DataFrame + # ------------------------------------------------------------ + ts_alias = self._config["columns"].get("timestamp", "timeStamp") + df = pd.DataFrame({ts_alias: unix_seconds}, index=self.index) + + # # # Suppose df is your timestamp DataFrame + # print("DEBUG of df") + # ts_alias = "timeStamp" # or whatever your config uses + # timestamps = df[ts_alias].to_numpy() + + # # Compare lengths + # if len(timestamps) != len(df.index): + # print(f"Length mismatch: timestamps={len(timestamps)}, index={len(df.index)}") + + # # Detect NaNs (if any were introduced) + # nan_rows = df[df[ts_alias].isna()] + # print("Rows with NaN timestamps (if any):") + # print(nan_rows) + + # # Detect where timestamp differences are huge (likely artificial or missing) + # dt = np.diff(timestamps) + # threshold = np.median(dt) * 10 # e.g., 10× median interval + # anomalous_indices = np.where(dt > threshold)[0] + # print("Indices where timestamp jump is unusually large:") + # print(anomalous_indices) + + # # Optionally, see these rows in the DataFrame + # print(df.iloc[anomalous_indices]) + + return df + + # def validate_channel_keys(self) -> None: + # """ + # Validates if the dataset keys for all channels in the config exist in the h5 file. 
+ + # Raises: + # InvalidFileError: If the dataset keys are missing in the h5 file. + # """ + # invalid_channels = [] + # for channel in self._config["channels"]: + # dataset_key = self.get_dataset_key(channel) + # if dataset_key not in self.h5_file: + # invalid_channels.append(channel) + + # if invalid_channels: + # raise InvalidFileError(invalid_channels) + def validate_channel_keys(self) -> None: + """ + Validates if the dataset keys for all channels in the config exist in the h5 file. + + Raises: + InvalidFileError: If the dataset keys are missing in the h5 file. + """ + invalid_channels = [] + + for channel in self._config["channels"]: + dataset_key = self.get_dataset_key(channel) + + # missing key + if dataset_key not in self.h5_file: + invalid_channels.append(channel) + continue + + # empty dataset + dataset = self.h5_file[dataset_key] + if len(dataset) == 0: + invalid_channels.append(channel) + + if invalid_channels: + raise InvalidFileError(invalid_channels) + + + @property + def df(self) -> pd.DataFrame: + """ + Joins the 'per_electron', 'per_pulse' using concat operation, + returning a single dataframe. + + Returns: + pd.DataFrame: The combined pandas DataFrame. + """ + + self.validate_channel_keys() + df_train = self.df_train + df_timestamp = self.df_timestamp + df = pd.concat((self.df_electron, df_train, df_timestamp), axis=1) + ffill_cols = list(df_train.columns) + list(df_timestamp.columns) + df[ffill_cols] = df[ffill_cols].ffill() + df.index.name = self._config.get("index", ["countId"])[0] + return df + + @property + def df_timed(self) -> pd.DataFrame: + """ + Joins the 'per_electron', 'per_pulse' using concat operation, + returning a single dataframe. + + Returns: + pd.DataFrame: The combined pandas DataFrame. + """ + + self.validate_channel_keys() + df_train = self.df_train + df_timestamp = self.df_timestamp + df = pd.concat((self.df_electron, df_train, df_timestamp), axis=1, join="inner") + df.index.name = self._config.get("index", ["countId"])[0] + return df diff --git a/src/sed/loader/cfel/loader.py b/src/sed/loader/cfel/loader.py new file mode 100644 index 00000000..be8001c3 --- /dev/null +++ b/src/sed/loader/cfel/loader.py @@ -0,0 +1,855 @@ +""" +This module implements the cfel data loader (for hextof's lab data). +This loader currently supports hextof, wespe and instruments with similar structure. +The raw hdf5 data is combined and saved into buffer files and loaded as a dask dataframe. +The dataframe is an amalgamation of all h5 files for a combination of runs, where the NaNs are +automatically forward-filled across different files. +This can then be saved as a parquet for out-of-sed processing and reread back to access other +sed functionality. +""" +from __future__ import annotations + +import re +import time +from collections.abc import Sequence +from pathlib import Path + +import dask.dataframe as dd +import h5py +import numpy as np +import scipy.interpolate as sint +from natsort import natsorted +from typing import Sequence + +from sed.core.logging import set_verbosity +from sed.core.logging import setup_logging +from sed.loader.base.loader import BaseLoader +from sed.loader.cfel.buffer_handler import BufferHandler +from sed.loader.flash.metadata import MetadataRetriever + +import pandas as pd + +# Configure logging +logger = setup_logging("flash_loader") + + +class CFELLoader(BaseLoader): + """ + The class generates multiindexed multidimensional pandas dataframes from the new FLASH + dataformat resolved by both macro and microbunches alongside electrons. 
+ Only the read_dataframe (inherited and implemented) method is accessed by other modules. + + Args: + config (dict, optional): Config dictionary. Defaults to None. + verbose (bool, optional): Option to print out diagnostic information. + Defaults to True. + """ + + __name__ = "cfel" + + supported_file_types = ["h5"] + + def __init__(self, config: dict, verbose: bool = True) -> None: + """ + Initializes the FlashLoader. + + Args: + config (dict): Configuration dictionary. + verbose (bool, optional): Option to print out diagnostic information. + """ + super().__init__(config=config, verbose=verbose) + + set_verbosity(logger, self._verbose) + + self.instrument: str = self._config["core"].get("instrument", "hextof") # default is hextof + self.beamtime_dir: str = None + self.raw_dir: str = None + self.processed_dir: str = None + self.meta_dir: str = None + + @property + def verbose(self) -> bool: + """Accessor to the verbosity flag. + + Returns: + bool: Verbosity flag. + """ + return self._verbose + + @verbose.setter + def verbose(self, verbose: bool): + """Setter for the verbosity. + + Args: + verbose (bool): Option to turn on verbose output. Sets loglevel to INFO. + """ + self._verbose = verbose + set_verbosity(logger, self._verbose) + + def __len__(self) -> int: + """ + Returns the total number of rows in the electron resolved dataframe. + + Returns: + int: Total number of rows. + """ + try: + file_statistics = self.metadata["file_statistics"]["electron"] + except KeyError as exc: + raise KeyError("File statistics missing. Use 'read_dataframe' first.") from exc + + total_rows = sum(stats["num_rows"] for stats in file_statistics.values()) + return total_rows + + + def _initialize_dirs(self) -> None: + """ + Initializes the directories on Maxwell based on configuration. If paths is provided in + the configuration, the raw data directory and parquet data directory are taken from there. + Otherwise, the beamtime_id and year are used to locate the data directories. + The first path that has either online- or express- prefix, or the daq name is taken as the + raw data directory. + + Raises: + ValueError: If required values are missing from the configuration. + FileNotFoundError: If the raw data directories are not found. 
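+
+        Illustrative example (values taken from the example lab config): for
+        beamline "cfel", year 2025 and beamtime_id 11021732, beamtime_dir resolves
+        to /asap3/fs-flash-o/gpfs/hextof/2025/data/11021732/, raw data is searched
+        below its "raw" subfolder, and processed data is written to its "processed"
+        subfolder.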
+ """ + # Parses to locate the raw beamtime directory from config file + # Only raw_dir is necessary, processed_dir can be based on raw_dir, if not provided + if "paths" in self._config["core"]: + raw_dir = Path(self._config["core"]["paths"].get("raw", "")) + print(raw_dir) + processed_dir = Path( + self._config["core"]["paths"].get("processed", raw_dir.joinpath("processed")), + ) + meta_dir = Path( + self._config["core"]["paths"].get("meta", raw_dir.joinpath("meta")), + ) + beamtime_dir = Path(raw_dir).parent + + else: + try: + beamtime_id = self._config["core"]["beamtime_id"] + year = self._config["core"]["year"] + + except KeyError as exc: + raise ValueError( + "The beamtime_id and year are required.", + ) from exc + + beamtime_dir = Path( + self._config["core"]["beamtime_dir"][self._config["core"]["beamline"]], + ) + beamtime_dir = beamtime_dir.joinpath(f"{year}/data/{beamtime_id}/") + + # Use pathlib walk to reach the raw data directory + raw_paths: list[Path] = [] + + for path in beamtime_dir.joinpath("raw").glob("**/*"): + if path.is_dir(): + dir_name = path.name + if dir_name.startswith(("online-", "express-")): + raw_paths.append(path.joinpath(self._config["dataframe"]["daq"])) + elif dir_name == self._config["dataframe"]["daq"].upper(): + raw_paths.append(path) + + if not raw_paths: + raise FileNotFoundError("Raw data directories not found.") + + raw_dir = raw_paths[0].resolve() + + processed_dir = beamtime_dir.joinpath("processed") + meta_dir = beamtime_dir.joinpath("meta/fabtrack/") # cspell:ignore fabtrack + + processed_dir.mkdir(parents=True, exist_ok=True) + + self.beamtime_dir = str(beamtime_dir) + self.raw_dir = str(raw_dir) + self.processed_dir = str(processed_dir) + self.meta_dir = str(meta_dir) + + def _file_index(path: Path) -> int: + """ + Extract file index from filename. + Returns 0 for single-file runs. + """ + stem = path.stem # no extension + parts = stem.rsplit("_", 1) + + if len(parts) == 2 and parts[1].isdigit(): + return int(parts[1]) + + return 0 + + @property + def available_runs(self) -> list[int]: + # Get all files in raw_dir with "run" in their names + files = list(Path(self.raw_dir).glob("*run*")) + + # Extract run IDs from filenames + run_ids = set() + for file in files: + match = re.search(r"run(\d+)", file.name) + if match: + run_ids.add(int(match.group(1))) + + # Return run IDs in sorted order + return sorted(list(run_ids)) + + # def get_files_from_run_id( # type: ignore[override] + # self, + # run_id: str | int, + # folders: str | Sequence[str] = None, + # extension: str = "h5", + # ) -> list[str]: + # """ + # Returns a list of filenames for a given run located in the specified directory + # for the specified data acquisition (daq). + + # Args: + # run_id (str | int): The run identifier to locate. + # folders (str | Sequence[str], optional): The directory(ies) where the raw + # data is located. Defaults to config["core"]["base_folder"]. + # extension (str, optional): The file extension. Defaults to "h5". + + # Returns: + # list[str]: A list of path strings representing the collected file names. + + # Raises: + # FileNotFoundError: If no files are found for the given run in the directory. 
+ # """ + # # Define the stream name prefixes based on the data acquisition identifier + # stream_name_prefixes = self._config["core"].get("stream_name_prefixes") + + # if folders is None: + # folders = self._config["core"]["base_folder"] + + # if isinstance(folders, str): + # folders = [folders] + + # daq = self._config["dataframe"]["daq"] + + # # Generate the file patterns to search for in the directory + # if stream_name_prefixes: + # file_pattern = f"{stream_name_prefixes[daq]}_run{run_id}_*." + extension + # else: + # file_pattern = f"*{run_id}*." + extension + + # files: list[Path] = [] + # # Use pathlib to search for matching files in each directory + # for folder in folders: + # files.extend( + # natsorted( + # Path(folder).glob(file_pattern), + # key=lambda filename: str(filename).rsplit("_", maxsplit=1)[-1], + # ), + # ) + + # # Check if any files are found + # if not files: + # raise FileNotFoundError( + # f"No files found for run {run_id} in directory {str(folders)}", + # ) + + # # Return the list of found files + # return [str(file.resolve()) for file in files] + + def get_files_from_run_id( # type: ignore[override] + self, + run_id: str | int, + folders: str | Sequence[str] = None, + extension: str = "h5", + ) -> list[str]: + + stream_name_prefixes = self._config["core"].get("stream_name_prefixes") + + if folders is None: + folders = self._config["core"]["base_folder"] + + if isinstance(folders, str): + folders = [folders] + + daq = self._config["dataframe"]["daq"] + + if stream_name_prefixes: + file_pattern = f"{stream_name_prefixes[daq]}_run{run_id}*.{extension}" + else: + file_pattern = f"*{run_id}*.{extension}" + + def file_index(path: Path) -> int: + stem = path.stem + parts = stem.rsplit("_", 1) + if len(parts) == 2 and parts[1].isdigit(): + return int(parts[1]) + return 0 # single-file run + + files: list[Path] = [] + for folder in folders: + files.extend( + natsorted( + Path(folder).glob(file_pattern), + key=file_index, + ) + ) + + if not files: + raise FileNotFoundError( + f"No files found for run {run_id} in directory {folders}", + ) + + return [str(file.resolve()) for file in files] + + def _resolve_fids( + self, + fids: Sequence[int] | None = None, + runs: Sequence[int] | None = None, + first_files: int | None = None, + ) -> list[int]: + """ + Resolve run IDs or file IDs into a list of file indices into self.files. + Ensures consistent ordering in acquisition time. + + Parameters + ---------- + fids : Sequence[int] | None + Specific file indices to use. + runs : Sequence[int] | None + Run IDs to include. + first_files : int | None + If given, limits the result to the first N files. + + Returns + ------- + list[int] + List of file indices in acquisition order. + """ + if runs is not None: + fids_resolved = [] + for run_id in runs: + if self.raw_dir is None: + self._initialize_dirs() + files_in_run = self.get_files_from_run_id(run_id=run_id, folders=self.raw_dir) + fids_resolved.extend([self.files.index(f) for f in files_in_run]) + elif fids is not None: + fids_resolved = list(fids) + else: + fids_resolved = list(range(len(self.files))) + + if first_files is not None: + fids_resolved = fids_resolved[:first_files] + + return fids_resolved + + + def parse_scicat_metadata(self, token: str = None) -> dict: + """Uses the MetadataRetriever class to fetch metadata from scicat for each run. 
+ + Returns: + dict: Metadata dictionary + token (str, optional):: The scicat token to use for fetching metadata + """ + if "metadata" not in self._config: + return {} + + metadata_retriever = MetadataRetriever(self._config["metadata"], token) + metadata = metadata_retriever.get_metadata( + beamtime_id=self._config["core"]["beamtime_id"], + runs=self.runs, + metadata=self.metadata, + ) + + return metadata + + def parse_local_metadata(self) -> dict: + """Uses the MetadataRetriever class to fetch metadata from local folder for each run. + + Returns: + dict: Metadata dictionary + """ + if "metadata" not in self._config: + return {} + + metadata_retriever = MetadataRetriever(self._config["metadata"]) + metadata = metadata_retriever.get_local_metadata( + beamtime_id=self._config["core"]["beamtime_id"], + beamtime_dir=self.beamtime_dir, + meta_dir=self.meta_dir, + runs=self.runs, + metadata=self.metadata, + ) + + return metadata + + # ------------------------------- + # Count rate with millisecCounter + # ------------------------------- + def get_count_rate_ms( + self, + fids: Sequence[int] | None = None, + *, + mode: str = "file", # "file" or "point" + first_files: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Count-rate calculation using millisecCounter and NumOfEvents. + + Parameters + ---------- + fids : Sequence[int] or None + File IDs to include. Default: all. + mode : {"file", "point"} + - "point": rate per acquisition window + - "file" : one average rate per file + first_files : int or None + If given, only the first N files are used. + + Returns + ------- + rates : np.ndarray + Count rate in Hz. + times : np.ndarray + Time in seconds (window end time for point mode, last time per file for file mode) + """ + millis_key = self._config.get("millis_counter_key", "/DLD/millisecCounter") + counts_key = self._config.get("num_events_key", "/DLD/NumOfEvents") + + fids_resolved = self._resolve_fids(fids=fids, first_files=first_files) + + # ------------------------------- + # 1) Load and concatenate (for point-mode) + # ------------------------------- + ms_all = [] + counts_all = [] + file_ms_min_max = [] # store min/max per file for file-mode + file_counts_total = [] + + for fid in fids_resolved: + with h5py.File(self.files[fid], "r") as h5: + ms = np.asarray(h5[millis_key], dtype=np.float64) + c = np.asarray(h5[counts_key], dtype=np.float64) if counts_key in h5 else np.ones_like(ms) + + if len(ms) != len(c): + raise ValueError(f"Length mismatch in file {self.files[fid]}") + + ms_all.append(ms) + counts_all.append(c) + file_ms_min_max.append((ms[0], ms[-1])) + file_counts_total.append(c.sum()) + + logger.debug(f"[get_count_rate_ms] File {fid}: ms_min={ms[0]}, ms_max={ms[-1]}, counts={c.sum()}") + + # Flatten arrays for point-mode + ms_concat = np.concatenate(ms_all) + counts_concat = np.concatenate(counts_all) + + # Ensure global time order + order = np.argsort(ms_concat) + ms_concat = ms_concat[order] + counts_concat = counts_concat[order] + + # ------------------------------- + # 2) Compute point-resolved rates + # ------------------------------- + if mode == "point": + bin_size = kwds.pop("bin_size", 1) + dt = np.diff(ms_concat) * 1e-3 + if np.any(dt <= 0): + # Handle potential duplicate timestamps or jump back (should not happen with sort) + dt[dt <= 0] = 1e-6 # small epsilon + rates_point = counts_concat[1:] / dt + + if bin_size > 1: + # Apply rolling average for smoothing + rates_point = ( + pd.Series(rates_point) + .rolling(window=bin_size, center=True, min_periods=1) + 
.mean() + .values + ) + + times_point = ms_concat[1:] * 1e-3 + return rates_point, times_point + + # ------------------------------- + # 3) Compute file-resolved rates (correcting gaps) + # ------------------------------- + rates_file = [] + times_file = [] + for idx, (ms_min, ms_max) in enumerate(file_ms_min_max): + # Duration = internal file window + file_duration = ms_max - ms_min + if file_duration <= 0: + # If single point or overlapping min/max, fallback or raise? + # For single point (duration 0), rate is undefined (inf). + # Start/End timestamps usually imply a range. + # If strictly 0, we can't calculate rate. + logger.warning( + f"[get_count_rate_ms] File {fids_resolved[idx]} has duration <= 0 ({file_duration}). " + "Skipping rate calculation for this file (set to NaN).", + ) + rates_file.append(np.nan) + times_file.append((ms_min + ms_max) / 2 * 1e-3) + continue + + # print(f"Total counts: {file_counts_total[idx]}") + # print(f"File duration: {file_duration}") + rate = file_counts_total[idx] / (file_duration * 1e-3) + rates_file.append(rate) + # times_file.append(ms_max * 1e-3) # last time in file + times_file.append((ms_min + ms_max) / 2 * 1e-3) # midpoint of the file + + + logger.debug( + f"[get_count_rate_ms][file] File {fids_resolved[idx]}: ms_min={ms_min}, ms_max={ms_max}, " + f"counts={file_counts_total[idx]}, duration={file_duration} ms, rate={rate:.2f} Hz" + ) + + return np.array(rates_file), np.array(times_file) + + + # ------------------------------- + # File-based count rate + # ------------------------------- + # def get_count_rate( + # self, + # fids: Sequence[int] | None = None, + # runs: Sequence[int] | None = None, + # ) -> tuple[np.ndarray, np.ndarray]: + # """ + # Returns count rate per file using the total number of events and elapsed time. + # Calculates the count rate using the number of rows and elapsed time for each file. + # Hence the resolution is not very high, but this method is very fast. + + # Args: + # fids (Sequence[int]): A sequence of file IDs. Defaults to all files. + + # Keyword Args: + # runs: A sequence of run IDs. + + # Returns: + # tuple[np.ndarray, np.ndarray]: The count rate and elapsed time in seconds. + + # Raises: + # KeyError: If the file statistics are missing. + # """ + # fids_resolved = self._resolve_fids(fids=fids, runs=runs) + + # all_counts = [self.metadata["file_statistics"]["electron"][str(fid)]["num_rows"] for fid in fids_resolved] + # elapsed_times = [self.get_elapsed_time(fids=[fid]) for fid in fids_resolved] + # print(elapsed_times,all_counts) + + # # count_rate = np.array(all_counts) / np.array(elapsed_times) + # count_rate = np.array(all_counts) / np.array(elapsed_times).flatten() + # print(f"Count rates: {count_rate}") + # times = np.cumsum(elapsed_times) + # return count_rate, times + def get_count_rate( + self, + fids: Sequence[int] | None = None, + runs: Sequence[int] | None = None, + **kwds, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Returns the count rate. By default, returns high-resolution + point-resolved rates using the millisecond counter. + + Args: + fids (Sequence[int], optional): + File IDs to include. Defaults to all files. + runs (Sequence[int], optional): + Run IDs to include. If provided, overrides `fids`. + **kwds: + Additional arguments passed to `get_count_rate_ms`. + - mode: "point" (default) or "file". 
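+                - bin_size: rolling-average window (number of points) used to smooth
+                  "point"-mode rates. Defaults to 1 (no smoothing).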
+ + Returns: + tuple[np.ndarray, np.ndarray]: + - count_rate : array of count rates in Hz + - time : array of global times in seconds since scan start + """ + mode = kwds.pop("mode", "point") + return self.get_count_rate_ms(fids=fids, mode=mode, runs=runs, **kwds) + + # ------------------------------- + # Time-resolved count rate (binned) + # ------------------------------- + def get_count_rate_time_resolved( + self, + fids: Sequence[int] | None = None, + time_bin_size: float = 1.0, + runs: Sequence[int] | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Returns count rate in time bins using metadata timestamps. + Calculates the count rate over time within each file using timestamp binning. + + Args: + fids (Sequence[int]): A sequence of file IDs. Defaults to all files. + time_bin_size (float): Time bin size in seconds for rate calculation. Defaults to 1.0. + + Keyword Args: + runs: A sequence of run IDs. + + Returns: + tuple[np.ndarray, np.ndarray]: The count rate array and time array in seconds. + + Raises: + KeyError: If the file statistics are missing. + """ + fids_resolved = self._resolve_fids(fids=fids, runs=runs) + + all_rates = [] + all_times = [] + cumulative_time = 0.0 + + for fid in fids_resolved: + file_statistics = self.metadata["file_statistics"]["timed"] + time_stamp_alias = self._config["dataframe"]["columns"].get("timestamp", "timeStamp") + time_stamps = file_statistics[str(fid)]["columns"][time_stamp_alias] + + t_min = float(getattr(time_stamps["min"], "total_seconds", lambda: time_stamps["min"])()) + t_max = float(getattr(time_stamps["max"], "total_seconds", lambda: time_stamps["max"])()) + total_counts = self.metadata["file_statistics"]["electron"][str(fid)]["num_rows"] + file_duration = t_max - t_min + print(f"File duration: {file_duration}") + + n_bins = max(int(file_duration / time_bin_size), 1) + counts_per_bin = total_counts / n_bins + rate_per_bin = counts_per_bin / time_bin_size + + bin_centers = np.linspace( + cumulative_time + time_bin_size / 2, + cumulative_time + file_duration - time_bin_size / 2, + n_bins, + ) + + rates = np.full(n_bins, rate_per_bin) + all_rates.extend(rates) + all_times.extend(bin_centers) + + cumulative_time += file_duration + + return np.array(all_rates), np.array(all_times) + + def get_elapsed_time( + self, + fids: Sequence[int] | None = None, + *, + runs: Sequence[int] | None = None, + first_files: int | None = None, + aggregate: bool = False, + ) -> float | list[float]: + """ + Calculates the elapsed acquisition time. + + Uses global timestamp / millisecCounter logic established in + read_dataframe() and df_timestamp. + + Parameters + ---------- + fids : Sequence[int] | None + File IDs to include. + runs : Sequence[int] | None + Run IDs to include. + first_files : int | None + Limit to first N resolved files. + aggregate : bool + If True, return total elapsed time (s), + otherwise return per-file elapsed times. + + Returns + ------- + float | list[float] + Elapsed time(s) in seconds. + """ + + try: + file_statistics = self.metadata["file_statistics"]["timed"] + except Exception as exc: + raise KeyError( + "File statistics missing. Use 'read_dataframe' first." 
+ ) from exc + + ts_alias = self._config["dataframe"]["columns"].get( + "timestamp", + "timeStamp", + ) + + # ---------------------------- + # Resolve files consistently + # ---------------------------- + fids_resolved = self._resolve_fids( + fids=fids, + runs=runs, + first_files=first_files, + ) + + elapsed_per_file: list[float] = [] + + for fid in fids_resolved: + try: + ts_info = file_statistics[str(fid)]["columns"][ts_alias] + print(f"ts_info: {ts_info}") + dt = ts_info["max"] - ts_info["min"] + + # normalize to seconds + if hasattr(dt, "total_seconds"): + dt_s = dt.total_seconds() + else: + dt_s = float(dt) + + if dt_s < 0: + raise ValueError( + f"Negative elapsed time in file {fid}: {dt_s}" + ) + + except KeyError as exc: + filename = ( + Path(self.files[fid]).name + if fid < len(self.files) + else f"file_{fid}" + ) + raise KeyError( + f"Timestamp metadata missing in file {filename} (fid={fid}). " + "Add timestamp column and alias to config before loading." + ) from exc + + elapsed_per_file.append(dt_s) + + if aggregate: + print("aggregate is True") + return sum(elapsed_per_file) + + print(f"Elapsed time: {elapsed_per_file}") + return elapsed_per_file + + def read_dataframe( + self, + files: str | Sequence[str] = None, + folders: str | Sequence[str] = None, + runs: str | int | Sequence[str | int] = None, + ftype: str = "h5", + metadata: dict | None = None, + collect_metadata: bool = False, + **kwds, + ) -> tuple[dd.DataFrame, dd.DataFrame, dict]: + """ + Read express data from the DAQ, generating a parquet in between. + + Args: + files (str | Sequence[str], optional): File path(s) to process. Defaults to None. + folders (str | Sequence[str], optional): Path to folder(s) where files are stored + Path has priority such that if it's specified, the specified files will be ignored. + Defaults to None. + runs (str | int | Sequence[str | int], optional): Run identifier(s). + Corresponding files will be located in the location provided by ``folders``. + Takes precedence over ``files`` and ``folders``. Defaults to None. + ftype (str, optional): The file extension type. Defaults to "h5". + metadata (dict, optional): Additional metadata. Defaults to None. + collect_metadata (bool, optional): Whether to collect metadata. Defaults to False. + + Keyword Args: + detector (str, optional): The detector to use. Defaults to "". + force_recreate (bool, optional): Whether to force recreation of the buffer files. + Defaults to False. + processed_dir (str, optional): The directory to save the processed files. + Defaults to None. + debug (bool, optional): Whether to run buffer creation in serial. Defaults to False. + remove_invalid_files (bool, optional): Whether to exclude invalid files. + Defaults to False. + token (str, optional): The scicat token to use for fetching metadata. If provided, + will be saved to .env file for future use. If not provided, will check environment + variables when collect_metadata is True. + filter_timed_by_electron (bool, optional): When True, the timed dataframe will only + contain data points where valid electron events were detected. When False, all + timed data points are included regardless of electron detection. Defaults to True. + + Returns: + tuple[dd.DataFrame, dd.DataFrame, dict]: A tuple containing the concatenated DataFrame + and metadata. + + Raises: + ValueError: If neither 'runs' nor 'files'/'raw_dir' is provided. + FileNotFoundError: If the conversion fails for some files or no data is available. 
+ ValueError: If collect_metadata is True and no token is available. + """ + if metadata is None: + metadata = {} + + detector = kwds.pop("detector", "") + force_recreate = kwds.pop("force_recreate", False) + processed_dir = kwds.pop("processed_dir", None) + debug = kwds.pop("debug", False) + remove_invalid_files = kwds.pop("remove_invalid_files", False) + token = kwds.pop("token", None) + filter_timed_by_electron = kwds.pop("filter_timed_by_electron", True) + + if len(kwds) > 0: + raise ValueError(f"Unexpected keyword arguments: {kwds.keys()}") + t0 = time.time() + + self._initialize_dirs() + # Prepare a list of names for the runs to read and parquets to write + if runs is not None: + files = [] + runs_ = [str(runs)] if isinstance(runs, (str, int)) else list(map(str, runs)) + for run in runs_: + run_files = self.get_files_from_run_id( + run_id=run, + folders=self.raw_dir, + ) + files.extend(run_files) + self.runs = runs_ + super().read_dataframe(files=files, ftype=ftype) + else: + # This call takes care of files and folders. As we have converted runs into files + # already, they are just stored in the class by this call. + super().read_dataframe( + files=files, + folders=folders, + ftype=ftype, + metadata=metadata, + ) + + bh = BufferHandler( + config=self._config, + ) + + # if processed_dir is None, use self.processed_dir + processed_dir = processed_dir or self.processed_dir + processed_dir = Path(processed_dir) + + # Obtain the parquet filenames, metadata, and schema from the method + # which handles buffer file creation/reading + h5_paths = [Path(file) for file in self.files] + df, df_timed = bh.process_and_load_dataframe( + h5_paths=h5_paths, + folder=processed_dir, + force_recreate=force_recreate, + suffix=detector, + debug=debug, + remove_invalid_files=remove_invalid_files, + filter_timed_by_electron=filter_timed_by_electron, + ) + + scicat_metadata = self.parse_scicat_metadata(token) + scicat_runs = scicat_metadata.get("scientificMetadata", {}) + + if not any(scicat_runs.values()): + logger.warning("No SciCat metadata available, checking local folder") + self.metadata.update(self.parse_local_metadata()) + else: + logger.warning("Metadata taken from SciCat") + if collect_metadata: + self.metadata.update(scicat_metadata) + + self.metadata.update(bh.metadata) + + print(f"loading complete in {time.time() - t0: .2f} s") + + return df, df_timed, self.metadata + + + + +LOADER = CFELLoader diff --git a/src/sed/loader/flash/buffer_handler.py b/src/sed/loader/flash/buffer_handler.py index d56de29f..b68de4d4 100644 --- a/src/sed/loader/flash/buffer_handler.py +++ b/src/sed/loader/flash/buffer_handler.py @@ -1,13 +1,14 @@ from __future__ import annotations import os -from pathlib import Path import time +from pathlib import Path import dask.dataframe as dd import pyarrow.parquet as pq from joblib import delayed from joblib import Parallel +from pandas import MultiIndex from sed.core.dfops import forward_fill_lazy from sed.core.logging import setup_logging @@ -40,11 +41,9 @@ class BufferFilePaths: def __init__( self, - config: dict, h5_paths: list[Path], folder: Path, suffix: str, - remove_invalid_files: bool, ) -> None: """Initializes the BufferFilePaths. 
@@ -57,9 +56,6 @@ def __init__( folder = folder / "buffer" folder.mkdir(parents=True, exist_ok=True) - if remove_invalid_files: - h5_paths = self.remove_invalid_files(config, h5_paths) - self._file_paths = self._create_file_paths(h5_paths, folder, suffix) def _create_file_paths( @@ -93,18 +89,6 @@ def file_sets_to_process(self, force_recreate: bool = False) -> list[dict[str, P return self._file_paths return [file_set for file_set in self if any(not file_set[key].exists() for key in DF_TYP)] - def remove_invalid_files(self, config, h5_paths: list[Path]) -> list[Path]: - valid_h5_paths = [] - for h5_path in h5_paths: - try: - dfc = DataFrameCreator(config_dataframe=config, h5_path=h5_path) - dfc.validate_channel_keys() - valid_h5_paths.append(h5_path) - except InvalidFileError as e: - logger.info(f"Skipping invalid file: {h5_path.stem}\n{e}") - - return valid_h5_paths - class BufferHandler: """ @@ -125,14 +109,27 @@ def __init__( self.n_cores: int = config["core"].get("num_cores", os.cpu_count() - 1) self.fp: BufferFilePaths = None self.df: dict[str, dd.DataFrame] = {typ: None for typ in DF_TYP} + fill_formats = self._config.get("fill_formats", ["per_train", "per_pulse"]) self.fill_channels: list[str] = get_channels( self._config, - ["per_pulse", "per_train"], + fill_formats, extend_aux=True, ) self.metadata: dict = {} self.filter_timed_by_electron: bool = None + def _validate_h5_files(self, config, h5_paths: list[Path]) -> list[Path]: + valid_h5_paths = [] + for h5_path in h5_paths: + try: + dfc = DataFrameCreator(config_dataframe=config, h5_path=h5_path) + dfc.validate_channel_keys() + valid_h5_paths.append(h5_path) + except InvalidFileError as e: + logger.info(f"Skipping invalid file: {h5_path.stem}\n{e}") + + return valid_h5_paths + def _schema_check(self, files: list[Path], expected_schema_set: set) -> None: """ Checks the schema of the Parquet files. 
@@ -182,8 +179,7 @@ def _create_timed_dataframe(self, df: dd.DataFrame) -> dd.DataFrame: # Take all timed data rows without filtering df_timed = df[timed_channels] - # Take only first electron per event - return df_timed.loc[:, :, 0] + return df_timed def _save_buffer_file(self, paths: dict[str, Path]) -> None: """Creates the electron and timed buffer files from the raw H5 file.""" @@ -205,6 +201,12 @@ def _save_buffer_file(self, paths: dict[str, Path]) -> None: # Create and save timed dataframe df_timed = self._create_timed_dataframe(df) + # timed dataframe + if isinstance(df.index, MultiIndex): + # drop the electron channels and only take rows with the first electronId + df_timed = df[self.fill_channels].loc[:, :, 0] + else: + df_timed = df[self.fill_channels] dtypes = get_dtypes(self._config, df_timed.columns.values) timed_df = df_timed.astype(dtypes).reset_index() logger.debug(f"Saving timed buffer with shape: {timed_df.shape}") @@ -251,25 +253,26 @@ def _get_dataframes(self) -> None: filling = {} for typ in DF_TYP: # Read the parquet files into a dask dataframe - df = dd.read_parquet(self.fp[typ], calculate_divisions=True) + df = dd.read_parquet(self.fp[typ]) # , calculate_divisions=True) # Get the metadata from the parquet files file_stats[typ] = get_parquet_metadata(self.fp[typ]) # Forward fill the non-electron channels across files overlap = min(file["num_rows"] for file in file_stats[typ].values()) iterations = self._config.get("forward_fill_iterations", 2) - df = forward_fill_lazy( - df=df, - columns=self.fill_channels, - before=overlap, - iterations=iterations, - ) - # TODO: This dict should be returned by forward_fill_lazy - filling[typ] = { - "columns": self.fill_channels, - "overlap": overlap, - "iterations": iterations, - } + if iterations: + df = forward_fill_lazy( + df=df, + columns=self.fill_channels, + before=overlap, + iterations=iterations, + ) + # TODO: This dict should be returned by forward_fill_lazy + filling[typ] = { + "columns": self.fill_channels, + "overlap": overlap, + "iterations": iterations, + } self.df[typ] = df self.metadata.update({"file_statistics": file_stats, "filling": filling}) @@ -311,8 +314,11 @@ def process_and_load_dataframe( Returns: Tuple[dd.DataFrame, dd.DataFrame]: The electron and timed dataframes. 
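+
+        Example:
+            Illustrative sketch (assumes ``config`` is a parsed loader config and
+            ``h5_paths`` is a list of :class:`pathlib.Path` objects):
+
+            >>> bh = BufferHandler(config)
+            >>> df, df_timed = bh.process_and_load_dataframe(h5_paths, Path("processed"))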
""" - self.fp = BufferFilePaths(self._config, h5_paths, folder, suffix, remove_invalid_files) self.filter_timed_by_electron = filter_timed_by_electron + if remove_invalid_files: + h5_paths = self._validate_h5_files(self._config, h5_paths) + + self.fp = BufferFilePaths(h5_paths, folder, suffix) if not force_recreate: schema_set = set( diff --git a/src/sed/loader/flash/dataframe.py b/src/sed/loader/flash/dataframe.py index f50abe10..61bc6aa6 100644 --- a/src/sed/loader/flash/dataframe.py +++ b/src/sed/loader/flash/dataframe.py @@ -12,9 +12,9 @@ import numpy as np import pandas as pd +from sed.core.logging import setup_logging from sed.loader.flash.utils import get_channels from sed.loader.flash.utils import InvalidFileError -from sed.core.logging import setup_logging logger = setup_logging("flash_dataframe_creator") @@ -39,8 +39,8 @@ def __init__(self, config_dataframe: dict, h5_path: Path) -> None: """ logger.debug(f"Initializing DataFrameCreator for file: {h5_path}") self.h5_file = h5py.File(h5_path, "r") - self.multi_index = get_channels(index=True) self._config = config_dataframe + self.multi_index = get_channels(self._config, index=True) def get_index_dataset_key(self, channel: str) -> tuple[str, str]: """ diff --git a/src/sed/loader/flash/instruments.py b/src/sed/loader/flash/instruments.py deleted file mode 100644 index 8ef0146e..00000000 --- a/src/sed/loader/flash/instruments.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import annotations - -from dask import dataframe as dd - - -def wespe_convert(df: dd.DataFrame, df_timed: dd.DataFrame) -> tuple[dd.DataFrame, dd.DataFrame]: - df - df_timed - raise NotImplementedError("This function is not implemented yet.") diff --git a/src/sed/loader/flash/loader.py b/src/sed/loader/flash/loader.py index c2cf79b9..799399cb 100644 --- a/src/sed/loader/flash/loader.py +++ b/src/sed/loader/flash/loader.py @@ -1,6 +1,5 @@ """ This module implements the flash data loader. -This loader currently supports hextof, wespe and instruments with similar structure. The raw hdf5 data is combined and saved into buffer files and loaded as a dask dataframe. The dataframe is an amalgamation of all h5 files for a combination of runs, where the NaNs are automatically forward-filled across different files. @@ -21,7 +20,6 @@ from sed.core.logging import setup_logging from sed.loader.base.loader import BaseLoader from sed.loader.flash.buffer_handler import BufferHandler -from sed.loader.flash.instruments import wespe_convert from sed.loader.flash.metadata import MetadataRetriever # Configure logging @@ -225,10 +223,85 @@ def parse_metadata(self, token: str = None) -> dict: def get_count_rate( self, - fids: Sequence[int] = None, # noqa: ARG002 - **kwds, # noqa: ARG002 - ): - return None, None + fids: Sequence[int] = None, + **kwds, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Calculates the count rate for the specified files. + Returns high-resolution (per-train) rates by counting electrons per trainId. + + Args: + fids (Sequence[int]): A sequence of file IDs. Defaults to all files. + **kwds: Keyword arguments. + + Returns: + tuple[np.ndarray, np.ndarray]: The count rate array (Hz) and time array (seconds). 
+ """ + import h5py + import numpy as np + import pandas as pd + + if fids is None: + fids = range(len(self.files)) + + # Get the electron channel configuration + per_electron_channels = get_channels(self._config["dataframe"], "per_electron") + if not per_electron_channels: + return None, None + + # We need the 'index_key' (trainId) for an electron channel + first_channel = per_electron_channels[0] + channel_config = self._config["dataframe"]["channels"][first_channel] + index_key = channel_config["index_key"] + + all_counts = [] + all_times = [] + + # FLASH repetition rate is usually 10Hz. + # We try to use timestamps if available, otherwise fallback to trainId gaps. + time_stamp_alias = self._config["dataframe"].get("time_stamp_alias", "timeStamp") + + # We need a reference time (t0) from the first selected file + with h5py.File(self.files[fids[0]], "r") as h5: + # Try to find a global start time if any, otherwise use relative + t0 = 0 + if time_stamp_alias in h5: + # This depends on how timestamps are stored in FLASH files + # For now, we use a simple relative time if not easily found. + pass + + for fid in fids: + with h5py.File(self.files[fid], "r") as h5: + # Read trainIds of all electron events + train_ids = np.asarray(h5[index_key]) + + if len(train_ids) == 0: + continue + + # Count electrons per train + df_counts = pd.Series(train_ids).value_counts().sort_index() + counts = df_counts.values + u_train_ids = df_counts.index.values + + # Convert trainIds to relative seconds (assuming 10Hz) + # Note: This is an approximation. A better way would be to + # use the actual timestamps of the trains. + if fid == fids[0]: + t_start_id = u_train_ids[0] + + times = (u_train_ids - t_start_id) * 0.1 + + # Rate per trainId interval (usually 0.1s) + # If we assume exactly 10Hz, duration is 0.1s + rates = counts / 0.1 + + all_counts.append(rates) + all_times.append(times) + + if not all_counts: + return None, None + + return np.concatenate(all_counts), np.concatenate(all_times) def get_elapsed_time(self, fids: Sequence[int] = None, **kwds) -> float | list[float]: # type: ignore[override] """ @@ -401,9 +474,6 @@ def read_dataframe( filter_timed_by_electron=filter_timed_by_electron, ) - if self.instrument == "wespe": - df, df_timed = wespe_convert(df, df_timed) - self.metadata.update(self.parse_metadata(token) if collect_metadata else {}) self.metadata.update(bh.metadata) diff --git a/src/sed/loader/flash/metadata.py b/src/sed/loader/flash/metadata.py index 578fa9fd..9a840a22 100644 --- a/src/sed/loader/flash/metadata.py +++ b/src/sed/loader/flash/metadata.py @@ -1,13 +1,15 @@ """ The module provides a MetadataRetriever class for retrieving metadata -from a Scicat Instance based on beamtime and run IDs. +from a Scicat instance based on beamtime and run IDs. """ from __future__ import annotations +import json +from pathlib import Path import requests +import yaml -from sed.core.config import read_env_var -from sed.core.config import save_env_var +from sed.core.config import read_env_var, save_env_var from sed.core.logging import setup_logging logger = setup_logging("flash_metadata_retriever") @@ -15,51 +17,48 @@ class MetadataRetriever: """ - A class for retrieving metadata from a Scicat instance based - on beamtime and run IDs. + Retrieves metadata from SciCat or local YAML files for a given beamtime and runs. """ def __init__(self, metadata_config: dict, token: str = None) -> None: """ Initializes the MetadataRetriever class. 
- + Args: - metadata_config (dict): Takes a dict containing at least url for the scicat instance. - token (str, optional): The token to use for fetching metadata. If provided, - will be saved to .env file for future use. + metadata_config (dict): Dict containing at least 'archiver_url' for SciCat. + token (str, optional): Token for fetching metadata. Saved to .env if provided. """ - # Token handling if token: self.token = token save_env_var("SCICAT_TOKEN", self.token) else: - # Try to load token from config or .env file self.token = read_env_var("SCICAT_TOKEN") if not self.token: raise ValueError( - "Token is required for metadata collection. Either provide a token " - "parameter or set the SCICAT_TOKEN environment variable.", + "Token is required for metadata collection. Provide a token " + "or set SCICAT_TOKEN in environment." ) self.url = metadata_config.get("archiver_url") if not self.url: - raise ValueError("No URL provided for fetching metadata from scicat.") + raise ValueError("No URL provided for fetching metadata from SciCat.") - self.headers = { - "Content-Type": "application/json", - "Accept": "application/json", - } + self.headers = {"Content-Type": "application/json", "Accept": "application/json"} + # ---------------------------- + # Remote SciCat metadata + # ---------------------------- def get_metadata( self, beamtime_id: str, runs: list, - metadata: dict = None, + metadata: dict | None = None, ) -> dict: """ - Retrieves metadata for a given beamtime ID and list of runs. - + Retrieves metadata for a beamtime and runs from SciCat. + Returns a dict with 'scientificMetadata' keyed by run ID. + Args: beamtime_id (str): The ID of the beamtime. runs (list): A list of run IDs. @@ -77,12 +76,15 @@ def get_metadata( if metadata is None: metadata = {} + all_runs_metadata: dict[str, dict] = {} + for run in runs: pid = f"{beamtime_id}/{run}" - logger.debug(f"Retrieving metadata for PID: {pid}") metadata_run = self._get_metadata_per_run(pid) - metadata.update(metadata_run) # TODO: Not correct for multiple runs + # Use 'scientificMetadata' if available, otherwise entire dict + all_runs_metadata[run] = metadata_run.get("scientificMetadata", metadata_run) + metadata["scientificMetadata"] = all_runs_metadata logger.debug(f"Retrieved metadata with {len(metadata)} entries") return metadata @@ -103,44 +105,135 @@ def _get_metadata_per_run(self, pid: str) -> dict: headers2["Authorization"] = f"Bearer {self.token}" try: - logger.debug(f"Attempting to fetch metadata with new URL format for PID: {pid}") - dataset_response = requests.get( - self._create_new_dataset_url(pid), - headers=headers2, - timeout=10, - ) - dataset_response.raise_for_status() + logger.debug(f"Fetching metadata (new URL) for PID: {pid}") + response = requests.get(self._create_new_dataset_url(pid), headers=headers2, timeout=10) + response.raise_for_status() # Check if response is an empty object because wrong url for older implementation - if not dataset_response.content: + if not response.content: logger.debug("Empty response, trying old URL format") - dataset_response = requests.get( - self._create_old_dataset_url(pid), - headers=headers2, - timeout=10, - ) + response = requests.get(self._create_old_dataset_url(pid), headers=headers2, timeout=10) # If the dataset request is successful, return the retrieved metadata # as a JSON object - return dataset_response.json() - - except requests.exceptions.RequestException as exception: - logger.warning(f"Failed to retrieve metadata for PID {pid}: {str(exception)}") - return {} # 
Return an empty dictionary for this run + return response.json() + except requests.exceptions.RequestException as e: + logger.warning(f"Failed to retrieve metadata for PID {pid}: {e}") + return {} def _create_old_dataset_url(self, pid: str) -> str: - return "{burl}/{url}/%2F{npid}".format( - burl=self.url, - url="Datasets", - npid=self._reformat_pid(pid), - ) + return f"{self.url}datasets/%2F{self._reformat_pid(pid)}" def _create_new_dataset_url(self, pid: str) -> str: - return "{burl}/{url}/{npid}".format( - burl=self.url, - url="Datasets", - npid=self._reformat_pid(pid), - ) + return f"{self.url}datasets/{self._reformat_pid(pid)}" def _reformat_pid(self, pid: str) -> str: """SciCat adds a pid-prefix + "/" but at DESY prefix = "" """ - return (pid).replace("/", "%2F") + """Replace '/' with '%2F' for SciCat PID.""" + return pid.replace("/", "%2F") + + # ---------------------------- + # Local metadata + # ---------------------------- + def get_local_metadata( + self, + beamtime_id: str, + beamtime_dir: str | Path, + meta_dir: str | Path, + runs: list, + metadata: dict | None = None, + ) -> dict: + """ + Retrieves metadata for a beamtime and runs from local YAML files. + Returns a dict with 'scientificMetadata' keyed by run ID. + + Args: + beamtime_id (str): The ID of the beamtime. + beamtime_dir (str)|Path: Beamtime directory. + meta_dir (str)|Path: Local metadata directory. + runs (list): A list of run IDs. + metadata (dict, optional): The existing metadata dictionary. + Defaults to None. + + Returns: + Dict: The updated metadata dictionary. + + Raises: + Exception: If the request to retrieve metadata fails. + """ + if metadata is None: + metadata = {} + + beamtime_metadata = self._get_beamtime_metadata(beamtime_dir, beamtime_id) + metadata.update(beamtime_metadata) + + all_runs_metadata: dict[str, dict] = {} + + for run in runs: + logger.debug(f"Retrieving local metadata for run: {run}") + run_metadata = self._get_local_metadata_per_run(meta_dir, run) + all_runs_metadata[run] = run_metadata.get("_data", {}) + + metadata["scientificMetadata"] = all_runs_metadata + logger.debug(f"Retrieved metadata with {len(metadata)} entries") + return metadata + + def _get_beamtime_metadata(self, beamtime_dir: str | Path, beamtime_id: str) -> dict: + """ + Retrieves general metadata from beamtime-metadata-{beamtime_id}.json + + Args: + beamtime_dir (str)|Path: Beamtime directory. + beamtime_id (str): The ID of the beamtime. + + Returns: + Dict: The retrieved metadata dictionary. + + Raises: + Exception: If the request to retrieve metadata fails. 
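+
+        Example:
+            For beamtime_id ``11021732`` this reads, if present,
+            ``<beamtime_dir>/beamtime-metadata-11021732.json``.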
+ """ + try: + beamtime_dir = Path(beamtime_dir) + filepath = beamtime_dir / f"beamtime-metadata-{beamtime_id}.json" + with filepath.open("r") as f: + return json.load(f) + except Exception as exc: + logger.warning(f"Failed to retrieve metadata for beamtime {beamtime_id}: {exc}") + return {} + + def _get_local_metadata_per_run(self, meta_dir: str | Path, run: str) -> dict: + """ + Retrieves metadata for a specific run from the latest YAML file: + {run}_N.yaml (highest N) or fallback to {run}.yaml + """ + try: + meta_dir = Path(meta_dir) + run = str(run) + candidates: list[tuple[int, Path]] = [] + + # Look for versioned YAML files + for path in meta_dir.glob(f"{run}_*.yaml"): + try: + version = int(path.stem.split("_")[-1]) + candidates.append((version, path)) + except ValueError: + continue + + # Fallback: unversioned single file + if not candidates: + single_file = meta_dir / f"{run}.yaml" + if single_file.exists(): + candidates.append((0, single_file)) + + if not candidates: + raise FileNotFoundError(f"No metadata files found for run {run} in {meta_dir}") + + # Pick the latest version + _, latest_path = max(candidates, key=lambda x: x[0]) + logger.info(f"Loading local metadata from {latest_path.name}") + + run_metadata = yaml.safe_load(latest_path.read_text()) + return run_metadata or {"_data": {}} + + except Exception as exc: + logger.warning(f"Failed to retrieve local metadata for run {run}: {exc}") + return {"_data": {}} diff --git a/src/sed/loader/flash/utils.py b/src/sed/loader/flash/utils.py index 85bca9a4..0f41aaaa 100644 --- a/src/sed/loader/flash/utils.py +++ b/src/sed/loader/flash/utils.py @@ -1,12 +1,6 @@ from __future__ import annotations -# TODO: move to config -MULTI_INDEX = ["trainId", "pulseId", "electronId"] -PULSE_ALIAS = MULTI_INDEX[1] -FORMATS = ["per_electron", "per_pulse", "per_train"] - - def get_channels( config_dataframe: dict = {}, formats: str | list[str] = None, @@ -29,7 +23,9 @@ def get_channels( List[str]: A list of channels with the specified format(s). """ channel_dict = config_dataframe.get("channels", {}) - aux_alias = config_dataframe.get("aux_alias", "dldAux") + index_list = config_dataframe.get("index", ["trainId", "pulseId", "electronId"]) + formats_list = config_dataframe.get("formats", ["per_train", "per_pulse", "per_electron"]) + aux_alias = channel_dict.get("auxiliary", "dldAux") # If 'formats' is a single string, convert it to a list for uniform processing. if isinstance(formats, str): @@ -39,7 +35,7 @@ def get_channels( if formats == ["all"]: channels = get_channels( config_dataframe, - FORMATS, + formats_list, index, extend_aux, ) @@ -47,24 +43,25 @@ def get_channels( channels = [] - # Include channels from multi_index if 'index' is True. + # Include channels from index_list if 'index' is True. if index: - channels.extend(MULTI_INDEX) + channels.extend(index_list) if formats: # If 'formats' is a list, check if all elements are valid. - err_msg = ( - "Invalid format. Please choose from 'per_electron', 'per_pulse', 'per_train', 'all'." - ) for format_ in formats: - if format_ not in FORMATS + ["all"]: - raise ValueError(err_msg) + if format_ not in formats_list + ["all"]: + raise ValueError( + f"Invalid format: {format_}. " f"Valid formats are: {formats_list + ['all']}", + ) # Get the available channels excluding 'pulseId'. available_channels = list(channel_dict.keys()) # pulse alias is an index and should not be included in the list of channels. 
- if PULSE_ALIAS in available_channels: - available_channels.remove(PULSE_ALIAS) + # Remove index channels if they are present in available_channels. + for channel in index_list: + if channel in available_channels: + available_channels.remove(channel) for format_ in formats: # Gather channels based on the specified format(s). @@ -75,7 +72,7 @@ def get_channels( ) # Include 'dldAuxChannels' if the format is 'per_train' and extend_aux is True. # Otherwise, include 'dldAux'. - if format_ == FORMATS[2] and aux_alias in available_channels: + if format_ == "per_train" and aux_alias in available_channels: if extend_aux: channels.extend( channel_dict[aux_alias]["sub_channels"].keys(), diff --git a/tests/data/loader/cfel/20250411_12h34m03s185_000123.h5 b/tests/data/loader/cfel/20250411_12h34m03s185_000123.h5 new file mode 100644 index 00000000..c7146891 Binary files /dev/null and b/tests/data/loader/cfel/20250411_12h34m03s185_000123.h5 differ diff --git a/tests/data/loader/cfel/config.yaml b/tests/data/loader/cfel/config.yaml new file mode 100644 index 00000000..f80b90d0 --- /dev/null +++ b/tests/data/loader/cfel/config.yaml @@ -0,0 +1,160 @@ +# This file contains the default configuration for the flash loader. + +core: + # defines the loader + loader: cfel + # Since this will run on maxwell most probably, we have a lot of cores at our disposal + num_cores: 10 + # the ID number of the beamtime + beamtime_id: 11021732 + # the year of the beamtime + year: 2025 + + # The paths to the raw and parquet data directories. If these are not + # provided, the loader will try to find the data based on year beamtimeID etc + paths: + # location of the raw data. + raw: "tests/data/loader/cfel/" + # location of the intermediate parquet files. + processed: "tests/data/loader/cfel/parquet" + + # The beamtime directories for different DAQ systems. + # (Not to be changed by user) + beamtime_dir: + pg2: "/asap3/flash/gpfs/pg2/" + cfel: "/asap3/fs-flash-o/gpfs/hextof/" + + +dataframe: + daq: fl1user3 # DAQ system name to resolve filenames/paths + ubid_offset: 5 # Offset correction to the pulseId + forward_fill_iterations: 0 # Number of iterations to fill the pulseId forward + split_sector_id_from_dld_time: True # Remove reserved bits for dldSectorID from dldTimeSteps column + sector_id_reserved_bits: 3 # Bits reserved for dldSectorID in the dldTimeSteps column + sector_delays: [0., 0., 0., 0., 0., 0., 0., 0.] 
# Sector delays + + first_event_time_stamp_key: /ScanParam/StartTime + ms_markers_key: /SlowData/exposure_time + + # Time and binning settings + tof_binwidth: 2.0576131995767355E-11 # Base time-of-flight bin width in seconds + tof_binning: 8 # Binning parameter for time-of-flight data + + # Columns used for jitter correction + index: [countId] + jitter_cols: [dldPosX, dldPosY, dldTimeSteps] + formats: [per_file, per_train, per_electron] + fill_formats: [per_train] # Channels with this format will be forward filled + + # Column settings + columns: + x: dldPosX + corrected_x: X + kx: kx + y: dldPosY + corrected_y: Y + ky: ky + tof: dldTimeSteps + tof_ns: dldTime + corrected_tof: tm + timestamp: timeStamp + auxiliary: dldAux + sector_id: dldSectorID + delay: delayStage + corrected_delay: pumpProbeTime + + units: + # These are the units of the columns + dldPosX: 'step' + dldPosY: 'step' + dldTimeSteps: 'step' + tof_voltage: 'V' + extractorVoltage: 'V' + extractorCurrent: 'A' + cryoTemperature: 'K' + sampleTemperature: 'K' + dldTime: 'ns' + delay: 'ps' + timeStamp: 's' + energy: 'eV' + E: 'eV' + kx: '1/A' + ky: '1/A' + + # The channels to load. + # channels have the following structure: + # : + # format: per_pulse/per_electron/per_train + # index_key: the hdf5 index key + # dataset_key: the hdf5 dataset key + # slice: int to slice a multidimensional data along axis=1. If not defined, there is no slicing + # dtype: the datatype of the data + # subChannels: further aliases for if the data is multidimensional and needs to be split in different cols + # used currently for the auxiliary channel + # : + # slice: int to slice a multidimensional data along axis=1. Must be defined + # dtype: the datatype of the data + + channels: + # event key + countId: + format: per_file + dataset_key: /DLD/NumOfEvents + # detector x position + dldPosX: + format: per_electron + dataset_key: /DLD/DLD/xPos + # dtype: uint32 + + # detector y position + dldPosY: + format: per_electron + dataset_key: /DLD/DLD/yPos + # dtype: uint32 + + # Detector time-of-flight channel + # if split_sector_id_from_dld_time is set to True, This this will generate + # also the dldSectorID channel + dldTimeSteps: + format: per_electron + dataset_key: /DLD/DLD/times + # dtype: uint32 + + # The auxiliary channel has a special structure where the group further contains + # a multidimensional structure so further aliases are defined below + dldAux: + format: per_train + dataset_key: "/SlowData/hextof/dld/info/Aux" + sub_channels: + sampleBias: + slice: 0 + dtype: float32 + tofVoltage: + slice: 1 + dtype: float64 + extractorVoltage: + slice: 2 + extractorCurrent: + slice: 3 + cryoTemperature: + slice: 4 + sampleTemperature: + slice: 5 + dldTimeBinSize: + slice: 15 + + vuRead: + format: per_train + dataset_key: /SlowData/hextof/logic/kmic1/Sample_VURead + + + +# metadata collection from scicat +# metadata: +# archiver_url: + +# The nexus collection routine shall be finalized soon for both instruments +# nexus: +# reader: "mpes" +# definition: "NXmpes" +# input_files: ["NXmpes_config-HEXTOF.json"] diff --git a/tests/data/loader/cfel/config2.yaml b/tests/data/loader/cfel/config2.yaml new file mode 100644 index 00000000..541830f1 --- /dev/null +++ b/tests/data/loader/cfel/config2.yaml @@ -0,0 +1,163 @@ +# This file contains the default configuration for the flash loader. 
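+# This variant of the CFEL test config points `raw` at the hextof beamtime
+# directory and defines `millis_counter_key`, which the count-rate helpers use.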
+ +core: + # defines the loader + loader: cfel + # the beamline where experiment took place + beamline: cfel + # Since this will run on maxwell most probably, we have a lot of cores at our disposal + num_cores: 10 + # the ID number of the beamtime + beamtime_id: 11021732 + # the year of the beamtime + year: 2025 + + # The paths to the raw and parquet data directories. If these are not + # provided, the loader will try to find the data based on year beamtimeID etc + paths: + # location of the raw data. + raw: "/asap3/fs-flash-o/gpfs/hextof/2025/data/11021732/raw/" + # location of the intermediate parquet files. + processed: "." + + # The beamtime directories for different DAQ systems. + # (Not to be changed by user) + beamtime_dir: + pg2: "/asap3/flash/gpfs/pg2/" + cfel: "/asap3/fs-flash-o/gpfs/hextof/" + + +dataframe: + daq: fl1user3 # DAQ system name to resolve filenames/paths + ubid_offset: 5 # Offset correction to the pulseId + forward_fill_iterations: 0 # Number of iterations to fill the pulseId forward + split_sector_id_from_dld_time: True # Remove reserved bits for dldSectorID from dldTimeSteps column + sector_id_reserved_bits: 3 # Bits reserved for dldSectorID in the dldTimeSteps column + sector_delays: [0., 0., 0., 0., 0., 0., 0., 0.] # Sector delays + + first_event_time_stamp_key: /ScanParam/StartTime + ms_markers_key: /SlowData/exposure_time + millis_counter_key: /DLD/millisecCounter + + # Time and binning settings + tof_binwidth: 2.0576131995767355E-11 # Base time-of-flight bin width in seconds + tof_binning: 8 # Binning parameter for time-of-flight data + + # Columns used for jitter correction + index: [countId] + jitter_cols: [dldPosX, dldPosY, dldTimeSteps] + formats: [per_file, per_train, per_electron] + fill_formats: [per_train] # Channels with this format will be forward filled + + # Column settings + columns: + x: dldPosX + corrected_x: X + kx: kx + y: dldPosY + corrected_y: Y + ky: ky + tof: dldTimeSteps + tof_ns: dldTime + corrected_tof: tm + timestamp: timeStamp + auxiliary: dldAux + sector_id: dldSectorID + delay: delayStage + corrected_delay: pumpProbeTime + + units: + # These are the units of the columns + dldPosX: 'step' + dldPosY: 'step' + dldTimeSteps: 'step' + tof_voltage: 'V' + extractorVoltage: 'V' + extractorCurrent: 'A' + cryoTemperature: 'K' + sampleTemperature: 'K' + dldTime: 'ns' + delay: 'ps' + timeStamp: 's' + energy: 'eV' + E: 'eV' + kx: '1/A' + ky: '1/A' + + # The channels to load. + # channels have the following structure: + # : + # format: per_pulse/per_electron/per_train + # index_key: the hdf5 index key + # dataset_key: the hdf5 dataset key + # slice: int to slice a multidimensional data along axis=1. If not defined, there is no slicing + # dtype: the datatype of the data + # subChannels: further aliases for if the data is multidimensional and needs to be split in different cols + # used currently for the auxiliary channel + # : + # slice: int to slice a multidimensional data along axis=1. 
Must be defined + # dtype: the datatype of the data + + channels: + # event key + countId: + format: per_file + dataset_key: /DLD/NumOfEvents + # detector x position + dldPosX: + format: per_electron + dataset_key: /DLD/DLD/xPos + # dtype: uint32 + + # detector y position + dldPosY: + format: per_electron + dataset_key: /DLD/DLD/yPos + # dtype: uint32 + + # Detector time-of-flight channel + # if split_sector_id_from_dld_time is set to True, This this will generate + # also the dldSectorID channel + dldTimeSteps: + format: per_electron + dataset_key: /DLD/DLD/times + # dtype: uint32 + + # The auxiliary channel has a special structure where the group further contains + # a multidimensional structure so further aliases are defined below + dldAux: + format: per_train + dataset_key: "/SlowData/hextof/dld/info/Aux" + sub_channels: + sampleBias: + slice: 0 + dtype: float32 + tofVoltage: + slice: 1 + dtype: float64 + extractorVoltage: + slice: 2 + extractorCurrent: + slice: 3 + cryoTemperature: + slice: 4 + sampleTemperature: + slice: 5 + dldTimeBinSize: + slice: 15 + + vuRead: + format: per_train + dataset_key: /SlowData/hextof/logic/kmic1/Sample_VURead + + + +# metadata collection from scicat +# metadata: +# archiver_url: + +# The nexus collection routine shall be finalized soon for both instruments +# nexus: +# reader: "mpes" +# definition: "NXmpes" +# input_files: ["NXmpes_config-HEXTOF.json"] diff --git a/tests/data/loader/flash/config.yaml b/tests/data/loader/flash/config.yaml index fbbcba25..90101c81 100644 --- a/tests/data/loader/flash/config.yaml +++ b/tests/data/loader/flash/config.yaml @@ -31,6 +31,7 @@ core: # (Not to be changed by user) beamtime_dir: pg2: "/asap3/flash/gpfs/pg2/" + cfel: "/asap3/fs-flash-o/gpfs/hextof/" dataframe: @@ -52,6 +53,10 @@ dataframe: sector_delays: [0., 0., 0., 0., 0., 0., 0., 0.] jitter_cols: ["dldPosX", "dldPosY", "dldTimeSteps"] + # The index and formats of the data + index: [trainId, pulseId, electronId] + formats: [per_train, per_pulse, per_electron] + fill_formats: [per_train, per_pulse] # Channels with this format will be forward filled columns: x: dldPosX corrected_x: X diff --git a/tests/loader/cfel/__init__.py b/tests/loader/cfel/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/loader/cfel/conftest.py b/tests/loader/cfel/conftest.py new file mode 100644 index 00000000..e11a4d0d --- /dev/null +++ b/tests/loader/cfel/conftest.py @@ -0,0 +1,92 @@ +""" This module contains fixtures for the CFEL module tests. +""" +import os +import shutil +from pathlib import Path + +import h5py +import pytest + +from sed.core.config import parse_config + +test_dir = os.path.join(os.path.dirname(__file__), "../..") +# Use CFEL config instead of FLASH config +config_path = os.path.join(test_dir, "data/loader/cfel/config2.yaml") +# Use CFEL test data paths +H5_PATH = "20250411_12h34m03s185_000123.h5" +H5_PATHS = [H5_PATH] + + +@pytest.fixture +def config(): + config_dict = parse_config( + config=config_path, + user_config=None, + system_config=None, + ) + + + return config_dict + + +@pytest.fixture(name="config_dataframe") +def fixture_config_file_dataframe() -> dict: + """Fixture providing a configuration file for CFELLoader tests. + + Returns: + dict: The parsed configuration file. + """ + return parse_config(config_path, folder_config={}, user_config={}, system_config={})[ + "dataframe" + ] + + +@pytest.fixture(name="h5_file") +def fixture_h5_file() -> h5py.File: + """Fixture providing an open h5 file. 
+ + Returns: + h5py.File: The open h5 file. + """ + return h5py.File(os.path.join(test_dir, f"data/loader/cfel/{H5_PATH}"), "r") + + +@pytest.fixture(name="h5_file_copy") +def fixture_h5_file_copy(tmp_path: Path) -> h5py.File: + """Fixture providing a copy of an open h5 file. + + Returns: + h5py.File: The open h5 file copy. + """ + # Create a copy of the h5 file in a temporary directory + original_file_path = os.path.join(test_dir, f"data/loader/cfel/{H5_PATH}") + copy_file_path = tmp_path / "copy.h5" + shutil.copyfile(original_file_path, copy_file_path) + + return h5py.File(copy_file_path, "r+") + + +@pytest.fixture(name="h5_file2_copy") +def fixture_h5_file2_copy(tmp_path: Path) -> h5py.File: + """Fixture providing a copy of an open h5 file. + + Returns: + h5py.File: The open h5 file copy. + """ + # Create a copy of the h5 file in a temporary directory + original_file_path = os.path.join(test_dir, f"data/loader/cfel/{H5_PATHS[0] if len(H5_PATHS) > 1 else H5_PATH}") # Use first file if multiple, else single file + copy_file_path = tmp_path / "copy2.h5" + shutil.copyfile(original_file_path, copy_file_path) + + # Open the copy in 'read-write' mode and return it + return h5py.File(copy_file_path, "r+") + + +@pytest.fixture(name="h5_paths") +def fixture_h5_paths() -> list[Path]: + """Fixture providing a list of h5 file paths. + + Returns: + list: A list of h5 file paths. + """ + return [Path(os.path.join(test_dir, f"data/loader/cfel/{path}")) for path in H5_PATHS] diff --git a/tests/loader/cfel/test_buffer_handler.py b/tests/loader/cfel/test_buffer_handler.py new file mode 100644 index 00000000..97af4ce6 --- /dev/null +++ b/tests/loader/cfel/test_buffer_handler.py @@ -0,0 +1,372 @@ +"""Test cases for the BufferHandler class in the Flash module.""" +from copy import deepcopy +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest +from h5py import File + +from sed.loader.cfel.buffer_handler import BufferFilePaths +from sed.loader.cfel.buffer_handler import BufferHandler +from sed.loader.cfel.dataframe import DataFrameCreator +from sed.loader.cfel.loader import CFELLoader +from sed.loader.flash.utils import get_channels +from sed.loader.flash.utils import InvalidFileError + + +def create_parquet_dir(config: dict, folder: str) -> Path: + """ + Creates a directory for storing Parquet files based on the provided configuration + and folder name. + """ + + parquet_path = Path(config["core"]["paths"]["processed"]) + parquet_path = parquet_path.joinpath(folder) + parquet_path.mkdir(parents=True, exist_ok=True) + return parquet_path + + +def test_buffer_file_paths(config: dict, h5_paths: list[Path]) -> None: + """ + Test the BufferFilePath's ability to identify files that need to be read and + manage buffer file paths using a directory structure. + + This test performs several checks to ensure the BufferFilePath correctly identifies + which HDF5 files need to be read and properly manages the paths for saving buffer + files. It follows these steps: + 1. Creates a directory structure for storing buffer files and initializes the BufferHandler. + 2. Checks if the file_sets_to_process method populates the dict of missing file sets and + verify that initially, all provided files are considered missing. + 3. Checks that the paths for saving buffer files are correctly generated. + 4. Creates a single buffer file and reruns file_sets_to_process to ensure that the BufferHandler + recognizes one less missing file. + 5. 
Checks if the force_recreate parameter forces the BufferHandler to consider all files + 6. Cleans up by removing the created buffer file. + 7. Tests the handling of suffix in buffer file names (for multidetector setups) by rerunning + the checks with modified file name parameters. + """ + folder = create_parquet_dir(config, "get_files_to_read") + fp = BufferFilePaths(h5_paths, folder, suffix="") + + # check that all files are to be read + assert len(fp.file_sets_to_process()) == len(h5_paths) + # create expected paths + expected_buffer_electron_paths = [ + folder / f"buffer/electron_{Path(path).stem}" for path in h5_paths + ] + expected_buffer_timed_paths = [folder / f"buffer/timed_{Path(path).stem}" for path in h5_paths] + + # check that all buffer paths are correct + assert np.all(fp["electron"] == expected_buffer_electron_paths) + assert np.all(fp["timed"] == expected_buffer_timed_paths) + + # create a single buffer file to check if it changes + path = { + "raw": h5_paths[0], + "electron": expected_buffer_electron_paths[0], + "timed": expected_buffer_timed_paths[0], + } + bh = BufferHandler(config) + bh._save_buffer_file(path, is_first_file=True, base_timestamp=None) + + # check again for files to read and expect one less file + fp = BufferFilePaths(h5_paths, folder, suffix="") + # check that only one file is to be read + assert len(fp.file_sets_to_process()) == len(h5_paths) - 1 + + # check that both files are to be read if force_recreate is set to True + assert len(fp.file_sets_to_process(force_recreate=True)) == len(h5_paths) + + # remove buffer files + Path(path["electron"]).unlink() + Path(path["timed"]).unlink() + + # Test for adding a suffix + fp = BufferFilePaths(h5_paths, folder, "suffix") + + # expected buffer paths with prefix and suffix + for typ in ["electron", "timed"]: + expected_buffer_paths = [ + folder / "buffer" / f"{typ}_{Path(path).stem}_suffix" for path in h5_paths + ] + assert np.all(fp[typ] == expected_buffer_paths) + + +def test_buffer_schema_mismatch(config: dict, h5_paths: list[Path]) -> None: + """ + Test schema mismatch handling in BufferHandler / CFEL loader. + + Steps: + 1) Channel exists in config but NOT in HDF5 → expect InvalidFileError. + 2) Same situation, but ignored via remove_invalid_files=True → should succeed. + 3) True schema mismatch (parquet has column not in config) → expect ValueError. 
+ """ + from copy import deepcopy + + # -------------------------------------------------- + # Step 1: HDF5 missing channel → InvalidFileError + # -------------------------------------------------- + folder_step1 = create_parquet_dir(config, "schema_mismatch_step1") + config_missing_channel = deepcopy(config) + config_missing_channel["dataframe"]["channels"]["gmdTunnel2"] = { + "dataset_key": "/some/cfel/test/dataset", + "format": "per_train", + } + + with pytest.raises(InvalidFileError) as exc: + bh = BufferHandler(config_missing_channel) + bh.process_and_load_dataframe( + h5_paths=h5_paths, + folder=folder_step1, + debug=True, + force_recreate=True, # ← THIS IS REQUIRED + ) + + assert "gmdTunnel2" in str(exc.value) + + # -------------------------------------------------- + # Step 2: Same missing channel, but ignored + # All files become invalid → no buffers → FileNotFoundError + # -------------------------------------------------- + folder_step2 = create_parquet_dir(config, "schema_mismatch_step2") + + # create buffer files normally + bh_base = BufferHandler(config) + bh_base.process_and_load_dataframe( + h5_paths=h5_paths, + folder=folder_step2, + debug=True, + force_recreate=True, + ) + + # now re-run with missing channel ignored + bh_missing = BufferHandler(config_missing_channel) + bh_missing.process_and_load_dataframe( + h5_paths=h5_paths, + folder=folder_step2, + debug=True, + remove_invalid_files=True, + force_recreate=True, + ) + + # correct post-condition + assert bh_missing.df["electron"] is None + assert bh_missing.df["timed"] is None + + # -------------------------------------------------- + # Step 3: TRUE schema mismatch → ValueError + # -------------------------------------------------- + + folder_step3 = create_parquet_dir(config, "schema_mismatch_step3") + + # choose a REAL channel that exists in HDF5 + removed_channel = "dldPosX" + assert removed_channel in config["dataframe"]["channels"] + + # 1) create parquet normally (with that channel) + bh_base = BufferHandler(config) + bh_base.process_and_load_dataframe( + h5_paths=h5_paths, + folder=folder_step3, + debug=True, + force_recreate=True, + ) + + # 2) remove the channel from config + config_removed = deepcopy(config) + del config_removed["dataframe"]["channels"][removed_channel] + + # 3) reload → schema mismatch + with pytest.raises(ValueError) as exc: + bh_removed = BufferHandler(config_removed) + bh_removed.process_and_load_dataframe( + h5_paths=h5_paths, + folder=folder_step3, + debug=True, + ) + + msg = str(exc.value).lower() + assert "available channels do not match the schema" in msg + assert "missing in parquet" in msg or "missing" in msg + + +def test_save_buffer_files(config: dict, h5_paths: list[Path]) -> None: + """ + Test the BufferHandler's ability to save buffer files serially and in parallel. + + This test ensures that the BufferHandler can run both serially and in parallel, saving the + output to buffer files, and then it compares the resulting DataFrames to ensure they are + identical. This verifies that parallel processing does not affect the integrity of the data + saved. After the comparison, it cleans up by removing the created buffer files. 
+ """ + folder_serial = create_parquet_dir(config, "save_buffer_files_serial") + bh_serial = BufferHandler(config) + bh_serial.process_and_load_dataframe(h5_paths, folder_serial, debug=True) + + folder_parallel = create_parquet_dir(config, "save_buffer_files_parallel") + bh_parallel = BufferHandler(config) + bh_parallel.process_and_load_dataframe(h5_paths, folder_parallel) + + df_serial = pd.read_parquet(folder_serial) + df_parallel = pd.read_parquet(folder_parallel) + + pd.testing.assert_frame_equal(df_serial, df_parallel) + + # remove buffer files + for df_type in ["electron", "timed"]: + for path in bh_serial.fp[df_type]: + path.unlink() + for path in bh_parallel.fp[df_type]: + path.unlink() + +def test_save_buffer_files_exception( + config: dict, + h5_paths: list[Path], + h5_file_copy: File, + h5_file2_copy: File, + tmp_path: Path, +) -> None: + """Test BufferHandler exception handling for missing keys and empty datasets.""" + + folder = create_parquet_dir(config, "save_buffer_files_exception") + config_ = deepcopy(config) + + # -------------------------------------------------- + # 1) Missing dataset_key in config → ValueError + # -------------------------------------------------- + channel = "dldPosX" + del config_["dataframe"]["channels"][channel]["dataset_key"] + + with pytest.raises(ValueError): + bh = BufferHandler(config_) + bh.process_and_load_dataframe( + h5_paths, folder, debug=False + ) + + # -------------------------------------------------- + # 2) Empty dataset → InvalidFileError + # -------------------------------------------------- + config_ = deepcopy(config) + empty_channel = "testChannel" + empty_dataset_key = "test/dataset/empty/value" + + config_["dataframe"]["channels"][empty_channel] = { + "dataset_key": empty_dataset_key, + "format": "per_train", + } + + # create empty dataset in first HDF5 file + h5_file_copy.create_dataset(name=empty_dataset_key, shape=(0,)) + + # Expect InvalidFileError because dataset is empty + with pytest.raises(InvalidFileError): + bh = BufferHandler(config_) + bh.process_and_load_dataframe( + [tmp_path / "copy.h5"], + folder, + debug=False, + force_recreate=True, + ) + + # -------------------------------------------------- + # 3) remove_invalid_files=True → no error, only invalid files are skipped + # -------------------------------------------------- + # add empty dataset to second HDF5 file + h5_file2_copy.create_dataset(name=empty_dataset_key, shape=(0,)) + + bh = BufferHandler(config_) + bh.process_and_load_dataframe( + [tmp_path / "copy.h5", tmp_path / "copy2.h5"], + folder, + debug=False, + force_recreate=True, + remove_invalid_files=True, + ) + + # When all files are invalid, the DataFrames should be None + assert bh.df["electron"] is None + assert bh.df["timed"] is None + + # -------------------------------------------------- + # 4) Single invalid file → nothing valid to load + # -------------------------------------------------- + # Only provide one invalid file + bh.process_and_load_dataframe( + [tmp_path / "copy.h5"], + folder, + debug=False, + force_recreate=True, + remove_invalid_files=True, + ) + + assert bh.df["electron"] is None + assert bh.df["timed"] is None + + +def test_get_filled_dataframe(config: dict, h5_paths: list[Path]) -> None: + """Test function to verify the creation of a filled dataframe from the buffer files.""" + folder = create_parquet_dir(config, "get_filled_dataframe") + bh = BufferHandler(config) + bh.process_and_load_dataframe(h5_paths, folder) + + df = pd.read_parquet(folder) + + # The buffer 
handler's electron dataframe may have additional derived columns + # like dldSectorID that aren't in the saved parquet file + expected_columns = set(list(df.columns) + ["timeStamp", "countId", "dldSectorID"]) + assert set(bh.df["electron"].columns).issubset(expected_columns) + + # For CFEL, check that the timed dataframe contains per_train channels and timestamp + # but excludes per_electron channels (this is CFEL-specific behavior) + per_train_channels = set(get_channels(config["dataframe"], formats=["per_train"], extend_aux=True)) + per_electron_channels = set(get_channels(config["dataframe"], formats=["per_electron"])) + + timed_columns = set(bh.df["timed"].columns) + + # Timed should include per_train channels and timestamp + assert per_train_channels.issubset(timed_columns) + assert "timeStamp" in timed_columns + + # Check that we can read the data + assert len(df) > 0 + assert len(bh.df["electron"]) > 0 + assert len(bh.df["timed"]) > 0 + # remove buffer files + for df_type in ["electron", "timed"]: + for path in bh.fp[df_type]: + path.unlink() + + +def test_cfel_multi_file_handling(config: dict, h5_paths: list[Path]) -> None: + """Test CFEL's multi-file timestamp handling.""" + folder = create_parquet_dir(config, "multi_file_handling") + bh = BufferHandler(config) + + # Test that multi-file processing works with timestamp coordination + bh.process_and_load_dataframe(h5_paths=h5_paths, folder=folder, debug=True) + + # Verify that timestamps are properly coordinated across files + df = pd.read_parquet(folder) + assert "timeStamp" in df.columns # CFEL uses timeStamp, not timestamp + + # Clean up + for df_type in ["electron", "timed"]: + for path in bh.fp[df_type]: + path.unlink() + +def test_cfel_timestamp_base_handling(config: dict, h5_paths: list[Path]) -> None: + """Test CFEL's base timestamp extraction and handling.""" + if len(h5_paths) > 1: + # Test with multiple files to verify base timestamp logic + folder = create_parquet_dir(config, "timestamp_base") + bh = BufferHandler(config) + bh.process_and_load_dataframe(h5_paths=h5_paths, folder=folder, debug=True) + + # Verify processing completed successfully + assert len(bh.fp["electron"]) == len(h5_paths) + + # Clean up + for df_type in ["electron", "timed"]: + for path in bh.fp[df_type]: + path.unlink() diff --git a/tests/loader/cfel/test_cfel_loader.py b/tests/loader/cfel/test_cfel_loader.py new file mode 100644 index 00000000..1127182e --- /dev/null +++ b/tests/loader/cfel/test_cfel_loader.py @@ -0,0 +1,253 @@ +"""Tests for CFEL Loader functionality""" +from __future__ import annotations + +import os +from pathlib import Path +from typing import Literal + +import pytest + +from .test_buffer_handler import create_parquet_dir +from sed.loader.cfel.loader import CFELLoader + + +@pytest.mark.parametrize( + "sub_dir", + ["online-0/fl1user3/", "express-0/fl1user3/", "FL1USER3/"], +) +def test_initialize_dirs( + config: dict, + fs, + sub_dir: Literal["online-0/fl1user3/", "express-0/fl1user3/", "FL1USER3/"], +) -> None: + """ + Test the initialization of paths based on the configuration and directory structures. + + Args: + fs: A fixture for a fake file system. + sub_dir (Literal["online-0/fl1user3/", "express-0/fl1user3/", "FL1USER3/"]): Sub-directory. + """ + config_ = config.copy() + del config_["core"]["paths"] + config_["core"]["beamtime_id"] = "12345678" + config_["core"]["year"] = "2000" + + # Find base path of beamline from config. 
Here, we use cfel for CFEL loader + base_path = config_["core"]["beamtime_dir"]["cfel"] + expected_path = ( + Path(base_path) / config_["core"]["year"] / "data" / config_["core"]["beamtime_id"] + ) + # Create expected paths + expected_raw_path = expected_path / "raw" / sub_dir + expected_processed_path = expected_path / "processed" + + # Create a fake file system for testing + fs.create_dir(expected_raw_path) + fs.create_dir(expected_processed_path) + + # Instance of class with correct config and call initialize_dirs + fl = CFELLoader(config=config_) + fl._initialize_dirs() + assert str(expected_raw_path) == fl.raw_dir + assert str(expected_processed_path) == fl.processed_dir + + # remove beamtime_id, year and daq from config to raise error + del config_["core"]["beamtime_id"] + with pytest.raises(ValueError) as e: + fl._initialize_dirs() + assert "The beamtime_id and year are required." in str(e.value) + + +def test_initialize_dirs_filenotfound(config: dict) -> None: + """ + Test FileNotFoundError during the initialization of paths. + """ + # Test the FileNotFoundError + config_ = config.copy() + del config_["core"]["paths"] + config_["core"]["beamtime_id"] = "11111111" + config_["core"]["year"] = "2000" + + # Instance of class with correct config and call initialize_dirs + with pytest.raises(FileNotFoundError): + fl = CFELLoader(config=config_) + fl._initialize_dirs() + + +def test_save_read_parquet_cfel(config: dict) -> None: + """ + Test the functionality of saving and reading parquet files with CFELLoader. + + This test performs three main actions: + 1. First call to create and read parquet files. Verifies new files are created. + 2. Second call with the same parameters to check that it only reads from + the existing parquet files without creating new ones. It asserts that the files' modification + times remain unchanged, indicating no new files were created or existing files overwritten. + 3. Third call with `force_recreate=True` to force the recreation of parquet files. + It verifies that the files were indeed overwritten by checking that their modification + times have changed. + """ + config_ = config.copy() + data_parquet_dir = create_parquet_dir(config_, "cfel_save_read") + config_["core"]["paths"]["processed"] = data_parquet_dir + # Update the raw path to point to the CFEL test data directory + config_["core"]["paths"]["raw"] = "tests/data/loader/cfel/" + fl = CFELLoader(config=config_) + + # First call: should create and read the parquet file + df1, _, _ = fl.read_dataframe(runs=[123], force_recreate=True)#was runs = [179] + # Check if new files were created + data_parquet_dir = data_parquet_dir.joinpath("buffer") + new_files = { + file: os.path.getmtime(data_parquet_dir.joinpath(file)) + for file in os.listdir(data_parquet_dir) + } + assert new_files + + # Second call: should only read the parquet file, not create new ones + df2, _, _ = fl.read_dataframe(runs=[123]) + + # Verify no new files were created after the second call + final_files = { + file: os.path.getmtime(data_parquet_dir.joinpath(file)) + for file in os.listdir(data_parquet_dir) + } + assert ( + new_files == final_files + ), "Files were overwritten or new files were created after the second call." 
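+    # The mtime comparison above is what demonstrates the caching behaviour: the second
+    # read_dataframe call is expected to be served entirely from the existing buffer files.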
+ + # Third call: We force_recreate the parquet files + df3, _, _ = fl.read_dataframe(runs=[123], force_recreate=True) + + # Verify files were overwritten + new_files = { + file: os.path.getmtime(data_parquet_dir.joinpath(file)) + for file in os.listdir(data_parquet_dir) + } + assert new_files != final_files, "Files were not overwritten after the third call." + + # remove the parquet files + for file in new_files: + data_parquet_dir.joinpath(file).unlink() + + +def test_get_elapsed_time_fid(config: dict) -> None: + """Test get_elapsed_time method of CFELLoader class""" + # Create an instance of CFELLoader + fl = CFELLoader(config=config) + + # Mock the file_statistics and files + fl.metadata = { + "file_statistics": { + "timed": { + "0": {"columns": {"timeStamp": {"min": 10, "max": 20}}}, + "1": {"columns": {"timeStamp": {"min": 20, "max": 30}}}, + "2": {"columns": {"timeStamp": {"min": 30, "max": 40}}}, + }, + }, + } + fl.files = ["file0", "file1", "file2"] + + # ------------------------- + # Aggregate=True → sum differences + # ------------------------- + elapsed_total = fl.get_elapsed_time(fids=[0, 1], aggregate=True) + expected_total = (20 - 10) + (30 - 20) # 20 + assert elapsed_total == expected_total + + # ------------------------- + # Aggregate=False → list of per-file differences + # ------------------------- + elapsed_list = fl.get_elapsed_time(fids=[0, 1], aggregate=False) + expected_list = [(20 - 10), (30 - 20)] # [10, 10] + assert elapsed_list == expected_list + + # ------------------------- + # Test KeyError when file_statistics is missing + # ------------------------- + fl.metadata = {"something": "else"} + with pytest.raises(KeyError) as e: + fl.get_elapsed_time(fids=[0, 1]) + assert "File statistics missing. Use 'read_dataframe' first." 
in str(e.value) + + # ------------------------- + # Test KeyError when timeStamp metadata is missing for a file + # ------------------------- + fl.metadata = { + "file_statistics": { + "timed": { + "0": {}, + "1": {"columns": {"timeStamp": {"min": 20, "max": 30}}}, + }, + }, + } + with pytest.raises(KeyError) as e: + fl.get_elapsed_time(fids=[0, 1]) + assert "Timestamp metadata missing in file file0 (fid=0)" in str(e.value) + + +def test_get_elapsed_time_run(config: dict) -> None: + """Test get_elapsed_time method for runs with multiple files""" + config_ = config.copy() + data_parquet_dir = create_parquet_dir(config_, "get_elapsed_time_run") + config_["core"]["paths"]["processed"] = data_parquet_dir + config_["core"]["paths"]["raw"] = "tests/data/loader/cfel/" + + # Create an instance of CFELLoader + fl = CFELLoader(config=config_) + + # Read dataframe for run 123 + fl.read_dataframe(runs=[123]) + + # Extract expected elapsed times per file from metadata + file_stats = fl.metadata["file_statistics"]["electron"] + expected_elapsed_list = [ + file_stats[str(fid)]["columns"]["timeStamp"]["max"] + - file_stats[str(fid)]["columns"]["timeStamp"]["min"] + for fid in range(len(fl.files)) + ] + + # ------------------------- + # Aggregate=False → list of per-file elapsed times + # ------------------------- + elapsed_list = fl.get_elapsed_time(runs=[123], aggregate=False) + assert elapsed_list == expected_elapsed_list + + # ------------------------- + # Aggregate=True → sum of per-file elapsed times + # ------------------------- + elapsed_total = fl.get_elapsed_time(runs=[123], aggregate=True) + expected_total = sum(expected_elapsed_list) + assert elapsed_total == expected_total + + # ------------------------- + # Remove the parquet files created during test + # ------------------------- + buffer_dir = Path(fl.processed_dir, "buffer") + if buffer_dir.exists(): + for file in buffer_dir.iterdir(): + file.unlink() + + +def test_available_runs(monkeypatch: pytest.MonkeyPatch, config: dict) -> None: + """Test available_runs property of CFELLoader class""" + # Create an instance of CFELLoader + fl = CFELLoader(config=config) + + # Mock the raw_dir and files + fl.raw_dir = "/path/to/raw_dir" + files = [ + "run1_file1.h5", + "run3_file1.h5", + "run2_file1.h5", + "run1_file2.h5", + ] + + # Mock the glob method to return the mock files + def mock_glob(*args, **kwargs): # noqa: ARG001 + return [Path(fl.raw_dir, file) for file in files] + + monkeypatch.setattr(Path, "glob", mock_glob) + + # Test available_runs + assert fl.available_runs == [1, 2, 3] diff --git a/tests/loader/cfel/test_dataframe_creator.py b/tests/loader/cfel/test_dataframe_creator.py new file mode 100644 index 00000000..dc04d24d --- /dev/null +++ b/tests/loader/cfel/test_dataframe_creator.py @@ -0,0 +1,212 @@ +"""Tests for DataFrameCreator functionality (per_file, per_train, per_electron)""" +from pathlib import Path + +import h5py +import numpy as np +import pytest +import pandas as pd + +from sed.loader.cfel.dataframe import DataFrameCreator +from sed.loader.flash.utils import get_channels + + +def test_get_dataset_key(config_dataframe: dict, h5_paths: list[Path]) -> None: + df = DataFrameCreator(config_dataframe, h5_paths[0]) + channel = "dldPosX" + dataset_key = df.get_dataset_key(channel) + assert dataset_key == config_dataframe["channels"][channel]["dataset_key"] + + config_copy = config_dataframe.copy() + del config_copy["channels"][channel]["dataset_key"] + df2 = DataFrameCreator(config_copy, h5_paths[0]) + with 
pytest.raises(ValueError): + df2.get_dataset_key(channel) + + +def test_get_dataset_array(config_dataframe: dict, h5_paths: list[Path]) -> None: + df = DataFrameCreator(config_dataframe, h5_paths[0]) + for channel in config_dataframe["channels"]: + dset = df.get_dataset_array(channel) + assert isinstance(dset, h5py.Dataset) + assert dset.shape[0] > 0 + + +def test_df_per_file(config_dataframe: dict, h5_paths: list[Path]) -> None: + """Test per_file data (countId index)""" + df = DataFrameCreator(config_dataframe, h5_paths[0]) + per_file_channels = get_channels(config_dataframe, "per_file") + if not per_file_channels: + pytest.skip("No per_file channels in config") + + # Index should be countId + df_file = df.df # combined DataFrame includes per_file data + assert "countId" in df_file.index.names or df_file.index.name == "countId" + + # All per_file columns exist in df + for ch in per_file_channels: + assert ch in df_file.columns + + +def test_df_train(config_dataframe: dict, h5_paths: list[Path]) -> None: + """Test df_train (per_train channels)""" + df = DataFrameCreator(config_dataframe, h5_paths[0]) + per_train_channels = get_channels(config_dataframe, "per_train") + aux_alias = config_dataframe.get("aux_alias", "dldAux") + if aux_alias in config_dataframe["channels"]: + subchannels = config_dataframe["channels"][aux_alias].get("sub_channels", {}) + per_train_channels.extend(subchannels.keys()) + + if not per_train_channels: + pytest.skip("No per_train channels in config") + + df_train = df.df_train + assert isinstance(df_train, pd.DataFrame) + + # Index should be single-level trainId (because no pulseId/electronId in current code) + assert df_train.index.name == "trainId" or df_train.index.name is None + + # Columns check + assert set(df_train.columns).issubset(set(per_train_channels)) + + +def test_df_electron(config_dataframe: dict, h5_paths: list[Path]) -> None: + """Test df_electron (per_electron channels)""" + df = DataFrameCreator(config_dataframe, h5_paths[0]) + per_electron_channels = get_channels(config_dataframe, "per_electron") + if not per_electron_channels: + pytest.skip("No per-electron channels in config") + + df_elec = df.df_electron + assert isinstance(df_elec, pd.DataFrame) + + # Index can be RangeIndex (single-level) if trainId/electronId not implemented + idx = df_elec.index + assert idx is not None + # Columns + assert set(df_elec.columns).issubset(set(per_electron_channels)) + # No NaNs + assert not df_elec.isnull().values.any() + +# def test_df_electron(config_dataframe: dict, h5_paths: list[Path]) -> None: +# """Test df_electron (per_electron channels)""" +# df = DataFrameCreator(config_dataframe, h5_paths[0]) +# per_electron_channels = get_channels(config_dataframe, "per_electron") +# if not per_electron_channels: +# pytest.skip("No per-electron channels in config") + +# df_elec = df.df_electron +# assert isinstance(df_elec, pd.DataFrame) +# # MultiIndex: trainId + electronId +# idx = df_elec.index +# assert isinstance(idx, pd.MultiIndex) +# assert set(idx.names) == {"trainId", "electronId"} + +# # Columns +# assert set(df_elec.columns).issubset(set(per_electron_channels)) +# # No NaNs +# assert not df_elec.isnull().values.any() + + +def test_df_timestamp(config_dataframe: dict, h5_paths: list[Path]) -> None: + """Test timestamp DataFrame""" + df = DataFrameCreator(config_dataframe, h5_paths[0]) + ts_df = df.df_timestamp + assert isinstance(ts_df, pd.DataFrame) + ts_col = config_dataframe["columns"].get("timestamp", "timeStamp") + assert ts_col in ts_df.columns 
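+    # ts_col resolves through config_dataframe["columns"] and falls back to "timeStamp",
+    # matching the CFEL column naming used throughout these tests.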
+ # Length matches main index + assert ts_df.shape[0] == len(df.index) + + +def test_df_combined(config_dataframe: dict, h5_paths: list[Path]) -> None: + dfc = DataFrameCreator(config_dataframe, h5_paths[0]) + df = dfc.df + + assert isinstance(df, pd.DataFrame) + + df_elec = dfc.df_electron + df_train = dfc.df_train + df_ts = dfc.df_timestamp + + # 1) All electron rows must be present in the combined DF + assert df_elec.index.isin(df.index).all() + + # 2) Electron values must be unchanged (dtype upcast is OK) + pd.testing.assert_frame_equal( + df.loc[df_elec.index, df_elec.columns], + df_elec, + check_dtype=False, + ) + + # 3) Columns must be the union + expected_cols = ( + set(df_elec.columns) + | set(df_train.columns) + | set(df_ts.columns) + ) + assert set(df.columns) == expected_cols + + # 4) per_train + timestamp columns must be forward-filled + ffill_cols = list(df_train.columns) + list(df_ts.columns) + assert not df[ffill_cols].isna().any().any() + +# def test_df_combined(config_dataframe: dict, h5_paths: list[Path]) -> None: +# """Test df property (combined DataFrame)""" +# df = DataFrameCreator(config_dataframe, h5_paths[0]) +# combined = df.df +# assert isinstance(combined, pd.DataFrame) + +# # Columns = per_file + per_train + per_electron + timestamp +# expected_cols = set() +# try: +# expected_cols.update(get_channels(config_dataframe, "per_file")) +# except ValueError: +# pass +# try: +# expected_cols.update(get_channels(config_dataframe, "per_train")) +# except ValueError: +# pass +# try: +# expected_cols.update(get_channels(config_dataframe, "per_electron")) +# except ValueError: +# pass +# expected_cols.add(config_dataframe["columns"].get("timestamp", "timeStamp")) + +# # Columns in combined are subset of expected +# assert set(combined.columns).issubset(expected_cols) + + +def test_group_name_not_in_h5( + config_dataframe: dict, + h5_paths: list[Path], +) -> None: + """Test error when dataset_key does not exist in H5 file.""" + + # Pick a non-index channel + channel = next( + ch for ch in config_dataframe["channels"] + if ch != config_dataframe.get("index", ["countId"])[0] + ) + + # Deep copy only what we mutate + config = dict(config_dataframe) + config["channels"] = dict(config_dataframe["channels"]) + config["channels"][channel] = dict(config_dataframe["channels"][channel]) + + # Break ONLY this channel + config["channels"][channel]["dataset_key"] = "/this/does/not/exist" + + dfc = DataFrameCreator(config, h5_paths[0]) + + with pytest.raises(KeyError): + _ = dfc.get_dataset_array(channel) + +# def test_group_name_not_in_h5(config_dataframe: dict, h5_paths: list[Path]) -> None: +# """Test KeyError when a dataset_key is missing""" +# channel = "dldPosX" +# config = config_dataframe.copy() +# config["channels"][channel]["dataset_key"] = "non_existent_dataset" + +# df = DataFrameCreator(config, h5_paths[0]) +# with pytest.raises(KeyError): +# _ = df.df_train diff --git a/tests/loader/cfel/test_metadata.py b/tests/loader/cfel/test_metadata.py new file mode 100644 index 00000000..8431c0bb --- /dev/null +++ b/tests/loader/cfel/test_metadata.py @@ -0,0 +1,64 @@ + +import pytest +from unittest.mock import MagicMock, patch +from sed.loader.cfel.loader import CFELLoader +from sed.core.config import parse_config +import os + +# Dummy config +config = { + "core": { + "instrument": "hextof", + "beamtime_id": "12345", + "year": "2024", + "beamline": "pg2", + "beamtime_dir": {"pg2": "/tmp/beamtime"}, + "paths": {"raw": "/tmp/raw"} + }, + "dataframe": { + "daq": "fadc" + }, + 
"metadata": { + "scicat_url": "http://fake.url" + } +} + +@pytest.fixture +def loader(): + return CFELLoader(config=config) + +def test_parse_scicat_metadata(loader): + with patch("sed.loader.cfel.loader.MetadataRetriever") as MockRetriever: + instance = MockRetriever.return_value + instance.get_metadata.return_value = {"scientificMetadata": {"key": "value"}} + + loader.runs = ["1"] + meta = loader.parse_scicat_metadata(token="fake_token") + + assert meta == {"scientificMetadata": {"key": "value"}} + instance.get_metadata.assert_called_once_with( + beamtime_id="12345", + runs=["1"], + metadata={}, + ) + +def test_parse_local_metadata(loader): + with patch("sed.loader.cfel.loader.MetadataRetriever") as MockRetriever: + instance = MockRetriever.return_value + instance.get_local_metadata.return_value = {"local": "meta"} + + loader.runs = ["1"] + # Mock paths since _initialize_dirs might not be called or fail + loader.beamtime_dir = "/tmp/bt" + loader.meta_dir = "/tmp/meta" + + meta = loader.parse_local_metadata() + + assert meta == {"local": "meta"} + instance.get_local_metadata.assert_called_once_with( + beamtime_id="12345", + beamtime_dir="/tmp/bt", + meta_dir="/tmp/meta", + runs=["1"], + metadata={}, + ) diff --git a/tests/loader/flash/test_buffer_handler.py b/tests/loader/flash/test_buffer_handler.py index 3eb0e625..62c696c8 100644 --- a/tests/loader/flash/test_buffer_handler.py +++ b/tests/loader/flash/test_buffer_handler.py @@ -45,7 +45,7 @@ def test_buffer_file_paths(config: dict, h5_paths: list[Path]) -> None: the checks with modified file name parameters. """ folder = create_parquet_dir(config, "get_files_to_read") - fp = BufferFilePaths(config, h5_paths, folder, suffix="", remove_invalid_files=False) + fp = BufferFilePaths(h5_paths, folder, suffix="") # check that all files are to be read assert len(fp.file_sets_to_process()) == len(h5_paths) @@ -70,7 +70,7 @@ def test_buffer_file_paths(config: dict, h5_paths: list[Path]) -> None: bh._save_buffer_file(path) # check again for files to read and expect one less file - fp = BufferFilePaths(config, h5_paths, folder, suffix="", remove_invalid_files=False) + fp = BufferFilePaths(h5_paths, folder, suffix="") # check that only one file is to be read assert len(fp.file_sets_to_process()) == len(h5_paths) - 1 @@ -82,7 +82,7 @@ def test_buffer_file_paths(config: dict, h5_paths: list[Path]) -> None: Path(path["timed"]).unlink() # Test for adding a suffix - fp = BufferFilePaths(config, h5_paths, folder, "suffix", remove_invalid_files=False) + fp = BufferFilePaths(h5_paths, folder, "suffix") # expected buffer paths with prefix and suffix for typ in ["electron", "timed"]: diff --git a/tests/loader/flash/test_utils.py b/tests/loader/flash/test_utils.py index 929a9305..d65d8010 100644 --- a/tests/loader/flash/test_utils.py +++ b/tests/loader/flash/test_utils.py @@ -45,8 +45,8 @@ def test_get_channels_by_format(config_dataframe: dict) -> None: # Request channels for 'all' formats using a list. format_all = get_channels(ch_dict, ["all"]) - # Request index channels only. No need for channel_dict. - format_index = get_channels(index=True) + # Request index channels only. + format_index = get_channels(ch_dict, index=True) # Request 'per_electron' format and include index channels. 
format_index_electron = get_channels(ch_dict, ["per_electron"], index=True) diff --git a/tests/loader/test_loaders.py b/tests/loader/test_loaders.py index a5b357d0..da13fcad 100644 --- a/tests/loader/test_loaders.py +++ b/tests/loader/test_loaders.py @@ -22,7 +22,13 @@ test_data_dir = os.path.join(test_dir, "data") read_types = ["one_file", "files", "one_folder", "folders", "one_run", "runs"] -runs = {"generic": None, "mpes": ["30", "50"], "flash": ["43878", "43878"], "sxp": ["0016", "0016"]} +runs = { + "generic": None, + "mpes": ["30", "50"], + "flash": ["43878", "43878"], + "sxp": ["0016", "0016"], + "cfel": ["123"], +} def get_loader_name_from_loader_object(loader: BaseLoader) -> str: @@ -94,7 +100,7 @@ def test_has_correct_read_dataframe_func(loader: BaseLoader, read_type: str) -> assert callable(loader.read_dataframe) # Fix for race condition during parallel testing - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: config = deepcopy(loader._config) # pylint: disable=protected-access config["core"]["paths"]["processed"] = Path( config["core"]["paths"]["processed"], @@ -167,7 +173,7 @@ def test_has_correct_read_dataframe_func(loader: BaseLoader, read_type: str) -> assert loaded_dataframe.npartitions == expected_size assert isinstance(loaded_metadata, dict) - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: loader = cast(FlashLoader, loader) loader._initialize_dirs() for file in os.listdir(Path(loader.processed_dir, "buffer")): @@ -183,7 +189,7 @@ def test_timed_dataframe(loader: BaseLoader) -> None: """ # Fix for race condition during parallel testing - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: config = deepcopy(loader._config) # pylint: disable=protected-access config["core"]["paths"]["processed"] = Path( config["core"]["paths"]["processed"], @@ -201,7 +207,7 @@ def test_timed_dataframe(loader: BaseLoader) -> None: collect_metadata=False, ) if loaded_timed_dataframe is None: - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: loader = cast(FlashLoader, loader) loader._initialize_dirs() for file in os.listdir(Path(loader.processed_dir, "buffer")): @@ -211,7 +217,7 @@ def test_timed_dataframe(loader: BaseLoader) -> None: assert set(loaded_timed_dataframe.columns).issubset(set(loaded_dataframe.columns)) assert loaded_timed_dataframe.npartitions == loaded_dataframe.npartitions - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: loader = cast(FlashLoader, loader) loader._initialize_dirs() for file in os.listdir(Path(loader.processed_dir, "buffer")): @@ -227,7 +233,7 @@ def test_get_count_rate(loader: BaseLoader) -> None: """ # Fix for race condition during parallel testing - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: config = deepcopy(loader._config) # pylint: disable=protected-access config["core"]["paths"]["processed"] = Path( config["core"]["paths"]["processed"], @@ -246,7 +252,7 @@ def test_get_count_rate(loader: BaseLoader) -> None: ) loaded_time, loaded_countrate = loader.get_count_rate() if loaded_time is None and loaded_countrate is None: - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: loader = cast(FlashLoader, loader) loader._initialize_dirs() for file in os.listdir(Path(loader.processed_dir, "buffer")): @@ -261,7 +267,7 @@ def test_get_count_rate(loader: BaseLoader) -> None: with 
pytest.raises(TypeError): loader.get_count_rate(illegal_kwd=True) - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: loader = cast(FlashLoader, loader) loader._initialize_dirs() for file in os.listdir(Path(loader.processed_dir, "buffer")): @@ -277,7 +283,7 @@ def test_get_elapsed_time(loader: BaseLoader) -> None: """ # Fix for race condition during parallel testing - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: config = deepcopy(loader._config) # pylint: disable=protected-access config["core"]["paths"]["processed"] = Path( config["core"]["paths"]["processed"], @@ -311,7 +317,7 @@ def test_get_elapsed_time(loader: BaseLoader) -> None: with pytest.raises(TypeError): loader.get_elapsed_time(illegal_kwd=True) - if loader.__name__ in {"flash", "sxp"}: + if loader.__name__ in {"flash", "sxp", "cfel"}: loader = cast(FlashLoader, loader) loader._initialize_dirs() for file in os.listdir(Path(loader.processed_dir, "buffer")):
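
For readers following the get_elapsed_time tests above: the assertions only pin down the observable contract, namely that the per-file elapsed time is max(timeStamp) - min(timeStamp) taken from metadata["file_statistics"], and that aggregate=True sums these values. The sketch below restates that contract as a standalone helper under those assumptions; elapsed_time_from_stats is a hypothetical name and the dictionary layout simply mirrors what the tests mock, so it should not be read as the actual CFELLoader.get_elapsed_time implementation.

from __future__ import annotations


def elapsed_time_from_stats(
    file_statistics: dict,
    fids: list[int],
    aggregate: bool = True,
    df_key: str = "timed",
    ts_col: str = "timeStamp",
) -> float | list[float]:
    """Derive elapsed times from min/max timestamp statistics, as the tests assert."""
    elapsed = []
    for fid in fids:
        try:
            cols = file_statistics[df_key][str(fid)]["columns"][ts_col]
        except KeyError as exc:
            # The loader tests expect a KeyError when statistics are incomplete.
            raise KeyError(f"Timestamp metadata missing for fid={fid}") from exc
        elapsed.append(cols["max"] - cols["min"])
    return sum(elapsed) if aggregate else elapsed


# Usage, mirroring the mocked metadata in test_get_elapsed_time_fid:
stats = {
    "timed": {
        "0": {"columns": {"timeStamp": {"min": 10, "max": 20}}},
        "1": {"columns": {"timeStamp": {"min": 20, "max": 30}}},
    },
}
assert elapsed_time_from_stats(stats, fids=[0, 1], aggregate=True) == 20
assert elapsed_time_from_stats(stats, fids=[0, 1], aggregate=False) == [10, 10]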