# Source code for ecgprocess.process_xml

'''
A module for extracting metadata, median beats, and raw waveforms from ECG
XML files, allowing for XML validation.

This module provides an API through a reader class, which maps ECG data from
XML files to class attributes. These attributes can be programmatically accessed
and further processed by downstream ECGprocess modules or external programs
leveraging the API.
'''

import pathlib
import warnings
import numpy as np
import ecgprocess.utils.reader_tools as reader_utils
import ecgprocess.utils.ecg_tools as ecg_utils
from lxml import etree
from ecgprocess.utils.reader_tools import(
    BaseReader,
)
from dataclasses import dataclass, field
from typing import (
    Optional, Self, Any, Literal,
)
from ecgprocess.utils.general import(
    ManagedProperty,
)
from ecgprocess.utils.config_tools import(
    ConfigParser,
)
from ecgprocess.errors import (
    is_type,
    _check_readable,
    Warn_MSG,
    Error_MSG,
)
from ecgprocess.constants import (
    ProcessXMLNames as PXMLNam,
    CoreData as Core,
)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# constants
# Short aliases for the nested `CoreData` containers used in this module:
# `CTypes` names the config sections / output attributes, `CMeta` the
# metadata keys, and `CProc` the processing keys and property names.
CTypes, CLeads, CMeta, CProc = (
    Core.DataTypes,
    Core.Leads,
    Core.MetaData,
    Core.ProcessingData,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@dataclass
class ECGXMLReader(BaseReader):
    """
    Processes an XML file containing ECG data and extracts the metadata,
    median beats, and raw waveforms.

    Parameters
    ----------
    augment_leads : `bool`, default `False`
        Whether the augmented leads should be calculated if these are not
        already available in the source file.
    resample_500 : `bool`, default `True`
        Whether to resample the ECG to a frequency of 500 Hertz. Note this
        will internally calculate the ECG duration as the number of samples
        divided by the sampling frequency. For the duration to be in seconds
        the sampling frequency should be expressed per second (Hertz), not
        per millisecond.

    Attributes
    ----------
    augment_leads : `bool`
        Whether the augmented leads should be calculated if unavailable.
    resample_500 : `bool`
        Whether the ECG should be resampled to a 500 Hertz frequency.

    Methods
    -------
    extract(config, bits, skip_empty, parse_numeric, **kwargs)
        Processes the XML file content applying optional lead augmentation
        and resampling. The XML content will be mapped to class attributes.
    """
    # #### properties
    tags = ManagedProperty(CProc.TAGS, list)
    raw_data = ManagedProperty(CProc.RAW, dict)
    _as_array:bool = True
    # #### parameters, with defaults
    augment_leads:bool = field(default=False)
    resample_500:bool = field(default=True)
    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    def __post_init__(self):
        """Validate that both constructor flags are booleans."""
        is_type(getattr(self, PXMLNam.AUG_LEADS), bool)
        is_type(getattr(self, PXMLNam.RESAMPLE), bool)
    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    def __call__(self, path:str, schema:str | None = None,
                 verbose:bool=False, **kwargs:Optional[Any],
                 ) -> Self:
        """
        Reads an `.xml` file containing ECG readings, optionally validates
        this based on a .xsd schema, and maps the XML file to a flat
        dictionary.

        Parameters
        ----------
        path : `str`
            The path to the .xml file.
        schema : `str`, default `NoneType`
            A path to an XML schema which will be used to validate the XML
            file against.
        verbose : `bool`, default `False`
            Whether warnings and process info should be printed.
        **kwargs
            Any keyword arguments passed to flatten_dict.

        Attributes
        ----------
        tags : `list` [`str`]
            A list of strings with parsed tags matching the `raw_data` keys.
        raw_data : `dict` [`str`, `any`]
            The raw parsed data.
        verbose : `bool`
            The supplied `verbose` flag, stored for use by `extract`.

        Returns
        -------
        self : `ECGXMLReader` instance
            Returns the class instance with updated attributes including the
            extracted XML data.

        Raises
        ------
        XMLValidationError
            If the XML file is not valid based on the supplied schema.
        """
        # #### check input
        is_type(path, (pathlib.PosixPath, pathlib.WindowsPath, str))
        is_type(schema, (type(None), pathlib.PosixPath, pathlib.WindowsPath,
                         str))
        is_type(verbose, bool)
        # #### assign to self
        self.verbose = verbose
        # #### confirm file is readable
        _check_readable(path)
        # #### validate XML
        if schema is not None:
            _check_readable(schema)
            parsed_xml = reader_utils.validate_xml(
                xml_path=path, xsd_path=schema, verbose=verbose)
        else:
            # no schema supplied: parse the file without validation
            with open(path, 'rb') as xml:
                parsed_xml = etree.parse(xml)
        # map to flatten_dict
        xml_dict = reader_utils.flatten_dict(
            reader_utils.xml_to_dict(parsed_xml),
            **kwargs,
        )
        # ### store keys and data
        # the managed properties are looked up on the class so that their
        # dedicated setter can be used
        getattr(type(self), CProc.RAW).set_with_setter(self, xml_dict)
        getattr(type(self), CProc.TAGS).set_with_setter(
            self, list(xml_dict.keys()))
        # ### return
        return self
    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    def extract(self, config:ConfigParser, bits:np.dtype | None=None,
                skip_empty:bool=True, parse_numeric:bool=True,
                **kwargs:Optional[Any]) -> Self:
        """
        Processes the raw ECG data and assigns these to class attributes,
        performing resampling and lead augmentation if requested.

        Parameters
        ----------
        config : `ConfigParser`
            A class instance of a parsed configuration file, mapping the XML
            content to class attributes. Specifically this should include
            dictionary attributes `MetaData`, `WaveForms`, `MedianBeats`,
            `OtherData`. The `MetaData` includes some privileged keys
            including essential information to describe an ECG instance, as
            well as non-privileged information. The difference between
            `OtherData` and `MetaData` is the way it is processed by other
            functions or methods with the `OtherData` processed without
            strong checks on its content. `WaveForms` and `MedianBeats`
            simply include the lead mappings. Please refer to the
            `constants.CoreData` class for the specifics.
        bits : `np.dtype`, default `None`
            np.array bits passed to numpy.array dtype.
        skip_empty : `bool`, default `True`
            Whether empty tags should be skipped or throw an error.
        parse_numeric : `bool`, default `True`
            Whether to check for numeric data accidentally recorded as string
            and try to parse these to int or float depending on the presence
            of a decimal separator.
        **kwargs
            The keyword arguments for reader_tools.get_ecg_data. For the
            waveforms and median beats `as_array` is set internally, so
            supplying it here raises a `TypeError`.

        Attributes
        ----------
        MetaData : `dict` [`str`, `any`]
            ECG metadata.
        Waveforms : `dict` [`str`, `np.array`]
            The lead specific ECG waveforms.
        MedianBeats : `dict` [`str`, `np.array`]
            The lead specific ECG median beats.
        OtherData : `dict` [`str`, `any`]
            Other data.

        Returns
        -------
        self : `ECGXMLReader` instance
            Returns the class instance with updated attributes including the
            extracted XML data.

        Raises
        ------
        KeyError
            If resampling is requested but the sampling frequency is absent
            from the metadata or `None`.
        AttributeError
            If resampling is needed, `skip_empty` is `False`, and all twelve
            waveform or median beat leads are missing.
        """
        is_type(config, ConfigParser)
        is_type(skip_empty, bool)
        is_type(parse_numeric, bool)
        # update kwargs, explicitly supplied kwargs take precedence
        kwargs = {
            'parse_numeric': parse_numeric,
            'skip_empty': skip_empty,
            **kwargs,
        }
        # #### get the configs
        md_cnf = config.get_section(CTypes.MetaData)
        wf_cnf = config.get_section(CTypes.WaveForms)
        mb_cnf = config.get_section(CTypes.MedianBeats)
        od_cnf = config.get_section(CTypes.OtherData)
        # #### extract metadata
        meta_data, meta_missing = reader_utils.get_ecg_data(
            getattr(self, CProc.RAW), config=md_cnf, **kwargs,
        )
        # #### extract waveforms
        wave_data, wave_missing = reader_utils.get_ecg_data(
            getattr(self, CProc.RAW), config=wf_cnf,
            as_array=self._as_array, bits=bits, **kwargs,
        )
        # #### extract median beats
        median_data, median_missing = reader_utils.get_ecg_data(
            getattr(self, CProc.RAW), config=mb_cnf,
            as_array=self._as_array, bits=bits, **kwargs,
        )
        # #### extract other data
        other_data, other_missing = reader_utils.get_ecg_data(
            getattr(self, CProc.RAW), config=od_cnf, **kwargs,
        )
        # #### resample to 500hz
        # first calculate the duration: number of samples / sampling frequency
        try:
            meta_data[CProc.Duration] = (
                meta_data[CMeta.SN_W] / meta_data[CMeta.SF]
            )
        except (KeyError, TypeError):
            # Either value is absent (KeyError) or not numeric, e.g. None
            # (TypeError). An absent sampling frequency is reported with a
            # dedicated message below when resampling is requested.
            meta_data[CProc.Duration] = None
        # only run when the parameter is True, SF is not 500 and there
        # is a duration
        meta_data[CProc.SF_NEW] = None
        if getattr(self, PXMLNam.RESAMPLE):
            # confirm SF is present
            if meta_data.get(CMeta.SF) is None:
                raise KeyError(f'`{CMeta.SF}` is necessary to resample '
                               'the ECG signal.')
            # if it is, check whether there is a need to resample the signals.
            if (int(meta_data[CMeta.SF]) != 500
                    and meta_data[CProc.Duration] is not None):
                # set the new SF
                meta_data[CProc.SF_NEW] = 500
                # confirm there is waveform data
                # NOTE(review): 12 is presumably the number of leads in a
                # standard ECG; a config listing a different number of leads
                # never takes the "all missing" branch -- confirm.
                if len(wave_missing) == 12:
                    if not skip_empty:
                        raise AttributeError(
                            Error_MSG.MISSING_SIGNAL.format('waveform'))
                else:
                    wave_data = ecg_utils.resampling_500hz(
                        wave_data, duration=meta_data[CProc.Duration])
                # confirm there is median beats data
                if len(median_missing) == 12:
                    if not skip_empty:
                        raise AttributeError(
                            Error_MSG.MISSING_SIGNAL.format('median beat'))
                else:
                    median_data = ecg_utils.resampling_500hz(
                        median_data, median=True)
        # ### See if we need to get the augmented leads
        # NOTE(review): the median beats are augmented based on missing
        # *waveform* leads, not on `median_missing` -- confirm this is
        # intended.
        if getattr(self, PXMLNam.AUG_LEADS) and len(wave_missing) > 0:
            wave_data = ecg_utils.get_limb_leads(wave_data)
            median_data = ecg_utils.get_limb_leads(median_data)
        # ### do we print missing tags
        if self.verbose:
            missing_tags = [
                tag for tag in (
                    meta_missing + wave_missing + median_missing +
                    other_missing
                ) if tag is not None
            ]
            if len(missing_tags) > 0:
                warnings.warn(Warn_MSG.MISSING_TAG.format(missing_tags))
        # #### assign stuff to attributes
        setattr(self, CTypes.MetaData, meta_data)
        setattr(self, CTypes.OtherData, other_data)
        setattr(self, CTypes.WaveForms, wave_data)
        setattr(self, CTypes.MedianBeats, median_data)
        # #### return stuff
        return self