diff --git a/pipeline/beam_etl/helpers.py b/pipeline/beam_etl/helpers.py index dd25889..1b6f57e 100644 --- a/pipeline/beam_etl/helpers.py +++ b/pipeline/beam_etl/helpers.py @@ -1,14 +1,15 @@ +import logging import xml.etree.ElementTree as ET + from typing import Dict -def iter_parse(root: ET.Element) -> Dict[str,str]: - """Recursively parse the XML tree into a dictionary - Each key/value pair is inside it's own
tag and - the key inside a tag. - The fields that I believe are compulsory (TCIN, UPC and Origin) - are only nested one level deep, while the rest of fields seem - to be always nested two levels deep. But parsing it recursively - helps generalise both cases.""" + +def iter_parse(root: ET.Element) -> Dict[str, str]: + """Recursively parse the XML tree into a dictionary Each key/value pair is + inside its own
tag and the key inside a tag. + The fields that I believe are compulsory (TCIN, UPC and Origin) are only + nested one level deep, while the rest of fields seem to be always nested + two levels deep. But parsing it recursively helps generalise both cases.""" spec_dict = {} for child in root: @@ -22,10 +23,11 @@ def iter_parse(root: ET.Element) -> Dict[str,str]: spec_dict.update(iter_parse(child)) return spec_dict -def parse_raw_specs(raw_specs: str) -> Dict[str,str]: - """Parse a raw specifications XML string into a dictionary - This involves first recursively parsing the XML tree and then - renaming the key values""" + +def parse_raw_specs(raw_specs: str) -> Dict[str, str]: + """Parse a raw specifications XML string into a dictionary. + This involves first recursively parsing the XML tree and then renaming + the key values""" fields_mapping = { "Material": "materials", @@ -37,7 +39,13 @@ def parse_raw_specs(raw_specs: str) -> Dict[str,str]: "TCIN": "tcin", "Origin": "origin", } - xml_root = ET.fromstring(raw_specs) + + try: + xml_root = ET.fromstring(raw_specs) + except ET.ParseError: + logging.error("error parsing xml string: \n%s", raw_specs) + return {} + parsed = iter_parse(xml_root) specs_dict = { fields_mapping[key]: value