diff --git a/pipeline/beam_etl/helpers/parse_xml.py b/pipeline/beam_etl/helpers/parse_xml.py index 266c4bf..1cab2d8 100644 --- a/pipeline/beam_etl/helpers/parse_xml.py +++ b/pipeline/beam_etl/helpers/parse_xml.py @@ -6,6 +6,16 @@ it requires more effort to parse. First I parse the xml into a dictionary. Then, I standardize the found keys into some keys I expect. + +Notes: +Looks like "dimensions" can be "Dimensions (Overall)", "Dimensions" or other +things like "Assembled Dimensions" or "Piece X Dimensions". But these latter two +options are incomplete (lack the height), harder to parse and rare enough that +I'll just drop them. + +Package Quantity and Number of Pieces are never found together. I will assume +they refer to the same thing. + """ import logging @@ -13,6 +23,17 @@ import xml.etree.ElementTree as ET from typing import Dict +FIELDS_MAPPING = { + "Material": "materials", + "Package Quantity": "packaging", + "Number of Pieces": "packaging", + "Dimensions (Overall)": "dimensions", + "Dimensions": "dimensions", + "Weight": "weight", + "TCIN": "tcin", + "Origin": "origin", +} + def iter_parse(root: ET.Element) -> Dict[str, str]: """Recursively parse the XML tree into a dictionary Each key/value pair is @@ -39,17 +60,6 @@ def parse_raw_specs(raw_specs: str) -> Dict[str, str]: This involves first recursively parsing the XML tree and then renaming the key values""" - fields_mapping = { - "Material": "materials", - "Package Quantity": "packaging", - "Number of Pieces": "packaging", - "Dimensions (Overall)": "dimensions", - "Dimensions": "dimensions", - "Weight": "weight", - "TCIN": "tcin", - "Origin": "origin", - } - try: xml_root = ET.fromstring(raw_specs) except ET.ParseError: @@ -58,8 +68,8 @@ def parse_raw_specs(raw_specs: str) -> Dict[str, str]: parsed = iter_parse(xml_root) specs_dict = { - fields_mapping[key]: value + FIELDS_MAPPING[key]: value for key, value in parsed.items() - if key in fields_mapping + if key in FIELDS_MAPPING } return specs_dict