import logging
import xml.etree.ElementTree as ET
from typing import Dict

# Maps the labels found in the raw specifications markup to the output key
# names. Several labels deliberately share one output name.
_FIELDS_MAPPING: Dict[str, str] = {
    "Material": "materials",
    "Package Quantity": "packaging",
    "Number of Pieces": "packaging",
    "Dimensions (Overall)": "dimensions",
    "Dimensions": "dimensions",
    "Weight": "weight",
    "TCIN": "tcin",
    "Origin": "origin",
}


def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively parse the XML tree into a dictionary.

    Each key/value pair is expected inside its own ``<div>`` tag, with the
    key inside a ``<b>`` tag. The fields believed to be compulsory (TCIN,
    UPC and Origin) appear nested one level deep, while the remaining
    fields seem to always be nested two levels deep; parsing recursively
    handles both cases uniformly.

    Args:
        root: Element whose direct ``<div>`` children are inspected.

    Returns:
        A dictionary mapping each key (first text fragment of the entry,
        with leading/trailing ``:`` removed) to its value (the remaining
        text fragments joined together, with leading/trailing ``:``
        removed). Whitespace is left untouched.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue
        if not any(grandchild.tag == "b" for grandchild in child):
            # A wrapper <div> with no <b> key: descend into it.
            spec_dict.update(iter_parse(child))
            continue
        fragments = list(child.itertext())
        if not fragments:
            # An entry such as <div><b/></div> carries no text at all;
            # skip it instead of failing on the unpacking below.
            continue
        key, *values = fragments
        spec_dict[key.strip(":")] = "".join(values).strip(":")
    return spec_dict


def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
    """Parse a raw specifications XML string into a dictionary.

    The XML tree is first parsed recursively with ``iter_parse`` and the
    resulting keys are then renamed; keys without a known renaming are
    dropped. When several parsed keys are renamed to the same output name,
    the one appearing last in the parsed result wins.

    Args:
        raw_specs: XML string holding the specifications markup.

    Returns:
        The renamed specifications, or an empty dictionary (after logging
        an error) when ``raw_specs`` is not well-formed XML.
    """
    try:
        xml_root = ET.fromstring(raw_specs)
    except ET.ParseError:
        logging.error("error parsing xml string: \n%s", raw_specs)
        return {}
    parsed = iter_parse(xml_root)
    return {
        _FIELDS_MAPPING[key]: value
        for key, value in parsed.items()
        if key in _FIELDS_MAPPING
    }