"""Helper functions to parse the the raw_specifications xml string into a dictionary containing the values we are interested in. I decided to parse raw_speficications instead of using the "specifications" column because the raw data seemed to be cleaner and more consistent, even if it requires more effort to parse. First I parse the xml into a dictionary. Then, I standardize the found keys into some keys I expect. Notes: Looks like "dimenions" can be "Dimensions (Overall)", "Dimensions" or other things like "Assembled Dimensions" or "Piece X Dimensions". But this latter two options are incomplete (lack the height), harder to parse and rare enough that I'll just drop them Package Quantity and Number of Pieces are never found together. I will assume they refer to the same thing. """ import logging import xml.etree.ElementTree as ET from typing import Dict, Optional FIELDS_MAPPING = { "Material": "materials", "Package Quantity": "packaging", "Number of Pieces": "packaging", "Dimensions (Overall)": "dimensions", "Dimensions": "dimensions", "Weight": "weight", "TCIN": "tcin", "Origin": "origin", } def iter_parse(root: ET.Element) -> Dict[str, str]: """Recursively parse the XML tree into a dictionary Each key/value pair is inside its own