dayrize-usecase/pipeline/beam_etl/helpers/parse_xml.py

76 lines
2.5 KiB
Python

"""Helper functions to parse the the raw_specifications xml string into a
dictionary containing the values we are interested in.
I decided to parse raw_specifications instead of using the "specifications"
column because the raw data seemed to be cleaner and more consistent, even if
it requires more effort to parse.
First I parse the xml into a dictionary. Then, I standardize the found keys into
some keys I expect.
Notes:
Looks like "dimensions" can be "Dimensions (Overall)", "Dimensions" or other
things like "Assembled Dimensions" or "Piece X Dimensions". But these latter
two options are incomplete (lack the height), harder to parse and rare enough
that I'll just drop them.
Package Quantity and Number of Pieces are never found together. I will assume
they refer to the same thing.
"""
import logging
import xml.etree.ElementTree as ET
from typing import Dict
# Translation table from the labels found in the raw specifications markup to
# the standardized key each value is stored under. It is written grouped by
# target key so that labels treated as synonyms sit next to each other.
FIELDS_MAPPING = {
    raw_label: standard_key
    for standard_key, raw_labels in (
        ("materials", ("Material",)),
        ("packaging", ("Package Quantity", "Number of Pieces")),
        ("dimensions", ("Dimensions (Overall)", "Dimensions")),
        ("weight", ("Weight",)),
        ("tcin", ("TCIN",)),
        ("origin", ("Origin",)),
    )
    for raw_label in raw_labels
}
def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively collect the key/value pairs found under ``root``.

    Only ``<div>`` children are inspected. A ``<div>`` that has a direct
    ``<b>`` child is read as one pair: its first text chunk is the key and the
    remaining text chunks, joined together, are the value. Any other ``<div>``
    is treated as a wrapper and searched recursively, so pairs are picked up
    whether they sit one level deep or several levels deep.

    Args:
        root: Element whose ``<div>`` descendants are scanned.

    Returns:
        Dictionary mapping each key to its value. Keys have surrounding
        whitespace and colons removed; values only have leading/trailing
        colons removed and are otherwise returned as found. When a key occurs
        more than once, the last occurrence in document order wins.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue
        if child.find("b") is None:
            # Wrapper <div>: the pairs live further down the tree.
            spec_dict.update(iter_parse(child))
            continue
        texts = list(child.itertext())
        if not texts:
            # A <div> such as <div><b/></div> carries no text at all;
            # unpacking it would raise ValueError, so skip it instead.
            continue
        key, *values = texts
        # Strip whitespace on both sides of the colon so that "Weight: " and
        # "Weight:" both produce the key "Weight".
        key = key.strip().strip(":").strip()
        spec_dict[key] = "".join(values).strip(":")
    return spec_dict
def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
    """Parse a raw specifications XML string into a standardized dictionary.

    The XML tree is first flattened into key/value pairs with ``iter_parse``;
    the keys listed in ``FIELDS_MAPPING`` are then renamed to their
    standardized names and every other key is dropped.

    Args:
        raw_specs: XML markup holding the specifications. ``None`` is
            tolerated and treated as "no specifications".

    Returns:
        Dictionary of standardized key to raw value. Empty when ``raw_specs``
        is ``None`` or is not well-formed XML; in the latter case the
        offending string is logged as an error.
    """
    if raw_specs is None:
        # ET.fromstring(None) raises TypeError, which the handler below does
        # not catch; a missing value simply yields no specifications.
        return {}
    try:
        xml_root = ET.fromstring(raw_specs)
    except ET.ParseError:
        logging.error("error parsing xml string: \n%s", raw_specs)
        return {}
    parsed = iter_parse(xml_root)
    # If several raw keys map to the same standardized key, the one that
    # appears last in ``parsed`` overwrites the earlier ones.
    return {
        FIELDS_MAPPING[key]: value
        for key, value in parsed.items()
        if key in FIELDS_MAPPING
    }