76 lines
2.5 KiB
Python
76 lines
2.5 KiB
Python
|
"""Helper functions to parse the raw_specifications XML string into a
|
||
|
dictionary containing the values we are interested in.
|
||
|
I decided to parse raw_specifications instead of using the "specifications"
|
||
|
column because the raw data seemed to be cleaner and more consistent, even if
|
||
|
it requires more effort to parse.
|
||
|
|
||
|
First I parse the xml into a dictionary. Then, I standardize the found keys into
|
||
|
some keys I expect.
|
||
|
|
||
|
Notes:
|
||
|
Looks like "dimensions" can be "Dimensions (Overall)", "Dimensions" or other
|
||
|
things like "Assembled Dimensions" or "Piece X Dimensions". But these latter two
|
||
|
options are incomplete (lack the height), harder to parse and rare enough that
|
||
|
I'll just drop them
|
||
|
|
||
|
Package Quantity and Number of Pieces are never found together. I will assume
|
||
|
they refer to the same thing.
|
||
|
|
||
|
"""
|
||
|
|
||
|
import logging
|
||
|
import xml.etree.ElementTree as ET
|
||
|
|
||
|
from typing import Dict, Optional
|
||
|
|
||
|
# Maps each raw label found in the specifications XML to the standardized key
# used in the parsed output. Several raw labels may share one standardized key
# (e.g. "Package Quantity" and "Number of Pieces" both become "packaging").
# Raw labels that are not listed here are dropped by parse_raw_specs.
FIELDS_MAPPING: Dict[str, str] = {
    raw_label: standard_key
    for standard_key, raw_labels in (
        ("materials", ("Material",)),
        ("packaging", ("Package Quantity", "Number of Pieces")),
        ("dimensions", ("Dimensions (Overall)", "Dimensions")),
        ("weight", ("Weight",)),
        ("tcin", ("TCIN",)),
        ("origin", ("Origin",)),
    )
    for raw_label in raw_labels
}
|
||
|
|
||
|
|
||
|
def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively parse the XML tree under ``root`` into a dictionary.

    Each key/value pair is inside its own <div> tag, with the key inside a
    <b> tag. A <div> with a direct <b> child is read as one key/value pair;
    any other <div> is treated as a container and parsed recursively.

    The fields that I believe are compulsory (TCIN, UPC and Origin) are only
    nested one level deep, while the rest of the fields seem to be always
    nested two levels deep. Parsing recursively generalises both cases.

    Args:
        root: Element whose <div> children are scanned. Children with any
            other tag are ignored.

    Returns:
        Dictionary mapping each key to its value. The key is the first text
        chunk of the <div> and the value is the remaining text chunks joined
        together; leading/trailing colons are removed from both, whitespace
        is left untouched. If a key appears more than once, the last
        occurrence wins. A <div> that has a <b> child but no text at all is
        skipped.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue

        if child.find("b") is None:
            # No direct <b> child: this <div> is only a wrapper, look inside.
            spec_dict.update(iter_parse(child))
            continue

        texts = list(child.itertext())
        if not texts:
            # e.g. <div><b/></div>: there is no key to record, and unpacking
            # an empty sequence would raise ValueError.
            continue

        key, *values = texts
        spec_dict[key.strip(":")] = "".join(values).strip(":")
    return spec_dict
|
||
|
|
||
|
|
||
|
def parse_raw_specs(raw_specs: str) -> Optional[Dict[str, str]]:
    """Turn a raw specifications XML string into a standardized dictionary.

    The XML tree is first flattened into raw key/value pairs with
    ``iter_parse``; the raw keys are then renamed through ``FIELDS_MAPPING``.
    Pairs whose raw key is not in ``FIELDS_MAPPING`` are discarded.

    Args:
        raw_specs: XML document as a string.

    Returns:
        Dictionary keyed by the standardized names, or ``None`` when the
        string is not well-formed XML (the failure is logged as an error).
    """
    try:
        tree_root = ET.fromstring(raw_specs)
    except ET.ParseError:
        logging.error("error parsing xml string: \n%s", raw_specs)
        return None

    renamed: Dict[str, str] = {}
    for raw_key, raw_value in iter_parse(tree_root).items():
        if raw_key in FIELDS_MAPPING:
            # When two raw keys share a standardized name, the later one
            # overwrites the earlier one.
            renamed[FIELDS_MAPPING[raw_key]] = raw_value
    return renamed
|