refactor: put helpers in a separate folder

main
Ricard Illa 2023-06-22 15:43:26 +02:00
parent 0b58d47acc
commit b3069d4ca2
2 changed files with 56 additions and 1 deletions

View File

@ -0,0 +1,55 @@
import logging
import xml.etree.ElementTree as ET
from typing import Dict
def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively parse the XML tree into a dictionary.

    Each key/value pair is inside its own <div> tag, with the key inside a
    <b> tag. The fields believed to be compulsory (TCIN, UPC and Origin) are
    only nested one level deep, while the rest of the fields seem to be
    always nested two levels deep. Parsing recursively generalises both
    cases.

    Args:
        root: Element whose direct <div> children are scanned. A <div> with
            a direct <b> child is treated as a key/value entry; any other
            <div> is descended into recursively. Non-<div> children are
            ignored.

    Returns:
        A dictionary mapping each key to its value. The key is the first
        text fragment found in the entry's <div>; the value is the
        concatenation of the remaining text fragments. Leading and trailing
        colons are stripped from both (whitespace is left untouched). If a
        key appears more than once, the last occurrence wins.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue
        if any(grandchild.tag == "b" for grandchild in child):
            fragments = list(child.itertext())
            if not fragments:
                # An entry such as <div><b/></div> carries no text at all;
                # unpacking it would raise ValueError, so skip it instead of
                # aborting the whole parse.
                continue
            key, *values = fragments
            spec_dict[key.strip(":")] = "".join(values).strip(":")
        else:
            spec_dict.update(iter_parse(child))
    return spec_dict
def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
    """Turn a raw specifications XML string into a dictionary.

    The XML tree is first flattened recursively with ``iter_parse``; the
    resulting keys are then renamed to their internal field names. Keys
    without a known internal name are dropped. If the string is not
    well-formed XML, the error is logged and an empty dictionary is
    returned.
    """
    # Source label -> internal field name. Several labels may share one
    # internal name; when more than one is present, the later one in the
    # parsed output overrides the earlier one.
    renames = {
        "Material": "materials",
        "Package Quantity": "packaging",
        "Number of Pieces": "packaging",
        "Dimensions (Overall)": "dimensions",
        "Dimensions": "dimensions",
        "Weight": "weight",
        "TCIN": "tcin",
        "Origin": "origin",
    }

    try:
        tree_root = ET.fromstring(raw_specs)
    except ET.ParseError:
        logging.error("error parsing xml string: \n%s", raw_specs)
        return {}

    renamed: Dict[str, str] = {}
    for label, text in iter_parse(tree_root).items():
        if label not in renames:
            continue
        renamed[renames[label]] = text
    return renamed

View File

@ -2,7 +2,7 @@
import xml.etree.ElementTree as ET
from helpers import parse_raw_specs, iter_parse
from helpers.parse_xml import parse_raw_specs, iter_parse
def test_parse_raw_specs0():