# pipeline/beam_etl/helpers/parse_xml.py
#
# Introduced by commit b3069d4ca2c61880a66144410454fd21b6a0a9b7
# ("refactor: put helpers in a separate folder", Ricard Illa, 2023-06-22).
# The same commit renames tests/test_parse_raw_specs.py to
# tests/test_parse_xml.py, which now imports these helpers with
# `from helpers.parse_xml import parse_raw_specs, iter_parse`.

import logging
import xml.etree.ElementTree as ET

from typing import Dict


# Maps the human-readable spec labels found in the raw XML to the output
# field names. Several labels deliberately share one output name; when more
# than one of them is present, the one appearing last in the document wins.
_FIELDS_MAPPING: Dict[str, str] = {
    "Material": "materials",
    "Package Quantity": "packaging",
    "Number of Pieces": "packaging",
    "Dimensions (Overall)": "dimensions",
    "Dimensions": "dimensions",
    "Weight": "weight",
    "TCIN": "tcin",
    "Origin": "origin",
}


def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively parse the XML tree into a dictionary.

    Each key/value pair is inside its own <div> tag and the key inside a
    <b> tag.
    The fields that I believe are compulsory (TCIN, UPC and Origin) are only
    nested one level deep, while the rest of fields seem to be always nested
    two levels deep. But parsing it recursively helps generalise both cases.

    Args:
        root: Element whose <div> children (at any depth) hold the pairs.

    Returns:
        A dict mapping each key to its value. Colons are stripped from both
        ends of the key and of the value; whitespace is left untouched.
    """

    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue

        if any(grandchild.tag == "b" for grandchild in child):
            # The first text fragment is the key; everything after it
            # belongs to the value.
            fragments = list(child.itertext())
            if not fragments:
                # A <div> with an empty <b/> and no text (e.g.
                # "<div><b/></div>") carries no key; skip it instead of
                # failing on the unpacking below.
                continue
            key, *values = fragments
            spec_dict[key.strip(":")] = "".join(values).strip(":")
        else:
            # No <b> directly inside this <div>: the pairs are nested deeper.
            spec_dict.update(iter_parse(child))
    return spec_dict


def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
    """Parse a raw specifications XML string into a dictionary.

    This involves first recursively parsing the XML tree and then renaming
    the keys. Keys without an entry in the fields mapping are dropped.

    Args:
        raw_specs: XML document as a string.

    Returns:
        The renamed key/value pairs, or an empty dict when ``raw_specs`` is
        not well-formed XML (the failure is logged, not raised).
    """

    try:
        xml_root = ET.fromstring(raw_specs)
    except ET.ParseError:
        logging.error("error parsing xml string: \n%s", raw_specs)
        return {}

    parsed = iter_parse(xml_root)
    return {
        _FIELDS_MAPPING[key]: value
        for key, value in parsed.items()
        if key in _FIELDS_MAPPING
    }