From b3069d4ca2c61880a66144410454fd21b6a0a9b7 Mon Sep 17 00:00:00 2001
From: Ricard Illa <ricard@trkkn.com>
Date: Thu, 22 Jun 2023 15:43:26 +0200
Subject: [PATCH] refactor: put helpers in a separate folder

---
 pipeline/beam_etl/helpers/parse_xml.py        | 55 +++++++++++++++++++
 ...t_parse_raw_specs.py => test_parse_xml.py} |  2 +-
 2 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 pipeline/beam_etl/helpers/parse_xml.py
 rename pipeline/beam_etl/tests/{test_parse_raw_specs.py => test_parse_xml.py} (99%)
diff --git a/pipeline/beam_etl/helpers/parse_xml.py b/pipeline/beam_etl/helpers/parse_xml.py
new file mode 100644
index 0000000..1b6f57e
--- /dev/null
+++ b/pipeline/beam_etl/helpers/parse_xml.py
@@ -0,0 +1,55 @@
+import logging
+import xml.etree.ElementTree as ET
+
+from typing import Dict
+
+
+def iter_parse(root: ET.Element) -> Dict[str, str]:
+    """Recursively parse the <div> children of root into a flat dictionary.
+
+    A <div> with a direct <b> child is one entry: its first text chunk is the
+    key and the rest, joined, is the value (":" stripped from both ends); a
+    textless entry is skipped. Any other <div> is recursed into, which covers
+    both the one-level (TCIN, UPC, Origin) and the two-level nesting seen."""
+    spec_dict: Dict[str, str] = {}
+    for child in root:
+        if child.tag != "div":
+            continue
+        if child.find("b") is None:
+            spec_dict.update(iter_parse(child))
+            continue
+        chunks = list(child.itertext())
+        if chunks:  # <div><b/></div> has no text: skip, don't raise ValueError
+            spec_dict[chunks[0].strip(":")] = "".join(chunks[1:]).strip(":")
+    return spec_dict
+
+
+def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
+    """Turn a raw specifications XML string into a dictionary of known fields.
+
+    The XML tree is walked with iter_parse; recognised labels are then renamed
+    to their output field names and every other label is dropped. If the
+    string is not well-formed XML, the error is logged and {} is returned."""
+    try:
+        tree = ET.fromstring(raw_specs)
+    except ET.ParseError:
+        logging.error("error parsing xml string: \n%s", raw_specs)
+        return {}
+
+    # Source label -> output field name; several labels may share one field.
+    renames = {
+        "Material": "materials",
+        "Package Quantity": "packaging",
+        "Number of Pieces": "packaging",
+        "Dimensions (Overall)": "dimensions",
+        "Dimensions": "dimensions",
+        "Weight": "weight",
+        "TCIN": "tcin",
+        "Origin": "origin",
+    }
+    renamed = {}
+    for label, text in iter_parse(tree).items():
+        # Later labels overwrite earlier ones that share an output field.
+        if label in renames:
+            renamed[renames[label]] = text
+    return renamed
diff --git a/pipeline/beam_etl/tests/test_parse_raw_specs.py b/pipeline/beam_etl/tests/test_parse_xml.py
similarity index 99%
rename from pipeline/beam_etl/tests/test_parse_raw_specs.py
rename to pipeline/beam_etl/tests/test_parse_xml.py
index fe3b5ef..2a276cb 100644
--- a/pipeline/beam_etl/tests/test_parse_raw_specs.py
+++ b/pipeline/beam_etl/tests/test_parse_xml.py
@@ -2,7 +2,7 @@
 
 
 import xml.etree.ElementTree as ET
-from helpers import parse_raw_specs, iter_parse
+from helpers.parse_xml import parse_raw_specs, iter_parse
 
 
 def test_parse_raw_specs0():