diff --git a/pipeline/beam_etl/helpers/parse_xml.py b/pipeline/beam_etl/helpers/parse_xml.py
new file mode 100644
index 0000000..1b6f57e
--- /dev/null
+++ b/pipeline/beam_etl/helpers/parse_xml.py
@@ -0,0 +1,55 @@
+import logging
+import xml.etree.ElementTree as ET
+
+from typing import Dict
+
+
def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively collect key/value specifications from an XML tree.

    Each key/value pair lives inside its own ``<div>`` tag, with the key
    wrapped in a ``<b>`` tag. For such a div, the first text fragment is
    taken as the key and the remaining fragments, concatenated, as the
    value. A ``<div>`` without a direct ``<b>`` child is treated as a
    wrapper and parsed recursively, so pairs nested one level deep and
    pairs nested several levels deep are handled by the same code path.
    Children that are not ``<div>`` elements are ignored.

    According to the original author, the fields believed to be compulsory
    (TCIN, UPC and Origin) are nested one level deep while the remaining
    fields appear two levels deep; the recursion covers both layouts.

    Leading and trailing ":" characters are stripped from both key and
    value. Whitespace is left untouched.

    Args:
        root: Element whose children are scanned for key/value divs.

    Returns:
        Mapping from key text to value text. When the same key appears more
        than once, the last occurrence in document order wins.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue

        if not any(sub.tag == "b" for sub in child):
            # Wrapper div: the key/value divs are nested further down.
            spec_dict.update(iter_parse(child))
            continue

        fragments = list(child.itertext())
        if not fragments:
            # e.g. <div><b/></div>: no text at all, so there is no key to
            # record. Unpacking an empty sequence would raise ValueError.
            continue

        key, *values = fragments
        spec_dict[key.strip(":")] = "".join(values).strip(":")
    return spec_dict
+
+
def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
    """Parse a raw specifications XML string into a dictionary.

    The XML tree is first parsed recursively with ``iter_parse`` and the
    resulting keys are then renamed according to ``fields_mapping``. Keys
    that are not listed in the mapping are dropped.

    Several source labels map to the same output key ("Package Quantity"
    and "Number of Pieces" -> "packaging"; "Dimensions (Overall)" and
    "Dimensions" -> "dimensions"). If more than one of them is present,
    the one appearing later in the parsed result overwrites the earlier.

    Args:
        raw_specs: XML document as a string.

    Returns:
        Dictionary with the renamed keys, or an empty dictionary when the
        input cannot be parsed (the failure is logged as an error).

    NOTE(review): ``xml.etree.ElementTree`` is not hardened against
    maliciously constructed documents; confirm the input source is trusted.
    """

    fields_mapping = {
        "Material": "materials",
        "Package Quantity": "packaging",
        "Number of Pieces": "packaging",
        "Dimensions (Overall)": "dimensions",
        "Dimensions": "dimensions",
        "Weight": "weight",
        "TCIN": "tcin",
        "Origin": "origin",
    }

    try:
        xml_root = ET.fromstring(raw_specs)
    except (ET.ParseError, TypeError):
        # ParseError: malformed or empty XML. TypeError: a non-string
        # payload such as None. Both get the same logged, empty fallback
        # instead of crashing the caller.
        logging.error("error parsing xml string: \n%s", raw_specs)
        return {}

    parsed = iter_parse(xml_root)
    specs_dict = {
        fields_mapping[key]: value
        for key, value in parsed.items()
        if key in fields_mapping
    }
    return specs_dict
diff --git a/pipeline/beam_etl/tests/test_parse_raw_specs.py b/pipeline/beam_etl/tests/test_parse_xml.py
similarity index 99%
rename from pipeline/beam_etl/tests/test_parse_raw_specs.py
rename to pipeline/beam_etl/tests/test_parse_xml.py
index fe3b5ef..2a276cb 100644
--- a/pipeline/beam_etl/tests/test_parse_raw_specs.py
+++ b/pipeline/beam_etl/tests/test_parse_xml.py
@@ -2,7 +2,7 @@
import xml.etree.ElementTree as ET
-from helpers import parse_raw_specs, iter_parse
+from helpers.parse_xml import parse_raw_specs, iter_parse
def test_parse_raw_specs0():