import logging
import xml.etree.ElementTree as ET

from typing import Dict, Optional, Tuple

# Maps the labels found in the raw specifications markup to the output field
# names. Several labels share one output name; when more than one of them is
# present in the same document, the one iterated last wins.
_FIELDS_MAPPING: Dict[str, str] = {
    "Material": "materials",
    "Package Quantity": "packaging",
    "Number of Pieces": "packaging",
    "Dimensions (Overall)": "dimensions",
    "Dimensions": "dimensions",
    "Weight": "weight",
    "TCIN": "tcin",
    "Origin": "origin",
}


def _split_key_value(div: ET.Element) -> Optional[Tuple[str, str]]:
    """Split the text of a ``<div>`` holding a ``<b>`` label into (key, value).

    The first text chunk that is not pure whitespace is taken as the key and
    every chunk after it is concatenated into the value.

    Returns:
        ``(key, value)``, or ``None`` when the element contains no text other
        than whitespace.
    """
    chunks = list(div.itertext())

    # Pretty-printed markup puts indentation/newlines before the <b> tag;
    # skipping it keeps that whitespace from being mistaken for the key.
    first = next((i for i, text in enumerate(chunks) if text.strip()), None)
    if first is None:
        # e.g. <div><b/></div>: nothing to unpack, so there is no entry.
        return None

    # The colon may sit inside the label ("TCIN:") and may be followed or
    # preceded by whitespace, so trim whitespace on both sides of the colon.
    key = chunks[first].strip().strip(":").strip()
    # Only colons are trimmed from the value; surrounding whitespace is kept
    # so values for already well-formed input stay exactly as before.
    value = "".join(chunks[first + 1:]).strip(":")
    return key, value


def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively parse the XML tree into a dictionary.

    Each key/value pair is expected inside its own ``<div>`` tag, with the
    key inside a ``<b>`` tag that is a direct child of that ``<div>``.
    A ``<div>`` without a direct ``<b>`` child is treated as a container and
    parsed recursively, so pairs nested one level deep and pairs nested
    several levels deep are handled the same way. Children of ``root`` that
    are not ``<div>`` elements are ignored.

    Args:
        root: Element whose ``<div>`` children are inspected.

    Returns:
        Mapping from label (surrounding colons/whitespace removed) to the
        text following the label (surrounding colons removed). If a label
        occurs more than once, the last occurrence wins.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue

        if not any(grandchild.tag == "b" for grandchild in child):
            # Container <div>: the actual pairs live further down.
            spec_dict.update(iter_parse(child))
            continue

        pair = _split_key_value(child)
        if pair is not None:
            key, value = pair
            spec_dict[key] = value
    return spec_dict


def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
    """Parse a raw specifications XML string into a dictionary.

    The XML tree is parsed recursively with :func:`iter_parse`, then the
    labels are renamed according to ``_FIELDS_MAPPING``; labels that are not
    in the mapping are dropped.

    Args:
        raw_specs: XML document as a string.

    Returns:
        Mapping from output field name to raw value. An empty dictionary is
        returned (and an error is logged) when ``raw_specs`` is not
        well-formed XML.
    """
    try:
        xml_root = ET.fromstring(raw_specs)
    except ET.ParseError:
        logging.error("error parsing xml string: \n%s", raw_specs)
        return {}

    parsed = iter_parse(xml_root)
    return {
        _FIELDS_MAPPING[key]: value
        for key, value in parsed.items()
        if key in _FIELDS_MAPPING
    }