From 725114805aefd9ff98493e56b79283f0892ac446 Mon Sep 17 00:00:00 2001
From: Ricard Illa
Date: Thu, 22 Jun 2023 09:40:26 +0200
Subject: [PATCH] feat: added parse_raw_specification

---
 pipeline/beam_etl/helpers.py                  | 57 +++++++++++++++++++
 pipeline/beam_etl/justfile                    |  3 +
 .../beam_etl/tests/test_parse_raw_specs.py    | 10 ++++
 3 files changed, 70 insertions(+)
 create mode 100644 pipeline/beam_etl/helpers.py
 create mode 100644 pipeline/beam_etl/tests/test_parse_raw_specs.py

diff --git a/pipeline/beam_etl/helpers.py b/pipeline/beam_etl/helpers.py
new file mode 100644
index 0000000..dd25889
--- /dev/null
+++ b/pipeline/beam_etl/helpers.py
@@ -0,0 +1,57 @@
+import xml.etree.ElementTree as ET
+from typing import Dict
+
+def iter_parse(root: ET.Element) -> Dict[str,str]:
+    """Recursively parse the XML tree into a dictionary.
+
+    Each key/value pair is inside its own <div> tag, with
+    the key inside a <b> tag.
+    The fields that seem to be compulsory (TCIN, UPC and Origin)
+    are only nested one level deep, while the rest of fields seem
+    to be always nested two levels deep. But parsing it recursively
+    helps generalise both cases.
+    """
+
+    spec_dict = {}
+    for child in root:
+        if child.tag != "div":
+            continue
+        if child.find("b") is None:
+            # No key at this level: the pairs are nested further down.
+            spec_dict.update(iter_parse(child))
+            continue
+        # The first text fragment is the key, the rest form the value.
+        fragments = list(child.itertext())
+        if not fragments:
+            # An empty <b/> with no text around it holds no pair.
+            continue
+        key, *values = fragments
+        spec_dict[key.strip(":")] = "".join(values).strip(":")
+    return spec_dict
+
+def parse_raw_specs(raw_specs: str) -> Dict[str,str]:
+    """Parse a raw specifications XML string into a dictionary.
+
+    This involves first recursively parsing the XML tree and then
+    renaming the known keys; keys without a mapping are dropped.
+    """
+
+    # Several raw keys share a target name; the last one parsed wins.
+    fields_mapping = {
+        "Material": "materials",
+        "Package Quantity": "packaging",
+        "Number of Pieces": "packaging",
+        "Dimensions (Overall)": "dimensions",
+        "Dimensions": "dimensions",
+        "Weight": "weight",
+        "TCIN": "tcin",
+        "Origin": "origin",
+    }
+    xml_root = ET.fromstring(raw_specs)
+    parsed = iter_parse(xml_root)
+    specs_dict = {
+        fields_mapping[key]: value
+        for key, value in parsed.items()
+        if key in fields_mapping
+    }
+    return specs_dict
diff --git a/pipeline/beam_etl/justfile b/pipeline/beam_etl/justfile
index cfb0c3a..2ef45fa 100644
--- a/pipeline/beam_etl/justfile
+++ b/pipeline/beam_etl/justfile
@@ -2,3 +2,6 @@ input := "../../data/large_target_store_products_dataset_sample - large_target_s
 
 run:
     python -m main --input "{{ input }}"
+
+test:
+    python -m pytest
diff --git a/pipeline/beam_etl/tests/test_parse_raw_specs.py b/pipeline/beam_etl/tests/test_parse_raw_specs.py
new file mode 100644
index 0000000..6f4b4c4
--- /dev/null
+++ b/pipeline/beam_etl/tests/test_parse_raw_specs.py
@@ -0,0 +1,10 @@
+from helpers import parse_raw_specs
+
+
+def test_parse_raw_specs():
+    xml_str = """
+<div><h3>Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div></div><div><b>TCIN</b>: 81917300</div><div><b>UPC</b>: 840391145528</div><div><b>Origin</b>: imported</div><p>The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p>We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item\'s label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div>
+    """
+
+    expected = {"tcin": " 81917300", "origin": " imported"}
+    assert parse_raw_specs(xml_str) == expected