From e716fc1cd3cd9f008df030e6d0ea3158b817e93c Mon Sep 17 00:00:00 2001
From: Ricard Illa
Date: Fri, 23 Jun 2023 10:05:24 +0200
Subject: [PATCH] feat: added parsing dimensions

---
 pipeline/beam_etl/helpers/dimensions.py    |  75 ++++++++++++++
 pipeline/beam_etl/tests/test_dimensions.py | 119 ++++++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 pipeline/beam_etl/helpers/dimensions.py
 create mode 100644 pipeline/beam_etl/tests/test_dimensions.py

diff --git a/pipeline/beam_etl/helpers/dimensions.py b/pipeline/beam_etl/helpers/dimensions.py
new file mode 100644
index 0000000..ec735b8
--- /dev/null
+++ b/pipeline/beam_etl/helpers/dimensions.py
@@ -0,0 +1,75 @@
+"""Functions to parse the dimensions string into a dictionary that represents
+those dimensions in cm
+"""
+
+import logging
+from typing import Dict, Optional
+import re
+
+
+UNIT_CONVERSIONS = {"inches": 2.54, "feet": 30.48, "cm": 1}
+
+
+def parse_dimensions_measure(dimensions: str, measure: str) -> Optional[Dict]:
+    """Using a regex, parse a measurement out of a dimensions string
+    I expect to find a value of the form `1.2 inches (W)` specifying the value,
+    unit and measurement.
+    Return a dictionary representing the parsed value and its unit.
+    Return None if the measurement is not found or its value is not a float.
+    """
+    expr = rf"(?P<value>\d*[.,]?\d*)\s+(?P<unit>[a-zA-Z]*)\s+\({measure}\)"
+    if match := re.search(expr, dimensions):
+        match_value = match.group("value")
+        try:
+            value = float(match_value)
+        except ValueError:
+            # pass the offending text as the argument for the `%s` placeholder
+            logging.error(
+                "could not parse value `%s` as a float for a dimension",
+                match_value,
+            )
+            return None
+        return {
+            "value": value,
+            "unit": match.group("unit").lower(),
+        }
+    return None
+
+
+def units_to_cm(value: float, unit: str) -> Optional[float]:
+    """Convert a given dimension unit into centimeters.
+    If unrecognized unit, return None"""
+    try:
+        conversion = UNIT_CONVERSIONS[unit]
+    except KeyError:
+        logging.error("unrecognized unit: %s", unit)
+        return None
+    return value * conversion
+
+
+def parse_dimensions(dimensions: Optional[str]) -> Dict[str, Optional[float]]:
+    """Parse a string representing dimensions
+    Return a dictionary with the `height`, `width` and `depth` in cm. A
+    measurement that is missing or cannot be converted is set to None.
+    """
+    if dimensions is None:
+        return {
+            "height": None,
+            "width": None,
+            "depth": None,
+        }
+    height = parse_dimensions_measure(dimensions, "H")
+    width = parse_dimensions_measure(dimensions, "W")
+    depth = parse_dimensions_measure(dimensions, "D")
+    parsed_dimensions = {
+        "height": height,
+        "width": width,
+        "depth": depth,
+    }
+    result = {}
+    for key, value in parsed_dimensions.items():
+        if value is None:
+            result[key] = value
+        else:
+            result[key] = units_to_cm(**value)
+    return result
diff --git a/pipeline/beam_etl/tests/test_dimensions.py b/pipeline/beam_etl/tests/test_dimensions.py
new file mode 100644
index 0000000..11514b6
--- /dev/null
+++ b/pipeline/beam_etl/tests/test_dimensions.py
@@ -0,0 +1,119 @@
+"""Test the `parse_dimensions` function and its helpers"""
+
+from helpers.dimensions import parse_dimensions, parse_dimensions_measure, units_to_cm
+
+
+def test_none():
+    """Test None value"""
+    assert parse_dimensions(None) == {"height": None, "width": None, "depth": None}
+
+
+def test_parse_dimensions():
+    """Test a normal example from the sample file"""
+    dimensions_str = "23 inches (H) x 1 inches (W) x 23 inches (D)"
+    assert parse_dimensions_measure(dimensions_str, "W") == {
+        "unit": "inches",
+        "value": 1.0,
+    }
+    assert parse_dimensions_measure(dimensions_str, "H") == {
+        "unit": "inches",
+        "value": 23.0,
+    }
+    assert parse_dimensions_measure(dimensions_str, "D") == {
+        "unit": "inches",
+        "value": 23.0,
+    }
+    assert parse_dimensions(dimensions_str) == {
+        "depth": 58.42,
+        "height": 58.42,
+        "width": 2.54,
+    }
+
+
+def test_parse_dimensions_comma():
+    """Test a normal example from the sample file, but the separator is a comma"""
+    dimensions_str = "23 inches (H), 1 inches (W), 23 inches (D)"
+    assert parse_dimensions_measure(dimensions_str, "W") == {
+        "unit": "inches",
+        "value": 1.0,
+    }
+    assert parse_dimensions_measure(dimensions_str, "H") == {
+        "unit": "inches",
+        "value": 23.0,
+    }
+    assert parse_dimensions_measure(dimensions_str, "D") == {
+        "unit": "inches",
+        "value": 23.0,
+    }
+    assert parse_dimensions(dimensions_str) == {
+        "depth": 58.42,
+        "height": 58.42,
+        "width": 2.54,
+    }
+
+
+def test_parse_dimensions_feet():
+    """Test a normal example from the sample file, but the units is feet"""
+    dimensions_str = "23 feet (H) x 1 feet (W) x 23 feet (D)"
+    assert parse_dimensions_measure(dimensions_str, "W") == {
+        "unit": "feet",
+        "value": 1.0,
+    }
+    assert parse_dimensions_measure(dimensions_str, "H") == {
+        "unit": "feet",
+        "value": 23.0,
+    }
+    assert parse_dimensions_measure(dimensions_str, "D") == {
+        "unit": "feet",
+        "value": 23.0,
+    }
+    assert parse_dimensions(dimensions_str) == {
+        "depth": 701.04,
+        "height": 701.04,
+        "width": 30.48,
+    }
+
+
+def test_parse_dimensions_missing():
+    """Test a normal example from the sample file but some measurement is missing"""
+    dimensions_str = "23 inches (H) x 23 inches (D)"
+    assert parse_dimensions_measure(dimensions_str, "W") is None
+    assert parse_dimensions_measure(dimensions_str, "H") == {
+        "unit": "inches",
+        "value": 23.0,
+    }
+    assert parse_dimensions_measure(dimensions_str, "D") == {
+        "unit": "inches",
+        "value": 23.0,
+    }
+    assert parse_dimensions(dimensions_str) == {
+        "depth": 58.42,
+        "height": 58.42,
+        "width": None,
+    }
+
+
+def test_parse_dimensions_measure_unparsable(caplog):
+    """Test that a value that is not a float is reported and parsed as None"""
+    assert parse_dimensions_measure("1,5 inches (W)", "W") is None
+    assert "`1,5`" in caplog.text
+
+
+def test_units_to_cm_inches():
+    """Test `units_to_cm` where the units is inches"""
+    assert units_to_cm(value=0.5, unit="inches") == 1.27
+
+
+def test_units_to_cm_feet():
+    """Test `units_to_cm` where the units is feet"""
+    assert units_to_cm(value=0.5, unit="feet") == 15.24
+
+
+def test_units_to_cm_cm():
+    """Test `units_to_cm` where the units is already cm"""
+    assert units_to_cm(value=0.5, unit="cm") == 0.5
+
+
+def test_units_to_cm_unrecognized():
+    """Test `units_to_cm` where the units are not recognized"""
+    assert units_to_cm(value=0.5, unit="yard") is None