NOTE(review): this patch was re-flowed from a copy in which the line breaks
had been collapsed. Added (+) lines are authoritative. Context lines were
re-split by hand and should be confirmed against the target files before
applying -- in particular the docstring context in helpers/parse_xml.py and
the `xml_str` literal in tests/test_parse_xml.py, whose original content
could not be recovered with certainty. The blob ids on the `index` lines
predate the review fixes made to helpers/weight.py and tests/test_weight.py.

diff --git a/pipeline/beam_etl/helpers/misc.py b/pipeline/beam_etl/helpers/misc.py
index fdf9f34..d928b01 100644
--- a/pipeline/beam_etl/helpers/misc.py
+++ b/pipeline/beam_etl/helpers/misc.py
@@ -8,10 +8,11 @@ def convert_units(
     value: float, unit: str, unit_conversions: Dict[str, float]
 ) -> Optional[float]:
     """Convert a given value and unit into a different unit.
+    Round the returned value to 2 decimal places.
     If unrecognized unit, return None"""
     try:
         conversion = unit_conversions[unit]
     except KeyError:
         logging.error("unrecognized unit: %s", unit)
         return None
-    return value * conversion
+    return round(value * conversion, 2)
diff --git a/pipeline/beam_etl/helpers/parse_xml.py b/pipeline/beam_etl/helpers/parse_xml.py
index 1cab2d8..2367c9a 100644
--- a/pipeline/beam_etl/helpers/parse_xml.py
+++ b/pipeline/beam_etl/helpers/parse_xml.py
@@ -21,7 +21,7 @@ they refer to the same thing.
 
 import logging
 import xml.etree.ElementTree as ET
-from typing import Dict
+from typing import Dict, Optional
 
 FIELDS_MAPPING = {
     "Material": "materials",
@@ -55,7 +55,7 @@ def iter_parse(root: ET.Element) -> Dict[str, str]:
     return spec_dict
 
 
-def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
+def parse_raw_specs(raw_specs: str) -> Optional[Dict[str, str]]:
     """Parse a raw specifications XML string into a dictionary.
     This involves first recursively parsing the XML tree and then renaming the
     key values"""
@@ -64,7 +64,7 @@ def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
         xml_root = ET.fromstring(raw_specs)
     except ET.ParseError:
         logging.error("error parsing xml string: \n%s", raw_specs)
-        return {}
+        return None
 
     parsed = iter_parse(xml_root)
     specs_dict = {
diff --git a/pipeline/beam_etl/helpers/weight.py b/pipeline/beam_etl/helpers/weight.py
new file mode 100644
index 0000000..e3035e7
--- /dev/null
+++ b/pipeline/beam_etl/helpers/weight.py
@@ -0,0 +1,55 @@
+"""Functions to parse a weight string and to compute a dimensional weight,
+both expressed in g
+"""
+
+import logging
+import re
+from typing import Optional
+
+from helpers.misc import convert_units
+
+
+UNIT_CONVERSIONS = {"pounds": 453.592, "ounces": 28.3495, "g": 1, "kg": 1000}
+
+
+def parse_weight(weight: Optional[str]) -> Optional[float]:
+    """Parse a weight string such as `0.65 pounds` into its value in g.
+    If the string is None, does not match the expected format or uses an
+    unrecognized unit, return None"""
+    if weight is None:
+        return None
+    expr = r"(?P<value>\d*[.,]?\d*)\s+(?P<unit>[a-zA-Z]*)"
+
+    # strip is needed to prevent the regex from lazily
+    # matching just from the first whitespace separator,
+    # this could happen because the number part in the
+    # expression is technically all optional, to avoid
+    # an expression too complex and unreadable
+    if match := re.search(expr, weight.strip()):
+        match_value = match.group("value")
+        try:
+            value = float(match_value)
+        except ValueError:
+            logging.error(
+                "could not parse value `%s` as a float for a weight", match_value
+            )
+            return None
+        unit = match.group("unit").lower()
+        return convert_units(value, unit, unit_conversions=UNIT_CONVERSIONS)
+
+    return None
+
+
+def dimensional_weight(
+    height: Optional[float], width: Optional[float], depth: Optional[float]
+) -> Optional[float]:
+    """The dimensional weight (in kg) is calculated as:
+    Length * Height * Width (in cm) / 5000.
+    We'll return it in g here.
+    If any of the dimensions is None, return None"""
+    if height is None or width is None or depth is None:
+        return None
+    dimensional_weight_kg = height * width * depth / 5000
+    return convert_units(
+        value=dimensional_weight_kg, unit="kg", unit_conversions=UNIT_CONVERSIONS
+    )
diff --git a/pipeline/beam_etl/tests/test_convert_units.py b/pipeline/beam_etl/tests/test_convert_units.py
index cee3f4e..8798e8d 100644
--- a/pipeline/beam_etl/tests/test_convert_units.py
+++ b/pipeline/beam_etl/tests/test_convert_units.py
@@ -1,6 +1,7 @@
 """Test the `convert_units`"""
 
 from helpers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
+from helpers.weight import UNIT_CONVERSIONS as weight_unit_conversions
 from helpers.misc import convert_units
 
 
@@ -21,4 +22,29 @@ def test_units_to_cm_cm():
 
 def test_units_to_cm_unrecognized():
     """Test `convert_units` to cm from unrecognized unit"""
-    assert convert_units(0.5, "yard", dimension_unit_conversions) is None
+    assert convert_units(0.5, "yards", dimension_unit_conversions) is None
+
+
+def test_units_to_g_pounds():
+    """Test `convert_units` to g from pounds"""
+    assert convert_units(0.5, "pounds", weight_unit_conversions) == 226.8
+
+
+def test_units_to_g_ounces():
+    """Test `convert_units` to g from ounces"""
+    assert convert_units(0.5, "ounces", weight_unit_conversions) == 14.17
+
+
+def test_units_to_g_g():
+    """Test `convert_units` to g from g"""
+    assert convert_units(0.5, "g", weight_unit_conversions) == 0.5
+
+
+def test_units_to_g_kg():
+    """Test `convert_units` to g from kg"""
+    assert convert_units(0.5, "kg", weight_unit_conversions) == 500
+
+
+def test_units_to_g_unrecognized():
+    """Test `convert_units` to g from unrecognized unit"""
+    assert convert_units(0.5, "mg", weight_unit_conversions) is None
diff --git a/pipeline/beam_etl/tests/test_parse_xml.py b/pipeline/beam_etl/tests/test_parse_xml.py
index 2a276cb..e8639d4 100644
--- a/pipeline/beam_etl/tests/test_parse_xml.py
+++ b/pipeline/beam_etl/tests/test_parse_xml.py
@@ -150,4 +150,4 @@ def test_parse_raw_specs4():
 def test_malformed_xml():
     """Test al maformed xml string"""
     xml_str = "\nfoo"
-    assert parse_raw_specs(xml_str) == {}
+    assert parse_raw_specs(xml_str) is None
diff --git a/pipeline/beam_etl/tests/test_weight.py b/pipeline/beam_etl/tests/test_weight.py
new file mode 100644
index 0000000..aa87b5f
--- /dev/null
+++ b/pipeline/beam_etl/tests/test_weight.py
@@ -0,0 +1,59 @@
+"""Test the `parse_weight` and `dimensional_weight`"""
+
+from helpers.weight import parse_weight, dimensional_weight
+
+
+def test_parse_weight_none():
+    """Test None value"""
+    assert parse_weight(None) is None
+
+
+def test_parse_weight():
+    """Test one value from the sample file"""
+    assert parse_weight("0.65 pounds") == 294.83
+
+
+def test_parse_weight_only_decimals():
+    """Test a value with only decimals specified"""
+    assert parse_weight(".28 pounds") == 127.01
+
+
+def test_parse_weight_no_decimals():
+    """Test a value without decimals"""
+    assert parse_weight("44 pounds") == 19958.05
+
+
+def test_parse_weight_ounces():
+    """Test a value with ounces"""
+    assert parse_weight("0.65 ounces") == 18.43
+
+
+def test_parse_weight_unrecognized():
+    """Test a value where the unit is not recognized"""
+    assert parse_weight("0.65 mg") is None
+
+
+def test_parse_weight_unparseable_value(caplog):
+    """Test a value matched by the regex that cannot be converted to a float"""
+    assert parse_weight("0,65 pounds") is None
+    assert "0,65" in caplog.text
+
+
+def test_dimensional_weight():
+    """Test the dimensional weight calculation"""
+    assert dimensional_weight(height=1, width=2, depth=3) == 1.2
+
+
+def test_dimensional_weight_none():
+    """Test the dimensional weight calculation where there may be nones in the
+    input"""
+
+    assert dimensional_weight(height=None, width=2, depth=3) is None
+    assert dimensional_weight(height=1, width=None, depth=3) is None
+    assert dimensional_weight(height=1, width=2, depth=None) is None
+
+    assert dimensional_weight(height=None, width=None, depth=3) is None
+    assert dimensional_weight(height=1, width=None, depth=None) is None
+    assert dimensional_weight(height=None, width=2, depth=None) is None
+
+    assert dimensional_weight(height=None, width=None, depth=None) is None