diff --git a/pipeline/beam_etl/helpers/materials.py b/pipeline/beam_etl/helpers/materials.py
new file mode 100644
index 0000000..e41614f
--- /dev/null
+++ b/pipeline/beam_etl/helpers/materials.py
@@ -0,0 +1,104 @@
+"""Functions to parse the materials string into a list of materials that we can
+score later.
+
+Scoreable materials are:
+    * metal
+    * wood
+    * glass
+    * resin
+    * fabric
+    * plastic
+
+Materials found in the specification are matched to one of the scoreable ones
+where possible. A few, like "stoneware" and "cardboard", have no scoreable
+equivalent and remain unscored for now.
+"""
+
+from typing import Optional, List
+import re
+
+# Keywords are tried in insertion order; the first one found in a name wins.
+# NOTE(review): "resin" is listed as scoreable in the module docstring but is
+# mapped to "plastic" here -- confirm this is intended.
+# NOTE(review): "metal", "glass", "fabric" and "plastic" are not keywords, so
+# e.g. "tempered glass" is returned unclassified -- confirm this is intended.
+MATERIAL_MAPPING = {
+    "polyester": "fabric",
+    "spandex": "fabric",
+    "leather": "fabric",
+    "crystal": "glass",
+    "hardwood": "wood",
+    "plywood": "wood",
+    "mdf": "wood",
+    "wood": "wood",
+    "steel": "metal",
+    "polycarbonate": "plastic",
+    "polypropylene": "plastic",
+    "pvc": "plastic",
+    "resin": "plastic",
+    "stoneware": "stoneware",
+    "cardboard": "cardboard",
+    "paper": "cardboard",
+}
+
+# Innermost "(...)" group; applied repeatedly so nested annotations go too.
+_ANNOTATION_RE = re.compile(r"\([^()]*\)")
+# An amount such as "83", "12.5" or "83 %", including its optional percent sign.
+_AMOUNT_RE = re.compile(r"\d+(?:\.\d+)?(?:\s*%)?")
+_WHITESPACE_RE = re.compile(r"\s+")
+
+
+def _strip_annotations(text: str) -> str:
+    """Remove every parenthesised annotation from ``text``, nested ones included."""
+    previous = None
+    while previous != text:
+        previous = text
+        text = _ANNOTATION_RE.sub("", text)
+    return text
+
+
+def material_classifier(material: str) -> str:
+    """Map a material name to one of the standardized materials.
+
+    The name is matched case-insensitively against the keywords of
+    ``MATERIAL_MAPPING`` and the category of the first keyword found inside it
+    is returned. There is a fair amount of variation in the names, hence the
+    substring match instead of an exact lookup.
+
+    If no keyword matches, the input string itself is returned unchanged.
+    """
+    lowered = material.lower()
+    for keyword, category in MATERIAL_MAPPING.items():
+        if keyword in lowered:
+            return category
+    return material
+
+
+def clean_material_name(material: str) -> str:
+    """Normalize a single material name so that names are more homogeneous.
+
+    The cleaning consists of:
+        * removing annotations found inside parentheses
+        * removing amounts and percentages (decimals included)
+        * collapsing runs of whitespace and trimming the ends
+        * lowercasing the name
+    """
+    no_annotations = _strip_annotations(material)
+    no_amounts = _AMOUNT_RE.sub("", no_annotations)
+    return _WHITESPACE_RE.sub(" ", no_amounts).strip().lower()
+
+
+def parse_materials(materials: Optional[str]) -> Optional[List[str]]:
+    """Parse a materials string from raw_specifications into standardized names.
+
+    Annotations are stripped before splitting on commas, so a comma inside a
+    parenthesised annotation does not produce a bogus material. Entries that
+    are empty after cleaning are dropped.
+
+    Returns None when ``materials`` is None, otherwise a sorted list of unique
+    material names.
+    """
+    if materials is None:
+        return None
+    names = (clean_material_name(x) for x in _strip_annotations(materials).split(","))
+    return sorted({material_classifier(name) for name in names if name})
diff --git a/pipeline/beam_etl/tests/test_materials.py b/pipeline/beam_etl/tests/test_materials.py
new file mode 100644
index 0000000..dde291f
--- /dev/null
+++ b/pipeline/beam_etl/tests/test_materials.py
@@ -0,0 +1,50 @@
+"""Test the `parse_materials` function and its helpers"""
+
+from helpers.materials import parse_materials, clean_material_name, material_classifier
+
+
+def test_none():
+    """Test None value"""
+    assert parse_materials(None) is None
+
+
+def test_amounts():
+    """Test a materials string containing amounts"""
+    assert parse_materials(" 83% Recycled Polyester, 17% Spandex") == ["fabric"]
+    assert clean_material_name(" 83% Recycled Polyester") == "recycled polyester"
+    assert clean_material_name("12.5% Bamboo") == "bamboo"
+    assert material_classifier("recycled polyester") == "fabric"
+
+
+def test_annotations():
+    """Test a materials string containing annotations between parentheses"""
+    assert parse_materials(" Cardboard (Frame)") == ["cardboard"]
+    assert clean_material_name(" Cardboard (Frame)") == "cardboard"
+    assert clean_material_name("Oak (Top) Steel (Legs)") == "oak steel"
+    assert material_classifier("cardboard") == "cardboard"
+
+
+def test_comma_inside_annotation():
+    """Test that a comma inside an annotation does not split the material"""
+    assert parse_materials("MDF (Fiberboard, Veneer), Metal") == ["metal", "wood"]
+
+
+def test_empty_entries():
+    """Test that empty entries are dropped"""
+    assert parse_materials("") == []
+    assert parse_materials("Wood, , Metal,") == ["metal", "wood"]
+
+
+def test_keyword():
+    """Check that a keyword embedded in a longer material name is matched."""
+    name = "walnut wood"
+    assert parse_materials(" Walnut Wood (Frame)") == ["wood"]
+    assert clean_material_name(name) == name
+    assert material_classifier(name) == "wood"
+
+
+def test_multiple_materials():
+    """Check that two different materials come back as a sorted list."""
+    raw = " MDF (Medium-Density Fiberboard), Metal (Frame)"
+    parsed = parse_materials(raw)
+    assert parsed == ["metal", "wood"]