dayrize-usecase/pipeline/beam_etl/helpers/materials.py

82 lines
2.2 KiB
Python
Raw Normal View History

2023-06-22 17:11:31 +02:00
"""Functions to parse the materials string into a list of materials that we can
score later
Scoreable materials are:
* metal
* wood
* glass
* resin
* fabric
* plastic
I will try to match materials found in the specification to one of the
scoreable ones. However, I found a few, like "stoneware" and "cardboard", that I
can't find there; they'll have to remain unscored for now.
"""
from typing import Optional, List
import re
# Keyword -> standardized material category.
# material_classifier scans these entries in insertion order and returns the
# category of the first keyword found, so the order of the entries matters.
# NOTE(review): the docstrings in this module list "resin" and "paper" as
# categories of their own, but here they are folded into "plastic" and
# "cardboard" respectively -- confirm this is intended.
MATERIAL_MAPPING = {
    # fabrics
    "polyester": "fabric",
    "spandex": "fabric",
    "leather": "fabric",
    # glass
    "crystal": "glass",
    # woods
    "hardwood": "wood",
    "plywood": "wood",
    "mdf": "wood",
    "wood": "wood",
    # metals
    "steel": "metal",
    # plastics
    "polycarbonate": "plastic",
    "polypropylene": "plastic",
    "pvc": "plastic",
    "resin": "plastic",
    # categories outside the scoreable list
    "stoneware": "stoneware",
    "cardboard": "cardboard",
    "paper": "cardboard",
}
def material_classifier(material: str) -> str:
    """Map a material name onto a standardized material category.

    There's a fair amount of variation in the names, so this only checks
    whether one of the keywords in MATERIAL_MAPPING occurs anywhere inside
    the given name. Keywords are tried in the mapping's insertion order and
    the category of the first one found is returned.

    The comparison ignores letter case, so "Stainless Steel" and
    "stainless steel" are classified the same way.

    Args:
        material: The material name to classify.

    Returns:
        The category of the first matching keyword, or the input string
        itself (unchanged) when no keyword matches.
    """
    # The mapping keys are all lowercase; fold the input so matching does not
    # depend on the caller having lowercased the name first.
    lowered = material.lower()
    for keyword, category in MATERIAL_MAPPING.items():
        if keyword in lowered:
            return category
    return material
def clean_material_name(material: str) -> str:
    """Normalize a raw material name so that names are more homogeneous.

    The cleaning steps are:
    * remove annotations found inside parentheses (including nested ones)
    * remove amounts and percentages such as "80%", "12.5%" or "5 %"
    * strip surrounding whitespace and lowercase the result

    Args:
        material: The raw material name.

    Returns:
        The cleaned, lowercase material name. It may be an empty string if
        nothing but annotations and amounts was present.
    """
    cleaned = material
    # Remove innermost "(...)" groups until none are left. A single greedy
    # r"\(.*\)" would also delete the real text sitting between two separate
    # annotations, e.g. "polyester (recycled) blend (soft)".
    while True:
        cleaned, removed = re.subn(r"\([^()]*\)", "", cleaned)
        if not removed:
            break
    # Amounts may carry a decimal part and a space before the percent sign;
    # match them as a whole so that no stray "." or "%" is left behind.
    no_amounts = re.sub(r"\d+(?:\.\d+)?(?:\s*%)?", "", cleaned)
    return no_amounts.strip().lower()
def parse_materials(materials: Optional[str]) -> Optional[List[str]]:
    """Parse a string of materials as specified in raw_specifications into a
    list of standardized material names.

    The string is split on commas, each piece is cleaned with
    clean_material_name and then classified with material_classifier.

    Args:
        materials: Comma-separated material names, or None.

    Returns:
        None when the input is None; otherwise a sorted list of unique
        material names (sorted for more consistency). Pieces that are empty
        after cleaning (e.g. from a trailing comma or an empty input string)
        are left out, so the list may be empty.
    """
    if materials is None:
        return None
    # Split on commas, except those inside a parenthesized annotation such as
    # "wood (oak, pine)", so an annotation is never cut in half.
    pieces = re.split(r",(?![^()]*\))", materials)
    cleaned_names = (clean_material_name(piece) for piece in pieces)
    # Skip names that are empty after cleaning; "" is not a material.
    unique_materials = {
        material_classifier(name) for name in cleaned_names if name
    }
    return sorted(unique_materials)