"""Functions to clean and standardize the origin string In the sample data, the only strings describing the origin I found were * made in the usa * imported * assem usa w/foreign/dom. parts * made in the usa or imported Just with different combinations of lower and upper case and surrounded by more or less whitespace. I'll simplify it into "usa" for local products, "imported" for imported ones and "mixed" for the rest (and I'll later score it as 0.5). Any other value will be logged as an error and assigned "mixed" There should not be missing origin strings, but if a "None" happens, set it to `mixed` also. """ from typing import Optional import logging ORIGIN_MAPPING = { "assem usa w/foreign/dom. parts": "mixed", "imported": "imported", "made in the usa": "usa", "made in the usa or imported": "mixed", } def clean_origin_name(origin: Optional[str]) -> str: """Clean and standardize product origin""" if origin is None: logging.error("origin string not found, setting it to `mixed`") return "mixed" origin = origin.lower().strip() try: return ORIGIN_MAPPING[origin] except KeyError: logging.error("could not parse origin `%s`, setting it to `mixed`", origin) return "mixed"