dayrize-usecase/pipeline/beam_etl/helpers/origin.py

46 lines
1.2 KiB
Python

"""Functions to clean and standardize the origin string
In the sample data, the only strings I found describing the origin were:

* made in the usa
* imported
* assem usa w/foreign/dom. parts
* made in the usa or imported

They only differ in their combination of lower and upper case and in the
amount of surrounding whitespace.

I simplify them into "usa" for local products, "imported" for imported ones
and "mixed" for the rest (which I'll later score as 0.5).
Any other value is logged as an error and assigned "mixed".
There should not be missing origin strings, but if a `None` does occur, it is
also set to "mixed".
"""
from typing import Optional
import logging
# Lookup table: normalized raw origin text (lower-case, no surrounding
# whitespace) -> canonical origin category ("usa", "imported" or "mixed").
ORIGIN_MAPPING = {
    "assem usa w/foreign/dom. parts": "mixed",
    "imported": "imported",
    "made in the usa": "usa",
    "made in the usa or imported": "mixed",
}
def clean_origin_name(origin: Optional[str]) -> str:
    """Map a raw product-origin string onto its canonical category.

    The input is lower-cased and stripped of surrounding whitespace before
    being looked up in ``ORIGIN_MAPPING``.

    Args:
        origin: Raw origin text, or ``None`` when the value is missing.

    Returns:
        The category stored in ``ORIGIN_MAPPING`` for the normalized text.
        ``"mixed"`` is returned, after logging an error, when ``origin`` is
        ``None`` or when the normalized text is not a key of the mapping.
    """
    fallback = "mixed"

    # Missing value: report it and fall back to the neutral category.
    if origin is None:
        logging.error("origin string not found, setting it to `mixed`")
        return fallback

    lowered = origin.lower()
    key = lowered.strip()
    if key in ORIGIN_MAPPING:
        return ORIGIN_MAPPING[key]

    # Unknown value: the normalized text (not the raw input) is what gets logged.
    logging.error("could not parse origin `%s`, setting it to `mixed`", key)
    return fallback