46 lines
1.2 KiB
Python
46 lines
1.2 KiB
Python
"""Functions to clean and standardize the origin string
|
|
|
|
In the sample data, the only strings describing the origin I found were:
|
|
* made in the usa
|
|
* imported
|
|
* assem usa w/foreign/dom. parts
|
|
* made in the usa or imported
|
|
|
|
Just with different combinations of lower and upper case and surrounded by more
|
|
or less whitespace.
|
|
|
|
I'll simplify it into "usa" for local products, "imported" for imported ones
|
|
and "mixed" for the rest (and I'll later score it as 0.5).
|
|
|
|
Any other value will be logged as an error and assigned "mixed".
|
|
|
|
There should not be missing origin strings, but if a `None` value occurs, set it to
|
|
`mixed` also.
|
|
|
|
"""
|
|
|
|
from typing import Optional
|
|
import logging
|
|
|
|
# Lookup table from a normalized (lower-cased, whitespace-stripped) raw origin
# string to its standardized label: "usa", "imported" or "mixed".
ORIGIN_MAPPING = {
    # Assembled locally from a blend of foreign and domestic parts.
    "assem usa w/foreign/dom. parts": "mixed",
    # Entirely imported product.
    "imported": "imported",
    # Entirely local product.
    "made in the usa": "usa",
    # Source is ambiguous between local and imported.
    "made in the usa or imported": "mixed",
}
|
|
|
|
|
|
def clean_origin_name(origin: Optional[str]) -> str:
    """Translate a raw product-origin string into its standardized label.

    The raw value is lower-cased, stripped of surrounding whitespace and then
    looked up in ``ORIGIN_MAPPING``.

    Args:
        origin: Raw origin text, or ``None`` when the origin is missing.

    Returns:
        The label stored in ``ORIGIN_MAPPING`` for the normalized string.
        Returns ``"mixed"`` when ``origin`` is ``None`` or when the
        normalized string is not a key of ``ORIGIN_MAPPING``; both cases are
        logged as errors.
    """
    fallback = "mixed"

    # Guard: a missing origin is reported and mapped to the fallback label.
    if origin is None:
        logging.error("origin string not found, setting it to `mixed`")
        return fallback

    lowered = origin.lower()
    normalized = lowered.strip()

    # Guard: unknown strings are reported (in normalized form) and also fall
    # back to "mixed".
    if normalized not in ORIGIN_MAPPING:
        logging.error(
            "could not parse origin `%s`, setting it to `mixed`", normalized
        )
        return fallback

    return ORIGIN_MAPPING[normalized]
|