feat: added clean_origin_name

main
Ricard Illa 2023-06-22 17:32:08 +02:00
parent b018abe00e
commit 050323d583
2 changed files with 94 additions and 0 deletions

View File

@ -0,0 +1,45 @@
"""Functions to clean and standardize the origin string
In the sample data, the only strings describing the origin I found were
* made in the usa
* imported
* assem usa w/foreign/dom. parts
* made in the usa or imported
Just with different combinations of lower and upper case and surrounded by more
or less whitespace.
I'll simplify it into "usa" for local products, "imported" for imported ones
and "mixed" for the rest (and I'll later score it as 0.5).
Any other value will be logged as an error and assigned "mixed".
There should not be missing origin strings, but if a `None` does occur, it is
logged as an error and set to "mixed" as well.
"""
from typing import Optional
import logging
# Lookup table from a normalized (lower-cased, whitespace-stripped) raw origin
# string to its canonical category: "usa", "imported" or "mixed".
ORIGIN_MAPPING = {
    # Assembled in the USA from foreign and domestic parts: neither fully
    # local nor fully imported.
    "assem usa w/foreign/dom. parts": "mixed",
    "imported": "imported",
    "made in the usa": "usa",
    # Ambiguous label: the product may be either local or imported.
    "made in the usa or imported": "mixed",
}
def clean_origin_name(origin: Optional[str]) -> str:
    """Clean and standardize a product origin string.

    The input is lower-cased and stripped of surrounding whitespace, then
    looked up in `ORIGIN_MAPPING`.

    Args:
        origin: Raw origin string, or `None` when it is missing.

    Returns:
        The category found in `ORIGIN_MAPPING` for the normalized string.
        `"mixed"` is returned (and an error is logged) when `origin` is
        `None` or when the normalized string is not a known key.
    """
    # Guard clause: a missing origin is reported and treated as "mixed".
    if origin is None:
        logging.error("origin string not found, setting it to `mixed`")
        return "mixed"

    normalized = origin.lower().strip()
    category = ORIGIN_MAPPING.get(normalized)
    if category is None:
        # Unknown label: report the normalized value and fall back.
        logging.error(
            "could not parse origin `%s`, setting it to `mixed`", normalized
        )
        return "mixed"
    return category

View File

@ -0,0 +1,49 @@
"""Test the `clean_origin_name` function"""
from helpers.origin import clean_origin_name
def test_none():
    """A missing origin (`None`) is cleaned to "mixed"."""
    cleaned = clean_origin_name(None)
    assert cleaned == "mixed"
def test_unexpected():
    """An origin string that is not a known label is cleaned to "mixed"."""
    cleaned = clean_origin_name("foo")
    assert cleaned == "mixed"
def test_clean_origin_name0():
    """The assembled-in-USA label with a leading space maps to "mixed"."""
    cleaned = clean_origin_name(" Assem USA w/foreign/dom. parts")
    assert cleaned == "mixed"
def test_clean_origin_name1():
    """Capitalized "Imported" with a leading space maps to "imported"."""
    cleaned = clean_origin_name(" Imported")
    assert cleaned == "imported"
def test_clean_origin_name2():
    """Capitalized "Made in the USA" with a leading space maps to "usa"."""
    cleaned = clean_origin_name(" Made in the USA")
    assert cleaned == "usa"
def test_clean_origin_name3():
    """Capitalized "Made in the USA or Imported" maps to "mixed"."""
    cleaned = clean_origin_name(" Made in the USA or Imported")
    assert cleaned == "mixed"
def test_clean_origin_name4():
    """Lower-case "imported" with a leading space maps to "imported"."""
    cleaned = clean_origin_name(" imported")
    assert cleaned == "imported"
def test_clean_origin_name5():
    """Lower-case "made in the USA" with a leading space maps to "usa"."""
    cleaned = clean_origin_name(" made in the USA")
    assert cleaned == "usa"
def test_clean_origin_name6():
    """Lower-case "made in the USA or imported" maps to "mixed"."""
    cleaned = clean_origin_name(" made in the USA or imported")
    assert cleaned == "mixed"