# File: dayrize-usecase/pipeline/beam_etl/helpers/parse_row.py

"""Helper function to parse rows into cleaner data that can be inserted into
the destination database"""
import logging
from typing import TypedDict, Dict, Optional, List
from helpers.parse_xml import parse_raw_specs
from helpers.materials import parse_materials
from helpers.origin import clean_origin_name
from helpers.dimensions import parse_dimensions
from helpers.weight import parse_weight, dimensional_weight
class CleanRow(TypedDict):
    """Type to represent clean rows to be inserted in the database.

    This is the dictionary shape returned by ``parse_row`` for every input row
    that passes validation.
    """

    # Copied unchanged from the "gtin13" key of the input row.
    gtin13: str
    # Taken from the "tcin" entry of the parsed raw specifications.
    tcin: str
    # Copied unchanged from the "primary_category" key of the input row.
    primary_category: str
    # Result of parse_materials() on the "materials" specification.
    materials: Optional[List[str]]
    # Units per package; parse_row falls back to 1 when the specification is
    # absent or cannot be converted to an integer.
    packaging: int
    # Result of clean_origin_name() on the "origin" specification.
    # NOTE(review): the specification may be absent, in which case None is
    # passed to clean_origin_name; confirm it always returns a str.
    origin: str
    # height / depth / width come from the dictionary returned by
    # parse_dimensions() on the "dimensions" specification.
    height: Optional[float]
    depth: Optional[float]
    width: Optional[float]
    # Result of parse_weight(); when that is None, parse_row substitutes the
    # value returned by dimensional_weight().
    weight: Optional[float]
def parse_row(element: Dict[str, str]) -> Optional[CleanRow]:
    """Parse a raw CSV row into a clean row ready for database insertion.

    Args:
        element: Mapping of column name to raw value for one input row. The
            keys ``gtin13``, ``primary_category`` and ``raw_specifications``
            are mandatory.

    Returns:
        A ``CleanRow`` dictionary, or ``None`` when the row has to be
        discarded: a mandatory column is missing, the raw specifications
        cannot be parsed, or the specifications carry no ``tcin``. The reason
        is logged as an error in every such case.
    """
    # Mandatory columns: reject the row as soon as one of them is absent so
    # that a malformed row is skipped instead of raising KeyError.
    for column in ("gtin13", "primary_category", "raw_specifications"):
        if column not in element:
            logging.error("%s missing", column)
            return None

    gtin13 = element["gtin13"]
    primary_category = element["primary_category"]

    specifications = parse_raw_specs(element["raw_specifications"])
    if specifications is None:
        logging.error("could not parse raw_specifications")
        return None

    # TCIN should be a mandatory field
    try:
        tcin = specifications["tcin"]
    except KeyError:
        logging.error("TCIN missing")
        return None

    materials = parse_materials(specifications.get("materials"))

    # If packaging is not specified, assume only one unit is found in the
    # package. The same default is used when the value is present but cannot
    # be converted (ValueError for bad text, TypeError for e.g. None).
    raw_packaging = specifications.get("packaging", 1)
    try:
        packaging = int(raw_packaging)
    except (TypeError, ValueError):
        logging.error(
            "could not cast packaging %s into an integer", raw_packaging
        )
        packaging = 1

    origin = clean_origin_name(specifications.get("origin"))

    dimensions = parse_dimensions(specifications.get("dimensions"))
    height = dimensions["height"]
    width = dimensions["width"]
    depth = dimensions["depth"]

    weight = parse_weight(specifications.get("weight"))
    if weight is None:
        # No usable weight in the specifications: derive one from the parsed
        # dimensions instead.
        weight = dimensional_weight(height=height, width=width, depth=depth)

    return {
        "gtin13": gtin13,
        "tcin": tcin,
        "primary_category": primary_category,
        "materials": materials,
        "packaging": packaging,
        "origin": origin,
        "height": height,
        "width": width,
        "depth": depth,
        "weight": weight,
    }