# dayrize-usecase/pipeline/beam_etl/helpers/parse_row.py
"""Helper function to parse rows into cleaner data that can be inserted into
the destination database"""
import logging
from typing import TypedDict, Dict, Optional, List
from helpers.parse_xml import parse_raw_specs
from helpers.materials import parse_materials
from helpers.origin import clean_origin_name
from helpers.dimensions import parse_dimensions
from helpers.weight import parse_weight, dimensional_weight
class CleanRow(TypedDict):
    """Type to represent clean rows to be inserted in the database.

    This is the shape of the dictionary that ``parse_row`` returns for every
    input row it manages to parse.
    """

    # Mandatory identifiers: parse_row rejects the row when any is missing.
    gtin13: str
    tcin: str
    primary_category: str
    # Result of parse_materials on the "materials" specification.
    materials: Optional[List[str]]
    # Units per package; parse_row falls back to 1 when the specification is
    # absent or cannot be cast to an integer.
    packaging: int
    # Result of clean_origin_name on the "origin" specification.
    origin: str
    # Values taken from parse_dimensions on the "dimensions" specification.
    height: Optional[float]
    depth: Optional[float]
    width: Optional[float]
    # Result of parse_weight, or of dimensional_weight when parse_weight
    # yields None.
    weight: Optional[float]
def parse_row(element: Dict[str, str]) -> Optional[CleanRow]:
    """Parse a raw CSV row into a cleaner row to be inserted in the database.

    Args:
        element: Dictionary representing one row of the CSV input. The keys
            ``gtin13``, ``primary_category`` and ``raw_specifications`` are
            read from it.

    Returns:
        A ``CleanRow`` dictionary, or ``None`` (after logging an error) when
        ``gtin13``, ``primary_category`` or ``raw_specifications`` is missing,
        when the raw specifications cannot be parsed, or when the parsed
        specifications contain no ``tcin``.
    """
    # gtin13 should always be there
    try:
        gtin13 = element["gtin13"]
    except KeyError:
        logging.error("gtin13 missing")
        return None

    # primary category should always be there
    try:
        primary_category = element["primary_category"]
    except KeyError:
        logging.error("primary_category missing")
        return None

    # Every remaining field is derived from the raw specifications, so a row
    # without them is rejected the same way as the other mandatory fields
    # instead of raising KeyError.
    try:
        raw_specifications = element["raw_specifications"]
    except KeyError:
        logging.error("raw_specifications missing")
        return None

    specifications = parse_raw_specs(raw_specifications)
    if specifications is None:
        logging.error("could not parse raw_specifications")
        return None

    # TCIN should be a mandatory field
    try:
        tcin = specifications["tcin"]
    except KeyError:
        logging.error("TCIN missing")
        return None

    materials = parse_materials(specifications.get("materials"))

    # if packaging is not specified, assume only one unit is found in the
    # package
    raw_packaging = specifications.get("packaging", 1)
    try:
        packaging = int(raw_packaging)
    except (TypeError, ValueError):
        # TypeError covers a key that is present but holds None.
        logging.error(
            "could not cast packaging %s into an integer", raw_packaging
        )
        packaging = 1

    origin = clean_origin_name(specifications.get("origin"))

    # NOTE(review): assumes parse_dimensions always returns a mapping with
    # "height", "width" and "depth" keys, even for a None input -- confirm.
    dimensions = parse_dimensions(specifications.get("dimensions"))
    height = dimensions["height"]
    width = dimensions["width"]
    depth = dimensions["depth"]

    weight = parse_weight(specifications.get("weight"))
    if weight is None:
        # No usable weight: derive one from the dimensions. The width passed
        # here must be the parsed width, not the (None) weight.
        weight = dimensional_weight(height=height, width=width, depth=depth)

    return {
        "gtin13": gtin13,
        "tcin": tcin,
        "primary_category": primary_category,
        "materials": materials,
        "packaging": packaging,
        "origin": origin,
        "height": height,
        "width": width,
        "depth": depth,
        "weight": weight,
    }