refactor: clean up the ETL's code structure

main
Ricard Illa 2023-06-26 12:36:19 +02:00
parent b28ddc350d
commit 39d279f089
19 changed files with 83 additions and 46 deletions

View File

@@ -0,0 +1,5 @@
"""Helper classes for the pipeline"""
from helpers.upsert_products_to_pg import UpsertProductsToPg
from helpers.read_from_csv import ReadFromCsv
from helpers.process_rows import ProcessRows
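Collecting these re-exports in the package's __init__.py lets callers pull all three DoFns from the package root, which is what the updated main module below switches to. A minimal sketch of the before/after import (the old data_io path is taken from the main-module diff further down):

# before this commit: per-module imports
# from helpers.data_io import ReadFromCsv, UpsertProductsToPg
# after: everything comes from the package root
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows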

View File

@@ -0,0 +1,4 @@
"""Helper parser functions to extract and clean data from the input CSV file
Only `parse_row` needs to be exported."""
from helpers.parsers.parse_row import parse_row

View File

@@ -6,7 +6,7 @@ import logging
from typing import Dict, Optional
import re
from helpers.misc import convert_units
from helpers.parsers.misc import convert_units
UNIT_CONVERSIONS = {"inches": 2.54, "feet": 30.48, "cm": 1}

View File

@@ -4,11 +4,11 @@ the destination database"""
import logging
from typing import TypedDict, Dict, Optional, List
from helpers.parse_xml import parse_raw_specs
from helpers.materials import parse_materials
from helpers.origin import clean_origin_name
from helpers.dimensions import parse_dimensions
from helpers.weight import parse_weight, dimensional_weight
from helpers.parsers.parse_xml import parse_raw_specs
from helpers.parsers.materials import parse_materials
from helpers.parsers.origin import clean_origin_name
from helpers.parsers.dimensions import parse_dimensions
from helpers.parsers.weight import parse_weight, dimensional_weight
class CleanRow(TypedDict):

View File

@@ -6,7 +6,7 @@ import logging
from typing import Optional
import re
from helpers.misc import convert_units
from helpers.parsers.misc import convert_units
UNIT_CONVERSIONS = {"pounds": 453.592, "ounces": 28.3495, "g": 1, "kg": 1000}

View File

@@ -0,0 +1,19 @@
"""Module containing necessary functionality to write to the PostgreSQL sink"""
import logging
import apache_beam as beam
from helpers.parsers import parse_row
class ProcessRows(beam.DoFn):
"""DoFn to process and parse rows from the input file into structured
dictionaries"""
# pylint: disable=abstract-method,arguments-differ
def process(self, element):
if (row := parse_row(element)) is not None:
yield row
else:
logging.warning("could not successfully parse this row: %s", element)
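Unlike the `beam.Map(parse_row)` step this replaces (see the main-module diff below), a DoFn may yield zero elements, so rows that `parse_row` cannot handle are logged and dropped rather than sending a `None` element downstream. A rough sketch of that behaviour, using a made-up row dict purely for illustration:

from helpers import ProcessRows

# Calling the DoFn directly, as a unit test might: rows parse_row accepts are
# yielded as dicts; rejected rows produce no output, only a log warning.
parsed = list(ProcessRows().process({"product": "desk"}))  # hypothetical keys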

View File

@@ -0,0 +1,28 @@
"""Module containing ReadFromCsv DoFn to create a PTransform to read from a CSV
input file"""
import io
import logging
import csv
import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
class ReadFromCsv(beam.DoFn):
"""This custom DoFn will read from a CSV file and yield each row as a
dictionary where the column names are the keys and the cells are the values
"""
# pylint: disable=abstract-method,arguments-differ
def process(self, element):
logging.info("reading from input file: %s", element)
with FileSystems.open(element) as file:
text_wrapper = io.TextIOWrapper(file)
reader = csv.reader(text_wrapper)
try:
header = next(reader)
except StopIteration:
return
for row in reader:
yield dict(zip(header, row))
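The element this DoFn receives is a file path (the main module creates it with `beam.Create([opts.input])`), and each output is one dict per data row, keyed by the header. A quick sketch of the expected shape, assuming a hypothetical local products.csv (FileSystems.open can also handle remote paths when the matching extras are installed):

from helpers import ReadFromCsv

for row in ReadFromCsv().process("products.csv"):  # hypothetical file path
    print(row)  # e.g. {"name": "...", "origin": "...", ...}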

View File

@@ -1,36 +1,11 @@
"""Module containing the IO parts of the pipeline"""
"""Module containing necessary functionality to write to the PostgreSQL sink"""
#!/usr/bin/env python
import io
import logging
import csv
from typing import Dict
import apache_beam as beam
import psycopg2
from apache_beam.io.filesystems import FileSystems
class ReadFromCsv(beam.DoFn):
"""This custom DoFn will read from a CSV file and yield each row as a
dictionary where the row names are the keys and the cells are the values
"""
# pylint: disable=abstract-method,arguments-differ
def process(self, element):
logging.info("reading from input file: %s", element)
with FileSystems.open(element) as file:
text_wrapper = io.TextIOWrapper(file)
reader = csv.reader(text_wrapper)
try:
header = next(reader)
except StopIteration:
return
for row in reader:
yield dict(zip(header, row))
class WriteToPostgreSQL(beam.DoFn):
"""DoFn to write elements to a PostgreSQL database"""
@@ -68,8 +43,13 @@ class WriteToPostgreSQL(beam.DoFn):
def process(self, element):
if self.connection is not None:
cursor = self.connection.cursor()
logging.info(
"inserting the following element into the database: %s", element
)
self.execute_insert(element, cursor)
cursor.close()
else:
logging.error("something went wrong with the connection to postresql")
def teardown(self):
if self.connection is not None:

View File

@@ -6,11 +6,9 @@ database
import logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from helpers.data_io import ReadFromCsv, UpsertProductsToPg
from helpers.parse_row import parse_row
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows
class SustainabilityScoreOptions(PipelineOptions):
@@ -47,7 +45,7 @@ def main():
pipeline \
| beam.Create([opts.input]) \
| beam.ParDo(ReadFromCsv()) \
| beam.Map(parse_row) \
| beam.ParDo(ProcessRows()) \
| beam.ParDo(UpsertProductsToPg(
connection_details=pg_connection_details,
table=opts.pg_table,

View File

@@ -1,8 +1,8 @@
"""Test the `convert_units`"""
from helpers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
from helpers.weight import UNIT_CONVERSIONS as weight_unit_conversions
from helpers.misc import convert_units
from helpers.parsers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
from helpers.parsers.weight import UNIT_CONVERSIONS as weight_unit_conversions
from helpers.parsers.misc import convert_units
def test_units_to_cm_inches():

View File

@@ -1,6 +1,6 @@
"""Test the `parse_dimensions` function and its helpers"""
from helpers.dimensions import parse_dimensions, parse_dimensions_measure
from helpers.parsers.dimensions import parse_dimensions, parse_dimensions_measure
def test_none():

View File

@@ -1,6 +1,10 @@
"""Test the `parse_materials` function and its helpers"""
from helpers.materials import parse_materials, clean_material_name, material_classifier
from helpers.parsers.materials import (
parse_materials,
clean_material_name,
material_classifier,
)
def test_none():

View File

@@ -1,6 +1,6 @@
"""Test the `clean_material_name`"""
from helpers.origin import clean_origin_name
from helpers.parsers.origin import clean_origin_name
def test_none():
@@ -46,4 +46,3 @@ def test_clean_origin_name5():
def test_clean_origin_name6():
"""Test a sample input for clean_origin_name"""
assert clean_origin_name(" made in the USA or imported") == "mixed"

View File

@@ -2,7 +2,7 @@
import xml.etree.ElementTree as ET
from helpers.parse_xml import parse_raw_specs, iter_parse
from helpers.parsers.parse_xml import parse_raw_specs, iter_parse
def test_parse_raw_specs0():

View File

@@ -1,6 +1,6 @@
"""Test the `parse_weight` and `dimensional_weight`"""
from helpers.weight import parse_weight, dimensional_weight
from helpers.parsers.weight import parse_weight, dimensional_weight
def test_parse_weight_none():