refactor: some cleanup on etl's code structure
parent
b28ddc350d
commit
39d279f089
|
@ -0,0 +1,5 @@
|
|||
"""Helper classes for the pipeline"""
|
||||
|
||||
from helpers.upsert_products_to_pg import UpsertProductsToPg
|
||||
from helpers.read_from_csv import ReadFromCsv
|
||||
from helpers.process_rows import ProcessRows
|
|
@ -0,0 +1,4 @@
|
|||
"""Helper parser functions to extract and clean data from the input CSV file
|
||||
Only `parse_row` needs to be exported."""
|
||||
|
||||
from helpers.parsers.parse_row import parse_row
|
|
@ -6,7 +6,7 @@ import logging
|
|||
from typing import Dict, Optional
|
||||
import re
|
||||
|
||||
from helpers.misc import convert_units
|
||||
from helpers.parsers.misc import convert_units
|
||||
|
||||
|
||||
# Per-unit factors for length values. "cm" maps to 1, so these look like
# multipliers that express a value in centimetres -- presumably consumed by
# `convert_units`; confirm against its implementation.
UNIT_CONVERSIONS = {"inches": 2.54, "feet": 30.48, "cm": 1}
|
|
@ -4,11 +4,11 @@ the destination database"""
|
|||
import logging
|
||||
from typing import TypedDict, Dict, Optional, List
|
||||
|
||||
from helpers.parse_xml import parse_raw_specs
|
||||
from helpers.materials import parse_materials
|
||||
from helpers.origin import clean_origin_name
|
||||
from helpers.dimensions import parse_dimensions
|
||||
from helpers.weight import parse_weight, dimensional_weight
|
||||
from helpers.parsers.parse_xml import parse_raw_specs
|
||||
from helpers.parsers.materials import parse_materials
|
||||
from helpers.parsers.origin import clean_origin_name
|
||||
from helpers.parsers.dimensions import parse_dimensions
|
||||
from helpers.parsers.weight import parse_weight, dimensional_weight
|
||||
|
||||
|
||||
class CleanRow(TypedDict):
|
|
@ -6,7 +6,7 @@ import logging
|
|||
from typing import Optional
|
||||
import re
|
||||
|
||||
from helpers.misc import convert_units
|
||||
from helpers.parsers.misc import convert_units
|
||||
|
||||
|
||||
# Per-unit factors for weight values. "g" maps to 1, so these look like
# multipliers that express a value in grams -- presumably consumed by
# `convert_units`; confirm against its implementation.
UNIT_CONVERSIONS = {"pounds": 453.592, "ounces": 28.3495, "g": 1, "kg": 1000}
|
|
@ -0,0 +1,19 @@
|
|||
"""Module containing the ProcessRows DoFn, which parses raw input rows"""
|
||||
|
||||
import logging
|
||||
|
||||
import apache_beam as beam
|
||||
|
||||
from helpers.parsers import parse_row
|
||||
|
||||
|
||||
class ProcessRows(beam.DoFn):
    """DoFn that turns raw rows from the input file into structured
    dictionaries by running each one through `parse_row`.

    Rows for which `parse_row` returns `None` are logged and dropped.
    """

    # pylint: disable=abstract-method,arguments-differ
    def process(self, element):
        """Yield the parsed form of `element`, or nothing if parsing failed."""
        parsed = parse_row(element)
        if parsed is None:
            # Unparseable rows are reported but never emitted downstream.
            logging.warning("could not successfully parse this row: %s", element)
            return
        yield parsed
|
|
@ -0,0 +1,28 @@
|
|||
"""Module containing ReadFromCsv DoFn to create a PTransform to read from a CSV
|
||||
input file"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import csv
|
||||
|
||||
import apache_beam as beam
|
||||
from apache_beam.io.filesystems import FileSystems
|
||||
|
||||
|
||||
class ReadFromCsv(beam.DoFn):
    """Custom DoFn that reads a CSV file and yields each data row as a
    dictionary.

    The first line of the file is used as the header; every following row is
    emitted as a dictionary whose keys are the header's column names and whose
    values are the cells of that row.
    """

    # pylint: disable=abstract-method,arguments-differ
    def process(self, element):
        """Yield one dictionary per data row of the CSV file at `element`.

        `element` is handed to `FileSystems.open`. A file without a header
        line yields nothing.
        """
        logging.info("reading from input file: %s", element)
        with FileSystems.open(element) as file:
            # The csv module requires newline="" so that line breaks embedded
            # in quoted fields reach the parser untranslated.
            # NOTE(review): no explicit encoding is passed, so decoding uses
            # the platform default -- confirm the expected input encoding.
            text_wrapper = io.TextIOWrapper(file, newline="")
            reader = csv.reader(text_wrapper)
            try:
                header = next(reader)
            except StopIteration:
                # Empty file: there is no header, hence no rows to emit.
                return
            for row in reader:
                # zip() stops at the shorter sequence, so cells beyond the
                # header width (or columns missing from a short row) are
                # silently left out of the dictionary.
                yield dict(zip(header, row))
|
|
@ -1,36 +1,11 @@
|
|||
"""Module containing the IO parts of the pipeline"""
|
||||
"""Module containing necessary functionality to write to the PostgreSQL sink"""
|
||||
|
||||
#!/usr/bin/env python
|
||||
|
||||
import io
|
||||
import logging
|
||||
import csv
|
||||
from typing import Dict
|
||||
|
||||
import apache_beam as beam
|
||||
import psycopg2
|
||||
|
||||
from apache_beam.io.filesystems import FileSystems
|
||||
|
||||
|
||||
class ReadFromCsv(beam.DoFn):
    """Custom DoFn that reads a CSV file and yields each data row as a
    dictionary.

    The first line of the file is used as the header; every following row is
    emitted as a dictionary whose keys are the header's column names and whose
    values are the cells of that row.
    """

    # pylint: disable=abstract-method,arguments-differ
    def process(self, element):
        """Yield one dictionary per data row of the CSV file at `element`.

        `element` is handed to `FileSystems.open`. A file without a header
        line yields nothing.
        """
        logging.info("reading from input file: %s", element)
        with FileSystems.open(element) as file:
            # The csv module requires newline="" so that line breaks embedded
            # in quoted fields reach the parser untranslated.
            # NOTE(review): no explicit encoding is passed, so decoding uses
            # the platform default -- confirm the expected input encoding.
            text_wrapper = io.TextIOWrapper(file, newline="")
            reader = csv.reader(text_wrapper)
            try:
                header = next(reader)
            except StopIteration:
                # Empty file: there is no header, hence no rows to emit.
                return
            for row in reader:
                # zip() stops at the shorter sequence, so cells beyond the
                # header width (or columns missing from a short row) are
                # silently left out of the dictionary.
                yield dict(zip(header, row))
|
||||
|
||||
|
||||
class WriteToPostgreSQL(beam.DoFn):
|
||||
"""DoFn to write elements to a PostgreSQL database"""
|
||||
|
@ -68,8 +43,13 @@ class WriteToPostgreSQL(beam.DoFn):
|
|||
def process(self, element):
    """Insert `element` into the database through the open connection.

    When `self.connection` is `None` the element is not written and an error
    is logged instead. Otherwise a cursor is opened, the element is passed to
    `self.execute_insert`, and the cursor is closed afterwards -- including
    when the insert raises, in which case the exception still propagates.
    """
    if self.connection is None:
        logging.error("something went wrong with the connection to postgresql")
        return

    cursor = self.connection.cursor()
    try:
        logging.info(
            "inserting the following element into the database: %s", element
        )
        self.execute_insert(element, cursor)
    finally:
        # Always release the cursor so a failed insert does not leak it.
        cursor.close()
|
||||
|
||||
def teardown(self):
|
||||
if self.connection is not None:
|
|
@ -6,11 +6,9 @@ database
|
|||
import logging
|
||||
|
||||
import apache_beam as beam
|
||||
|
||||
from apache_beam.options.pipeline_options import PipelineOptions
|
||||
|
||||
from helpers.data_io import ReadFromCsv, UpsertProductsToPg
|
||||
from helpers.parse_row import parse_row
|
||||
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows
|
||||
|
||||
|
||||
class SustainabilityScoreOptions(PipelineOptions):
|
||||
|
@ -47,7 +45,7 @@ def main():
|
|||
pipeline \
|
||||
| beam.Create([opts.input]) \
|
||||
| beam.ParDo(ReadFromCsv()) \
|
||||
| beam.Map(parse_row) \
|
||||
| beam.ParDo(ProcessRows()) \
|
||||
| beam.ParDo(UpsertProductsToPg(
|
||||
connection_details=pg_connection_details,
|
||||
table=opts.pg_table,
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
"""Test the `convert_units`"""
|
||||
|
||||
from helpers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
|
||||
from helpers.weight import UNIT_CONVERSIONS as weight_unit_conversions
|
||||
from helpers.misc import convert_units
|
||||
from helpers.parsers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
|
||||
from helpers.parsers.weight import UNIT_CONVERSIONS as weight_unit_conversions
|
||||
from helpers.parsers.misc import convert_units
|
||||
|
||||
|
||||
def test_units_to_cm_inches():
|
|
@ -1,6 +1,6 @@
|
|||
"""Test the `parse_dimensions` function and its helpers"""
|
||||
|
||||
from helpers.dimensions import parse_dimensions, parse_dimensions_measure
|
||||
from helpers.parsers.dimensions import parse_dimensions, parse_dimensions_measure
|
||||
|
||||
|
||||
def test_none():
|
|
@ -1,6 +1,10 @@
|
|||
"""Test the `parse_materials` function and its helpers"""
|
||||
|
||||
from helpers.materials import parse_materials, clean_material_name, material_classifier
|
||||
from helpers.parsers.materials import (
|
||||
parse_materials,
|
||||
clean_material_name,
|
||||
material_classifier,
|
||||
)
|
||||
|
||||
|
||||
def test_none():
|
|
@ -1,6 +1,6 @@
|
|||
"""Test the `clean_origin_name` function"""
|
||||
|
||||
from helpers.origin import clean_origin_name
|
||||
from helpers.parsers.origin import clean_origin_name
|
||||
|
||||
|
||||
def test_none():
|
||||
|
@ -46,4 +46,3 @@ def test_clean_origin_name5():
|
|||
def test_clean_origin_name6():
    """Check that a "USA or imported" origin is cleaned to "mixed"."""
    cleaned = clean_origin_name(" made in the USA or imported")
    assert cleaned == "mixed"
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
from helpers.parse_xml import parse_raw_specs, iter_parse
|
||||
from helpers.parsers.parse_xml import parse_raw_specs, iter_parse
|
||||
|
||||
|
||||
def test_parse_raw_specs0():
|
|
@ -1,6 +1,6 @@
|
|||
"""Test the `parse_weight` and `dimensional_weight`"""
|
||||
|
||||
from helpers.weight import parse_weight, dimensional_weight
|
||||
from helpers.parsers.weight import parse_weight, dimensional_weight
|
||||
|
||||
|
||||
def test_parse_weight_none():
|
Loading…
Reference in New Issue