refactor: some cleanup on etl's code structure
parent
b28ddc350d
commit
39d279f089
|
@ -0,0 +1,5 @@
|
||||||
|
"""Helper classes for the pipeline"""
|
||||||
|
|
||||||
|
from helpers.upsert_products_to_pg import UpsertProductsToPg
|
||||||
|
from helpers.read_from_csv import ReadFromCsv
|
||||||
|
from helpers.process_rows import ProcessRows
|
|
@ -0,0 +1,4 @@
|
||||||
|
"""Helper parser functions to extract and clean data from the input CSV file
|
||||||
|
Only `parse_row` needs to be exported."""
|
||||||
|
|
||||||
|
from helpers.parsers.parse_row import parse_row
|
|
@ -6,7 +6,7 @@ import logging
|
||||||
from typing import Dict, Optional
|
from typing import Dict, Optional
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from helpers.misc import convert_units
|
from helpers.parsers.misc import convert_units
|
||||||
|
|
||||||
|
|
||||||
UNIT_CONVERSIONS = {"inches": 2.54, "feet": 30.48, "cm": 1}
|
UNIT_CONVERSIONS = {"inches": 2.54, "feet": 30.48, "cm": 1}
|
|
@ -4,11 +4,11 @@ the destination database"""
|
||||||
import logging
|
import logging
|
||||||
from typing import TypedDict, Dict, Optional, List
|
from typing import TypedDict, Dict, Optional, List
|
||||||
|
|
||||||
from helpers.parse_xml import parse_raw_specs
|
from helpers.parsers.parse_xml import parse_raw_specs
|
||||||
from helpers.materials import parse_materials
|
from helpers.parsers.materials import parse_materials
|
||||||
from helpers.origin import clean_origin_name
|
from helpers.parsers.origin import clean_origin_name
|
||||||
from helpers.dimensions import parse_dimensions
|
from helpers.parsers.dimensions import parse_dimensions
|
||||||
from helpers.weight import parse_weight, dimensional_weight
|
from helpers.parsers.weight import parse_weight, dimensional_weight
|
||||||
|
|
||||||
|
|
||||||
class CleanRow(TypedDict):
|
class CleanRow(TypedDict):
|
|
@ -6,7 +6,7 @@ import logging
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from helpers.misc import convert_units
|
from helpers.parsers.misc import convert_units
|
||||||
|
|
||||||
|
|
||||||
UNIT_CONVERSIONS = {"pounds": 453.592, "ounces": 28.3495, "g": 1, "kg": 1000}
|
UNIT_CONVERSIONS = {"pounds": 453.592, "ounces": 28.3495, "g": 1, "kg": 1000}
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""Module containing the ProcessRows DoFn, which parses rows from the input file"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import apache_beam as beam
|
||||||
|
|
||||||
|
from helpers.parsers import parse_row
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessRows(beam.DoFn):
    """Beam DoFn that runs `parse_row` on every incoming element.

    Elements for which `parse_row` returns something other than None are
    emitted downstream; all other elements are dropped after a warning is
    logged.
    """

    # pylint: disable=abstract-method,arguments-differ
    def process(self, element):
        """Parse a single element and emit the result when parsing succeeds.

        Args:
            element: The raw row handed to `parse_row`.

        Yields:
            The value returned by `parse_row`, when it is not None.
        """
        parsed = parse_row(element)
        if parsed is None:
            # Nothing usable came back: report the offending row and emit
            # nothing for it.
            logging.warning("could not successfully parse this row: %s", element)
            return
        yield parsed
|
|
@ -0,0 +1,28 @@
|
||||||
|
"""Module containing ReadFromCsv DoFn to create a PTransform to read from a CSV
|
||||||
|
input file"""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import csv
|
||||||
|
|
||||||
|
import apache_beam as beam
|
||||||
|
from apache_beam.io.filesystems import FileSystems
|
||||||
|
|
||||||
|
|
||||||
|
class ReadFromCsv(beam.DoFn):
    """DoFn that reads a CSV file and emits one dictionary per data row.

    The first row of the file is treated as the header. Every following row
    is yielded as a dictionary whose keys are the header cells and whose
    values are the cells of that row.
    """

    # pylint: disable=abstract-method,arguments-differ
    def process(self, element):
        """Open the file named by `element` and yield each data row as a dict.

        Args:
            element: Location of the CSV file, passed to `FileSystems.open`.

        Yields:
            One dictionary per row after the header. The pairs are built with
            `zip`, so cells beyond the shorter of header/row are dropped.
        """
        logging.info("reading from input file: %s", element)
        with FileSystems.open(element) as file:
            # The csv module expects its input to be opened with newline=""
            # so that it handles line endings itself; without it, newlines
            # embedded in quoted fields are translated before the reader
            # sees them.
            # NOTE(review): no encoding is given, so the locale default is
            # used -- confirm whether the input should be pinned to UTF-8.
            text_wrapper = io.TextIOWrapper(file, newline="")
            reader = csv.reader(text_wrapper)
            try:
                header = next(reader)
            except StopIteration:
                # Empty file: there is no header row, so nothing to emit.
                return
            for row in reader:
                yield dict(zip(header, row))
|
|
@ -1,36 +1,11 @@
|
||||||
"""Module containing the IO parts of the pipeline"""
|
"""Module containing necessary functionality to write to the PostgreSQL sink"""
|
||||||
|
|
||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
import io
|
|
||||||
import logging
|
import logging
|
||||||
import csv
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
import apache_beam as beam
|
import apache_beam as beam
|
||||||
import psycopg2
|
import psycopg2
|
||||||
|
|
||||||
from apache_beam.io.filesystems import FileSystems
|
|
||||||
|
|
||||||
|
|
||||||
class ReadFromCsv(beam.DoFn):
|
|
||||||
"""This custom DoFn will read from a CSV file and yield each row as a
|
|
||||||
dictionary where the row names are the keys and the cells are the values
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pylint: disable=abstract-method,arguments-differ
|
|
||||||
def process(self, element):
|
|
||||||
logging.info("reading from input file: %s", element)
|
|
||||||
with FileSystems.open(element) as file:
|
|
||||||
text_wrapper = io.TextIOWrapper(file)
|
|
||||||
reader = csv.reader(text_wrapper)
|
|
||||||
try:
|
|
||||||
header = next(reader)
|
|
||||||
except StopIteration:
|
|
||||||
return
|
|
||||||
for row in reader:
|
|
||||||
yield dict(zip(header, row))
|
|
||||||
|
|
||||||
|
|
||||||
class WriteToPostgreSQL(beam.DoFn):
|
class WriteToPostgreSQL(beam.DoFn):
|
||||||
"""DoFn to write elements to a PostgreSQL database"""
|
"""DoFn to write elements to a PostgreSQL database"""
|
||||||
|
@ -68,8 +43,13 @@ class WriteToPostgreSQL(beam.DoFn):
|
||||||
def process(self, element):
|
def process(self, element):
|
||||||
if self.connection is not None:
|
if self.connection is not None:
|
||||||
cursor = self.connection.cursor()
|
cursor = self.connection.cursor()
|
||||||
|
logging.info(
|
||||||
|
"inserting the following element into the database: %s", element
|
||||||
|
)
|
||||||
self.execute_insert(element, cursor)
|
self.execute_insert(element, cursor)
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
else:
|
||||||
|
logging.error("something went wrong with the connection to postresql")
|
||||||
|
|
||||||
def teardown(self):
|
def teardown(self):
|
||||||
if self.connection is not None:
|
if self.connection is not None:
|
|
@ -6,11 +6,9 @@ database
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import apache_beam as beam
|
import apache_beam as beam
|
||||||
|
|
||||||
from apache_beam.options.pipeline_options import PipelineOptions
|
from apache_beam.options.pipeline_options import PipelineOptions
|
||||||
|
|
||||||
from helpers.data_io import ReadFromCsv, UpsertProductsToPg
|
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows
|
||||||
from helpers.parse_row import parse_row
|
|
||||||
|
|
||||||
|
|
||||||
class SustainabilityScoreOptions(PipelineOptions):
|
class SustainabilityScoreOptions(PipelineOptions):
|
||||||
|
@ -47,7 +45,7 @@ def main():
|
||||||
pipeline \
|
pipeline \
|
||||||
| beam.Create([opts.input]) \
|
| beam.Create([opts.input]) \
|
||||||
| beam.ParDo(ReadFromCsv()) \
|
| beam.ParDo(ReadFromCsv()) \
|
||||||
| beam.Map(parse_row) \
|
| beam.ParDo(ProcessRows()) \
|
||||||
| beam.ParDo(UpsertProductsToPg(
|
| beam.ParDo(UpsertProductsToPg(
|
||||||
connection_details=pg_connection_details,
|
connection_details=pg_connection_details,
|
||||||
table=opts.pg_table,
|
table=opts.pg_table,
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
"""Test the `convert_units`"""
|
"""Test the `convert_units`"""
|
||||||
|
|
||||||
from helpers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
|
from helpers.parsers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
|
||||||
from helpers.weight import UNIT_CONVERSIONS as weight_unit_conversions
|
from helpers.parsers.weight import UNIT_CONVERSIONS as weight_unit_conversions
|
||||||
from helpers.misc import convert_units
|
from helpers.parsers.misc import convert_units
|
||||||
|
|
||||||
|
|
||||||
def test_units_to_cm_inches():
|
def test_units_to_cm_inches():
|
|
@ -1,6 +1,6 @@
|
||||||
"""Test the `parse_dimensions` function and its helpers"""
|
"""Test the `parse_dimensions` function and its helpers"""
|
||||||
|
|
||||||
from helpers.dimensions import parse_dimensions, parse_dimensions_measure
|
from helpers.parsers.dimensions import parse_dimensions, parse_dimensions_measure
|
||||||
|
|
||||||
|
|
||||||
def test_none():
|
def test_none():
|
|
@ -1,6 +1,10 @@
|
||||||
"""Test the `parse_materials` function and its helpers"""
|
"""Test the `parse_materials` function and its helpers"""
|
||||||
|
|
||||||
from helpers.materials import parse_materials, clean_material_name, material_classifier
|
from helpers.parsers.materials import (
|
||||||
|
parse_materials,
|
||||||
|
clean_material_name,
|
||||||
|
material_classifier,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_none():
|
def test_none():
|
|
@ -1,6 +1,6 @@
|
||||||
"""Test the `clean_material_name`"""
|
"""Test the `clean_material_name`"""
|
||||||
|
|
||||||
from helpers.origin import clean_origin_name
|
from helpers.parsers.origin import clean_origin_name
|
||||||
|
|
||||||
|
|
||||||
def test_none():
|
def test_none():
|
||||||
|
@ -46,4 +46,3 @@ def test_clean_origin_name5():
|
||||||
def test_clean_origin_name6():
|
def test_clean_origin_name6():
|
||||||
"""Test a sample input for clean_origin_name"""
|
"""Test a sample input for clean_origin_name"""
|
||||||
assert clean_origin_name(" made in the USA or imported") == "mixed"
|
assert clean_origin_name(" made in the USA or imported") == "mixed"
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
|
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from helpers.parse_xml import parse_raw_specs, iter_parse
|
from helpers.parsers.parse_xml import parse_raw_specs, iter_parse
|
||||||
|
|
||||||
|
|
||||||
def test_parse_raw_specs0():
|
def test_parse_raw_specs0():
|
|
@ -1,6 +1,6 @@
|
||||||
"""Test the `parse_weight` and `dimensional_weight`"""
|
"""Test the `parse_weight` and `dimensional_weight`"""
|
||||||
|
|
||||||
from helpers.weight import parse_weight, dimensional_weight
|
from helpers.parsers.weight import parse_weight, dimensional_weight
|
||||||
|
|
||||||
|
|
||||||
def test_parse_weight_none():
|
def test_parse_weight_none():
|
Loading…
Reference in New Issue