Compare commits

...

10 Commits

Author SHA1 Message Date
Ricard Illa e9446f42f3 feat: added a few more plots to notebooks 2023-06-26 13:03:06 +02:00
Ricard Illa 39d279f089 refactor: some cleanup on etl's code structure 2023-06-26 12:36:19 +02:00
Ricard Illa b28ddc350d doc: added readme to etl 2023-06-26 12:09:50 +02:00
Ricard Illa 4ba29e7e1d feat: postgresql port doesn't need to be exposed 2023-06-26 11:57:50 +02:00
Ricard Illa 0dd81715d4 feat: ignore dbt/.user.yaml 2023-06-26 11:54:17 +02:00
Ricard Illa 543004b51c feat: added pylint and pytest for etl 2023-06-26 11:53:40 +02:00
Ricard Illa ac1101ae96 fix: fix dbt writeable directories permissions 2023-06-26 10:04:53 +02:00
Ricard Illa 7d898c6297 fix: airflow's BashOperator's cwd cannot be templated 2023-06-26 10:04:28 +02:00
Ricard Illa 06c76f0b65 docs: added dbt readme 2023-06-26 08:51:54 +02:00
Ricard Illa 743c57a0d1 feat: minor changes 2023-06-26 08:49:16 +02:00
35 changed files with 598 additions and 174 deletions

3
.gitignore vendored
View File

@@ -5,9 +5,12 @@ state
.direnv
venv
.installed_deps
.img_name
*.egg-info
__pycache__
.terraform
.terraform.lock.hcl
dbt/.user.yml

View File

@@ -3,6 +3,7 @@
set -xe
sudo chown airflow:airflow airflow
sudo chown airflow:airflow /dbt/logs /dbt/target
airflow db init
# Allow non-authenticated access to UI for Airflow 2.*

View File

@@ -16,8 +16,11 @@ The following parameters are available:
* `input`: location of the CSV input file
* `beam_etl_path`: location of the apache beam pipeline
* `dbt_path`: location of the dbt project
* `products_table`: products table name
I decided not to make the rest of the table locations configurable because they
make more sense defined in DBT.
Ideally, I would parametrize the dbt path as well, but the `cwd` parameter of
`BashOperator` is not a templated field, and implementing that is not worth it
for such a minor improvement.
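
For illustration, here is a minimal sketch of how these parameters might be wired into the DAG. The task ids and the example input path are assumptions; only the parameter keys above and the Beam flags from `etl/README.md` come from this change set.

```python
# Hedged sketch, not the repository's actual DAG file.
from airflow import DAG
from airflow.operators.bash import BashOperator

CONFIG = {
    "input": "/home/airflow/gcs/data/products.csv",  # assumed example value
    "beam_etl_path": "/etl/main.py",
    "dbt_path": "/dbt",
    "products_table": "sustainability_score.products",
}

with DAG("sustainability_score", params=CONFIG) as dag:
    # `bash_command` is a templated field, so the configurable values can be
    # injected through Jinja at render time.
    beam_etl = BashOperator(
        task_id="beam_etl",
        bash_command=(
            "python {{ params.beam_etl_path }} "
            "--runner=DirectRunner "
            "--input={{ params.input }} "
            "--pg_table={{ params.products_table }}"
        ),
    )
    # `cwd`, by contrast, is a plain constructor argument and is not templated,
    # which is why the dbt path stays hard-coded instead of being parametrized.
    dbt_run = BashOperator(
        task_id="dbt_run",
        bash_command="dbt run",
        cwd=CONFIG["dbt_path"],
    )
    beam_etl >> dbt_run
```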

View File

@@ -24,7 +24,6 @@ CONFIG = {
"input": f"{ HOME }/gcs/data/{ CSV_FNAME }",
"beam_etl_path": "/etl/main.py",
"products_table": "sustainability_score.products",
"scored_table": "sustainability_score.scored_products",
}

7
dbt/README.md Normal file
View File

@@ -0,0 +1,7 @@
# Sustainability score
These DBT models compute the product scores from the products source table.
The `scored_products` incremental model is built from the `product` source,
using the `material_lookup` and `origin_lookup` lookup tables as helpers.

View File

@@ -10,7 +10,7 @@ x-airflow-common:
- ./state/dbt-data/target:/dbt/target
- ./dags:/home/airflow/airflow/dags
- ./data:/home/airflow/gcs/data:ro
- ./etl:/etl:ro
- ./etl/src:/etl:ro
- ./dbt:/dbt
environment:
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
@@ -43,8 +43,6 @@ services:
postgres:
image: postgres:15.3-alpine
restart: "unless-stopped"
ports:
- 5432:5432
volumes:
- ./state/postgres/data:/var/lib/postgresql/data
environment:

9
etl/Dockerfile Normal file
View File

@@ -0,0 +1,9 @@
FROM python:3.8-slim
COPY requirements.txt /requirements.txt
COPY dev-requirements.txt /dev-requirements.txt
RUN pip install --upgrade pip && \
pip install -r /requirements.txt -r /dev-requirements.txt
WORKDIR /src

54
etl/Makefile Normal file
View File

@@ -0,0 +1,54 @@
.PHONY: clean build_img
PYTHON_VERSION = 3.8
PYTHON_SUBVERSION = 3.8.12
PYENV_VERSIONS = $(HOME)/.pyenv/versions
PYTHON_BIN = $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION)
VENV = venv
PYTHON_VENV = $(VENV)/bin/python
PIP = $(PYTHON_VENV) -m pip
PIP_COMPILE = venv/bin/pip-compile
IMG_NAME=etl_beam_env
all: build_img .installed_deps
build_img: .img_name
.img_name: Dockerfile requirements.txt dev-requirements.txt
docker build -t $(IMG_NAME) .
echo $(IMG_NAME) > $@
.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV)
$(PIP) install \
-r requirements.txt \
-r dev-requirements.txt
touch $@
requirements.txt: pyproject.toml $(PIP_COMPILE)
$(PIP_COMPILE) \
--resolver=backtracking \
--output-file $@ \
$<
dev-requirements.txt: pyproject.toml $(PIP_COMPILE)
$(PIP_COMPILE) \
--extra=dev \
--resolver=backtracking \
--output-file $@ \
$<
$(PIP_COMPILE): $(PYTHON_VENV)
$(PIP) install pip-tools
$(PYTHON_VENV): $(PYTHON_BIN)
virtualenv --python=$^ $(VENV)
$(PIP) install --upgrade pip
$(PYTHON_BIN):
pyenv install $(PYTHON_SUBVERSION)
clean:
rm -rf *.egg-info venv .installed_deps

47
etl/README.md Normal file
View File

@@ -0,0 +1,47 @@
This is the ETL pipeline that reads elements from a CSV file, parses/cleans
them up, and inserts them into a PostgreSQL database.
It has been tested only with DirectRunner, but it could easily be moved to run
on Dataflow.
## Running
This is intended to be scheduled by Airflow, but if the necessary packages are
available it can also be run manually with:
```sh
python3 /etl/main.py \
--runner=DirectRunner \
--input="$CSV_INPUT_FILE" \
--pg_hostname="$PG_HOSTNAME" \
--pg_port="$PG_PORT" \
--pg_username="$PG_USERNAME" \
--pg_password="$PG_PASSWORD" \
--pg_database="$PG_DATABASE" \
--pg_table="$PG_TABLE"
```
## Testing and linting
To help with development and testing, a `Dockerfile`, a `Makefile`, and a
`justfile` are also provided.
The `Makefile` provides a mechanism to
* automate the generation of `dev-requirements.txt` and `requirements.txt` out
of `pyproject.toml`
* automate the creation of a python virtual environment which contains the
right python version (installed by pyenv) and the packages defined in `pyproject.toml`
* automate the building of an OCI image with the necessary dependencies
The provided `Dockerfile` is used to build an image with the necessary packages
to run `pytest` and `pylint`.
The `justfile` defines the commands to run `pytest` and `pylint` from a
container.
If [`just`](https://github.com/casey/just) is installed, `pytest` and `pylint` can be run like so:
```sh
just test
just lint
```

View File

@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
@@ -208,7 +208,7 @@ protobuf==4.23.3
# grpc-google-iam-v1
# grpcio-status
# proto-plus
psycopg2==2.9.6
psycopg2-binary==2.9.6
# via beam-etl (pyproject.toml)
pyarrow==11.0.0
# via apache-beam
@@ -275,6 +275,8 @@ typing-extensions==4.6.3
# via
# apache-beam
# astroid
# black
# pylint
tzdata==2023.3
# via pandas
urllib3==1.26.16

16
etl/justfile Normal file
View File

@@ -0,0 +1,16 @@
build_img:
make build_img
test: build_img
docker run \
-v $(pwd)/src:/src \
--rm \
-it $(cat .img_name) \
python -m pytest
lint: build_img
docker run \
-v $(pwd)/src:/src \
--rm \
-it $(cat .img_name) \
pylint --init-hook "import sys; sys.path.append('/src')" /src

View File

@@ -1,7 +1,7 @@
[project]
name = "beam_etl"
version = "0.1"
dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2"]
dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2-binary"]
[project.optional-dependencies]
dev = ["pytest", "pylint", "black"]

View File

@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
@@ -177,7 +177,7 @@ protobuf==4.23.3
# grpc-google-iam-v1
# grpcio-status
# proto-plus
psycopg2==2.9.6
psycopg2-binary==2.9.6
# via beam-etl (pyproject.toml)
pyarrow==11.0.0
# via apache-beam

0
etl/src/__init__.py Normal file
View File

View File

@@ -0,0 +1,5 @@
"""Helper classes for the pipeline"""
from helpers.upsert_products_to_pg import UpsertProductsToPg
from helpers.read_from_csv import ReadFromCsv
from helpers.process_rows import ProcessRows

View File

@@ -0,0 +1,4 @@
"""Helper parser functions to extract and clean data from the input CSV file
Only `parse_row` needs to be exported."""
from helpers.parsers.parse_row import parse_row

View File

@@ -6,7 +6,7 @@ import logging
from typing import Dict, Optional
import re
from helpers.misc import convert_units
from helpers.parsers.misc import convert_units
UNIT_CONVERSIONS = {"inches": 2.54, "feet": 30.48, "cm": 1}

View File

@@ -4,11 +4,11 @@ the destination database"""
import logging
from typing import TypedDict, Dict, Optional, List
from helpers.parse_xml import parse_raw_specs
from helpers.materials import parse_materials
from helpers.origin import clean_origin_name
from helpers.dimensions import parse_dimensions
from helpers.weight import parse_weight, dimensional_weight
from helpers.parsers.parse_xml import parse_raw_specs
from helpers.parsers.materials import parse_materials
from helpers.parsers.origin import clean_origin_name
from helpers.parsers.dimensions import parse_dimensions
from helpers.parsers.weight import parse_weight, dimensional_weight
class CleanRow(TypedDict):

View File

@@ -6,7 +6,7 @@ import logging
from typing import Optional
import re
from helpers.misc import convert_units
from helpers.parsers.misc import convert_units
UNIT_CONVERSIONS = {"pounds": 453.592, "ounces": 28.3495, "g": 1, "kg": 1000}

View File

@@ -0,0 +1,19 @@
"""Module containing necessary functionality to write to the PostgreSQL sink"""
import logging
import apache_beam as beam
from helpers.parsers import parse_row
class ProcessRows(beam.DoFn):
"""DoFn to process and parse rows from the input file into structured
dictionaries"""
# pylint: disable=abstract-method,arguments-differ
def process(self, element):
if (row := parse_row(element)) is not None:
yield row
else:
logging.warning("could not successfully parse this row: %s", element)

View File

@@ -0,0 +1,28 @@
"""Module containing ReadFromCsv DoFn to create a PTransform to read from a CSV
input file"""
import io
import logging
import csv
import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
class ReadFromCsv(beam.DoFn):
"""This custom DoFn will read from a CSV file and yield each row as a
dictionary where the row names are the keys and the cells are the values
"""
# pylint: disable=abstract-method,arguments-differ
def process(self, element):
logging.info("reading from input file: %s", element)
with FileSystems.open(element) as file:
text_wrapper = io.TextIOWrapper(file)
reader = csv.reader(text_wrapper)
try:
header = next(reader)
except StopIteration:
return
for row in reader:
yield dict(zip(header, row))
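Since this change set adds `pytest` for the ETL, a DoFn like `ReadFromCsv` can be exercised with Beam's testing utilities. The following is a hedged sketch of such a test, not one of the tests included in this diff; the column names are only examples.

```python
# Hypothetical test sketch for ReadFromCsv.
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

from helpers import ReadFromCsv


def test_read_from_csv(tmp_path):
    # Write a tiny CSV fixture and check that each row comes out as a dict
    # keyed by the header row.
    csv_file = tmp_path / "products.csv"
    csv_file.write_text("tcin,origin\n123,made in the USA\n")

    with TestPipeline() as pipeline:
        rows = (
            pipeline
            | beam.Create([str(csv_file)])
            | beam.ParDo(ReadFromCsv())
        )
        assert_that(rows, equal_to([{"tcin": "123", "origin": "made in the USA"}]))
```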

View File

@@ -1,56 +1,34 @@
"""Module containing the IO parts of the pipeline"""
"""Module containing necessary functionality to write to the PostgreSQL sink"""
#!/usr/bin/env python
import io
import logging
import csv
from typing import Dict
import apache_beam as beam
import psycopg2
from apache_beam.io.filesystems import FileSystems
class ReadFromCsv(beam.DoFn):
"""This custom DoFn will read from a CSV file and yield each row as a
dictionary where the row names are the keys and the cells are the values
"""
def process(self, in_file):
logging.info("reading from input file: %s", in_file)
with FileSystems.open(in_file) as file:
text_wrapper = io.TextIOWrapper(file)
reader = csv.reader(text_wrapper)
try:
header = next(reader)
except StopIteration:
return
for row in reader:
yield dict(zip(header, row))
class WriteToPostgreSQL(beam.DoFn):
"""DoFn to write elements to a PostgreSQL database"""
def __init__(
self, hostname, port, username, password, database, table, table_key=None
):
self.connection_details = {
"host": hostname,
"port": port,
"user": username,
"password": password,
"database": database,
}
# pylint: disable=abstract-method
def __init__(self, connection_details: Dict[str, str], table, table_key=None):
# pylint: disable=super-init-not-called
self.connection_details = connection_details
self.table = table
self.table_key = table_key
self.connection = None
def setup(self):
self.connection = psycopg2.connect(**self.connection_details)
self.connection.autocommit = True
def execute_insert(self, row, cursor):
def execute_insert(self, row: Dict, cursor):
"""Given a dictionary reporesenting a row and a postgresql cursor,
insert the row into the database so that the dict keys are the colum
names and the dict values the cell values.
If a table_key is specified (`self.table_key` is set) handle conflicts
by doing nothing"""
colnames = ",".join(row.keys())
values = ",".join(["%s"] * len(row))
sql = f"""
@@ -61,19 +39,33 @@ class WriteToPostgreSQL(beam.DoFn):
sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
cursor.execute(sql, list(row.values()))
# pylint: disable=arguments-differ
def process(self, element):
cursor = self.connection.cursor()
self.execute_insert(element, cursor)
cursor.close()
if self.connection is not None:
cursor = self.connection.cursor()
logging.info(
"inserting the following element into the database: %s", element
)
self.execute_insert(element, cursor)
cursor.close()
else:
logging.error("something went wrong with the connection to postresql")
def teardown(self):
self.connection.close()
if self.connection is not None:
self.connection.close()
class UpsertProductsToPg(WriteToPostgreSQL):
"""DoFn to write products to PostgreSQL with our upsert logic"""
# pylint: disable=abstract-method
def execute_insert(self, row, cursor):
"""Our upsert logic is the following:
When any of primary_category, materials, packaging, origin, weight, height,
depth or width has changed, update the element and set the ingestion_time
value to the current timestamp
"""
colnames = ",".join(row.keys())
values = ",".join(["%s"] * len(row))
sql = f"""
@@ -87,12 +79,18 @@ class UpsertProductsToPg(WriteToPostgreSQL):
packaging = EXCLUDED.packaging,
origin = EXCLUDED.origin,
weight = EXCLUDED.weight,
height = EXCLUDED.height,
depth = EXCLUDED.depth,
width = EXCLUDED.width,
ingestion_time = NOW()::TIMESTAMP
WHERE
{ self.table }.primary_category != EXCLUDED.primary_category OR
{ self.table }.materials != EXCLUDED.materials OR
{ self.table }.packaging != EXCLUDED.packaging OR
{ self.table }.origin != EXCLUDED.origin OR
{ self.table }.weight != EXCLUDED.weight
{ self.table }.weight != EXCLUDED.weight OR
{ self.table }.height != EXCLUDED.height OR
{ self.table }.depth != EXCLUDED.depth OR
{ self.table }.width != EXCLUDED.width
"""
cursor.execute(sql, list(row.values()))

View File

@@ -1,16 +1,14 @@
#!/usr/bin/env python
"""This Apache Beam pipeline reads rows as elements from a CSV input file,
extracts and parses relevant values, and upserts the elements to a PostgreSQL
database
"""
import logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from helpers.data_io import ReadFromCsv, UpsertProductsToPg
from helpers.parse_row import parse_row
# def __init__(self, hostname, port, username, password, database):
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows
class SustainabilityScoreOptions(PipelineOptions):
@@ -33,18 +31,23 @@ def main():
beam_options = PipelineOptions()
opts = beam_options.view_as(SustainabilityScoreOptions)
pg_connection_details = {
"host": opts.pg_hostname,
"port": opts.pg_port,
"user": opts.pg_username,
"password": opts.pg_password,
"database": opts.pg_database,
}
with beam.Pipeline(options=beam_options) as pipeline:
# fmt: off
# pylint: disable=expression-not-assigned
pipeline \
| beam.Create([opts.input]) \
| beam.ParDo(ReadFromCsv()) \
| beam.Map(parse_row) \
| beam.ParDo(ProcessRows()) \
| beam.ParDo(UpsertProductsToPg(
hostname=opts.pg_hostname,
port=opts.pg_port,
username=opts.pg_username,
password=opts.pg_password,
database=opts.pg_database,
connection_details=pg_connection_details,
table=opts.pg_table,
table_key="tcin",
))
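The body of `SustainabilityScoreOptions` is elided by the hunk above; judging from the flags listed in `etl/README.md`, it presumably registers the custom arguments roughly as follows. This is a sketch under that assumption, not the file's actual contents.

```python
# Sketch only; the help strings and the lack of defaults are assumptions.
from apache_beam.options.pipeline_options import PipelineOptions


class SustainabilityScoreOptions(PipelineOptions):
    """Custom options consumed by the sustainability-score ETL pipeline."""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument("--input", help="location of the CSV input file")
        parser.add_argument("--pg_hostname", help="PostgreSQL hostname")
        parser.add_argument("--pg_port", help="PostgreSQL port")
        parser.add_argument("--pg_username", help="PostgreSQL username")
        parser.add_argument("--pg_password", help="PostgreSQL password")
        parser.add_argument("--pg_database", help="PostgreSQL database name")
        parser.add_argument("--pg_table", help="destination table for upserted products")
```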

View File

@@ -1,8 +1,8 @@
"""Test the `convert_units`"""
from helpers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
from helpers.weight import UNIT_CONVERSIONS as weight_unit_conversions
from helpers.misc import convert_units
from helpers.parsers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
from helpers.parsers.weight import UNIT_CONVERSIONS as weight_unit_conversions
from helpers.parsers.misc import convert_units
def test_units_to_cm_inches():

View File

@@ -1,6 +1,6 @@
"""Test the `parse_dimensions` function and its helpers"""
from helpers.dimensions import parse_dimensions, parse_dimensions_measure
from helpers.parsers.dimensions import parse_dimensions, parse_dimensions_measure
def test_none():

View File

@@ -1,6 +1,10 @@
"""Test the `parse_materials` function and its helpers"""
from helpers.materials import parse_materials, clean_material_name, material_classifier
from helpers.parsers.materials import (
parse_materials,
clean_material_name,
material_classifier,
)
def test_none():

View File

@@ -1,6 +1,6 @@
"""Test the `clean_material_name`"""
from helpers.origin import clean_origin_name
from helpers.parsers.origin import clean_origin_name
def test_none():
@@ -46,4 +46,3 @@ def test_clean_origin_name5():
def test_clean_origin_name6():
"""Test a sample input for clean_origin_name"""
assert clean_origin_name(" made in the USA or imported") == "mixed"

View File

@@ -2,7 +2,7 @@
import xml.etree.ElementTree as ET
from helpers.parse_xml import parse_raw_specs, iter_parse
from helpers.parsers.parse_xml import parse_raw_specs, iter_parse
def test_parse_raw_specs0():

View File

@@ -1,6 +1,6 @@
"""Test the `parse_weight` and `dimensional_weight`"""
from helpers.weight import parse_weight, dimensional_weight
from helpers.parsers.weight import parse_weight, dimensional_weight
def test_parse_weight_none():

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long