Compare commits

...

10 Commits

Author SHA1 Message Date
Ricard Illa e9446f42f3 feat: added a few more plots to notebooks 2023-06-26 13:03:06 +02:00
Ricard Illa 39d279f089 refactor: some cleanup on etl's code structure 2023-06-26 12:36:19 +02:00
Ricard Illa b28ddc350d doc: added readme to etl 2023-06-26 12:09:50 +02:00
Ricard Illa 4ba29e7e1d feat: postgresql port doesn't need to be exposed 2023-06-26 11:57:50 +02:00
Ricard Illa 0dd81715d4 feat: ignore dbt/.user.yaml 2023-06-26 11:54:17 +02:00
Ricard Illa 543004b51c feat: added pylint and pytest for etl 2023-06-26 11:53:40 +02:00
Ricard Illa ac1101ae96 fix: fix dbt writeable directories permissions 2023-06-26 10:04:53 +02:00
Ricard Illa 7d898c6297 fix: airflow's BashOperator's cwd cannot be templated 2023-06-26 10:04:28 +02:00
Ricard Illa 06c76f0b65 docs: added dbt readme 2023-06-26 08:51:54 +02:00
Ricard Illa 743c57a0d1 feat: minor changes 2023-06-26 08:49:16 +02:00
35 changed files with 598 additions and 174 deletions

3
.gitignore vendored
View File

@@ -5,9 +5,12 @@ state
.direnv
venv
.installed_deps
.img_name
*.egg-info
__pycache__
.terraform
.terraform.lock.hcl
dbt/.user.yml

View File

@@ -3,6 +3,7 @@
set -xe
sudo chown airflow:airflow airflow
sudo chown airflow:airflow /dbt/logs /dbt/target
airflow db init
# Allow non-authenticated access to UI for Airflow 2.*

View File

@@ -16,8 +16,11 @@ The following parameters are available:
* `input`: location of the CSV input file
* `beam_etl_path`: location of the apache beam pipeline
* `dbt_path`: location of the dbt project
* `products_table`: products table name
I decided not to make the rest of the table locations configurable because they
make more sense defined in DBT.
Ideally, I would parametrize the dbt path as well, but the `cwd` parameter of
`BashOperator` is not a templated field, and implementing that is not worth it
for such a minor improvement.
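
For illustration, here is a minimal sketch of how these parameters might be wired into the DAG. The task ids and the example input path are assumptions; only the parameter keys above and the Beam flags from `etl/README.md` come from this change set.

```python
# Hedged sketch, not the repository's actual DAG file.
from airflow import DAG
from airflow.operators.bash import BashOperator

CONFIG = {
    "input": "/home/airflow/gcs/data/products.csv",  # assumed example value
    "beam_etl_path": "/etl/main.py",
    "dbt_path": "/dbt",
    "products_table": "sustainability_score.products",
}

with DAG("sustainability_score", params=CONFIG) as dag:
    # `bash_command` is a templated field, so the configurable values can be
    # injected through Jinja at render time.
    beam_etl = BashOperator(
        task_id="beam_etl",
        bash_command=(
            "python {{ params.beam_etl_path }} "
            "--runner=DirectRunner "
            "--input={{ params.input }} "
            "--pg_table={{ params.products_table }}"
        ),
    )
    # `cwd`, by contrast, is a plain constructor argument and is not templated,
    # which is why the dbt path stays hard-coded instead of being parametrized.
    dbt_run = BashOperator(
        task_id="dbt_run",
        bash_command="dbt run",
        cwd=CONFIG["dbt_path"],
    )
    beam_etl >> dbt_run
```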

View File

@@ -24,7 +24,6 @@ CONFIG = {
"input": f"{ HOME }/gcs/data/{ CSV_FNAME }",
"beam_etl_path": "/etl/main.py",
"products_table": "sustainability_score.products",
"scored_table": "sustainability_score.scored_products",
}

7
dbt/README.md Normal file
View File

@@ -0,0 +1,7 @@
# Sustainability score
These DBT models compute the product scores from the products source table.
The `scored_products` incremental model is built from the `product` source,
using the `material_lookup` and `origin_lookup` lookup tables as helpers.

View File

@@ -10,7 +10,7 @@ x-airflow-common:
- ./state/dbt-data/target:/dbt/target
- ./dags:/home/airflow/airflow/dags
- ./data:/home/airflow/gcs/data:ro
- ./etl:/etl:ro
- ./etl/src:/etl:ro
- ./dbt:/dbt
environment:
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
@@ -43,8 +43,6 @@ services:
postgres:
image: postgres:15.3-alpine
restart: "unless-stopped"
ports:
- 5432:5432
volumes:
- ./state/postgres/data:/var/lib/postgresql/data
environment:

9
etl/Dockerfile Normal file
View File

@@ -0,0 +1,9 @@
FROM python:3.8-slim
COPY requirements.txt /requirements.txt
COPY dev-requirements.txt /dev-requirements.txt
RUN pip install --upgrade pip && \
pip install -r /requirements.txt -r /dev-requirements.txt
WORKDIR /src

54
etl/Makefile Normal file
View File

@@ -0,0 +1,54 @@
.PHONY: clean build_img
PYTHON_VERSION = 3.8
PYTHON_SUBVERSION = 3.8.12
PYENV_VERSIONS = $(HOME)/.pyenv/versions
PYTHON_BIN = $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION)
VENV = venv
PYTHON_VENV = $(VENV)/bin/python
PIP = $(PYTHON_VENV) -m pip
PIP_COMPILE = venv/bin/pip-compile
IMG_NAME=etl_beam_env
all: build_img .installed_deps
build_img: .img_name
.img_name: Dockerfile requirements.txt dev-requirements.txt
docker build -t $(IMG_NAME) .
echo $(IMG_NAME) > $@
.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV)
$(PIP) install \
-r requirements.txt \
-r dev-requirements.txt
touch $@
requirements.txt: pyproject.toml $(PIP_COMPILE)
$(PIP_COMPILE) \
--resolver=backtracking \
--output-file $@ \
$<
dev-requirements.txt: pyproject.toml $(PIP_COMPILE)
$(PIP_COMPILE) \
--extra=dev \
--resolver=backtracking \
--output-file $@ \
$<
$(PIP_COMPILE): $(PYTHON_VENV)
$(PIP) install pip-tools
$(PYTHON_VENV): $(PYTHON_BIN)
virtualenv --python=$^ $(VENV)
$(PIP) install --upgrade pip
$(PYTHON_BIN):
pyenv install $(PYTHON_SUBVERSION)
clean:
rm -rf *.egg-info venv .installed_deps

47
etl/README.md Normal file
View File

@@ -0,0 +1,47 @@
This is the ETL pipeline that reads elements from a CSV file, parses/cleans
them up, and inserts them into a PostgreSQL database.
It has been tested only with DirectRunner, but it could easily be moved to run
on Dataflow.
## Running
This is intended to be scheduled by Airflow, but if the necessary packages are
available it can also be run manually with:
```sh
python3 /etl/main.py \
--runner=DirectRunner \
--input="$CSV_INPUT_FILE" \
--pg_hostname="$PG_HOSTNAME" \
--pg_port="$PG_PORT" \
--pg_username="$PG_USERNAME" \
--pg_password="$PG_PASSWORD" \
--pg_database="$PG_DATABASE" \
--pg_table="$PG_TABLE"
```
## Testing and linting
To help with development and testing, a `Dockerfile`, a `Makefile`, and a
`justfile` are also provided.
The `Makefile` provides a mechanism to
* automate the generation of `dev-requirements.txt` and `requirements.txt` out
of `pyproject.toml`
* automate the creation of a python virtual environment which contains the
right python version (installed by pyenv) and the packages defined in `pyproject.toml`
* automate the building of an OCI image with the necessary dependencies
The provided `Dockerfile` is used to build an image with the necessary packages
to run `pytest` and `pylint`.
The `justfile` defines the commands to run `pytest` and `pylint` from a
container.
If [`just`](https://github.com/casey/just) is installed, `pytest` and `pylint` can be run like so:
```sh
just test
just lint
```

View File

@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
@@ -208,7 +208,7 @@ protobuf==4.23.3
# grpc-google-iam-v1
# grpcio-status
# proto-plus
psycopg2==2.9.6
psycopg2-binary==2.9.6
# via beam-etl (pyproject.toml)
pyarrow==11.0.0
# via apache-beam
@@ -275,6 +275,8 @@ typing-extensions==4.6.3
# via
# apache-beam
# astroid
# black
# pylint
tzdata==2023.3
# via pandas
urllib3==1.26.16

16
etl/justfile Normal file
View File

@@ -0,0 +1,16 @@
build_img:
make build_img
test: build_img
docker run \
-v $(pwd)/src:/src \
--rm \
-it $(cat .img_name) \
python -m pytest
lint: build_img
docker run \
-v $(pwd)/src:/src \
--rm \
-it $(cat .img_name) \
pylint --init-hook "import sys; sys.path.append('/src')" /src

View File

@@ -1,7 +1,7 @@
[project]
name = "beam_etl"
version = "0.1"
dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2"]
dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2-binary"]
[project.optional-dependencies]
dev = ["pytest", "pylint", "black"]

View File

@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
@@ -177,7 +177,7 @@ protobuf==4.23.3
# grpc-google-iam-v1
# grpcio-status
# proto-plus
psycopg2==2.9.6
psycopg2-binary==2.9.6
# via beam-etl (pyproject.toml)
pyarrow==11.0.0
# via apache-beam

0
etl/src/__init__.py Normal file
View File

View File

@@ -0,0 +1,5 @@
"""Helper classes for the pipeline"""
from helpers.upsert_products_to_pg import UpsertProductsToPg
from helpers.read_from_csv import ReadFromCsv
from helpers.process_rows import ProcessRows

View File

@@ -0,0 +1,4 @@
"""Helper parser functions to extract and clean data from the input CSV file
Only `parse_row` needs to be exported."""
from helpers.parsers.parse_row import parse_row

View File

@@ -6,7 +6,7 @@ import logging
from typing import Dict, Optional
import re
from helpers.misc import convert_units
from helpers.parsers.misc import convert_units
UNIT_CONVERSIONS = {"inches": 2.54, "feet": 30.48, "cm": 1}

View File

@@ -4,11 +4,11 @@ the destination database"""
import logging
from typing import TypedDict, Dict, Optional, List
from helpers.parse_xml import parse_raw_specs
from helpers.materials import parse_materials
from helpers.origin import clean_origin_name
from helpers.dimensions import parse_dimensions
from helpers.weight import parse_weight, dimensional_weight
from helpers.parsers.parse_xml import parse_raw_specs
from helpers.parsers.materials import parse_materials
from helpers.parsers.origin import clean_origin_name
from helpers.parsers.dimensions import parse_dimensions
from helpers.parsers.weight import parse_weight, dimensional_weight
class CleanRow(TypedDict):

View File

@@ -6,7 +6,7 @@ import logging
from typing import Optional
import re
from helpers.misc import convert_units
from helpers.parsers.misc import convert_units
UNIT_CONVERSIONS = {"pounds": 453.592, "ounces": 28.3495, "g": 1, "kg": 1000}

View File

@@ -0,0 +1,19 @@
"""Module containing necessary functionality to write to the PostgreSQL sink"""
import logging
import apache_beam as beam
from helpers.parsers import parse_row
class ProcessRows(beam.DoFn):
"""DoFn to process and parse rows from the input file into structured
dictionaries"""
# pylint: disable=abstract-method,arguments-differ
def process(self, element):
if (row := parse_row(element)) is not None:
yield row
else:
logging.warning("could not successfully parse this row: %s", element)

View File

@@ -0,0 +1,28 @@
"""Module containing ReadFromCsv DoFn to create a PTransform to read from a CSV
input file"""
import io
import logging
import csv
import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
class ReadFromCsv(beam.DoFn):
"""This custom DoFn will read from a CSV file and yield each row as a
dictionary where the row names are the keys and the cells are the values
"""
# pylint: disable=abstract-method,arguments-differ
def process(self, element):
logging.info("reading from input file: %s", element)
with FileSystems.open(element) as file:
text_wrapper = io.TextIOWrapper(file)
reader = csv.reader(text_wrapper)
try:
header = next(reader)
except StopIteration:
return
for row in reader:
yield dict(zip(header, row))
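Since this change set adds `pytest` for the ETL, a DoFn like `ReadFromCsv` can be exercised with Beam's testing utilities. The following is a hedged sketch of such a test, not one of the tests included in this diff; the column names are only examples.

```python
# Hypothetical test sketch for ReadFromCsv.
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

from helpers import ReadFromCsv


def test_read_from_csv(tmp_path):
    # Write a tiny CSV fixture and check that each row comes out as a dict
    # keyed by the header row.
    csv_file = tmp_path / "products.csv"
    csv_file.write_text("tcin,origin\n123,made in the USA\n")

    with TestPipeline() as pipeline:
        rows = (
            pipeline
            | beam.Create([str(csv_file)])
            | beam.ParDo(ReadFromCsv())
        )
        assert_that(rows, equal_to([{"tcin": "123", "origin": "made in the USA"}]))
```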

View File

@@ -1,56 +1,34 @@
"""Module containing the IO parts of the pipeline"""
"""Module containing necessary functionality to write to the PostgreSQL sink"""
#!/usr/bin/env python
import io
import logging
import csv
from typing import Dict
import apache_beam as beam
import psycopg2
from apache_beam.io.filesystems import FileSystems
class ReadFromCsv(beam.DoFn):
"""This custom DoFn will read from a CSV file and yield each row as a
dictionary where the row names are the keys and the cells are the values
"""
def process(self, in_file):
logging.info("reading from input file: %s", in_file)
with FileSystems.open(in_file) as file:
text_wrapper = io.TextIOWrapper(file)
reader = csv.reader(text_wrapper)
try:
header = next(reader)
except StopIteration:
return
for row in reader:
yield dict(zip(header, row))
class WriteToPostgreSQL(beam.DoFn):
"""DoFn to write elements to a PostgreSQL database"""
def __init__(
self, hostname, port, username, password, database, table, table_key=None
):
self.connection_details = {
"host": hostname,
"port": port,
"user": username,
"password": password,
"database": database,
}
# pylint: disable=abstract-method
def __init__(self, connection_details: Dict[str, str], table, table_key=None):
# pylint: disable=super-init-not-called
self.connection_details = connection_details
self.table = table
self.table_key = table_key
self.connection = None
def setup(self):
self.connection = psycopg2.connect(**self.connection_details)
self.connection.autocommit = True
def execute_insert(self, row, cursor):
def execute_insert(self, row: Dict, cursor):
"""Given a dictionary reporesenting a row and a postgresql cursor,
insert the row into the database so that the dict keys are the colum
names and the dict values the cell values.
If a table_key is specified (`self.table_key` is set) handle conflicts
by doing nothing"""
colnames = ",".join(row.keys())
values = ",".join(["%s"] * len(row))
sql = f"""
@@ -61,19 +39,33 @@ class WriteToPostgreSQL(beam.DoFn):
sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
cursor.execute(sql, list(row.values()))
# pylint: disable=arguments-differ
def process(self, element):
cursor = self.connection.cursor()
self.execute_insert(element, cursor)
cursor.close()
if self.connection is not None:
cursor = self.connection.cursor()
logging.info(
"inserting the following element into the database: %s", element
)
self.execute_insert(element, cursor)
cursor.close()
else:
logging.error("something went wrong with the connection to postresql")
def teardown(self):
self.connection.close()
if self.connection is not None:
self.connection.close()
class UpsertProductsToPg(WriteToPostgreSQL):
"""DoFn to write products to PostgreSQL with our upsert logic"""
# pylint: disable=abstract-method
def execute_insert(self, row, cursor):
"""Our upsert logic is the following:
When any of primary_category, materials, packaging, origin, weight, height,
depth or width has changed, update the element and set the ingestion_time
value to the current timestamp
"""
colnames = ",".join(row.keys())
values = ",".join(["%s"] * len(row))
sql = f"""
@@ -87,12 +79,18 @@ class UpsertProductsToPg(WriteToPostgreSQL):
packaging = EXCLUDED.packaging,
origin = EXCLUDED.origin,
weight = EXCLUDED.weight,
height = EXCLUDED.height,
depth = EXCLUDED.depth,
width = EXCLUDED.width,
ingestion_time = NOW()::TIMESTAMP
WHERE
{ self.table }.primary_category != EXCLUDED.primary_category OR
{ self.table }.materials != EXCLUDED.materials OR
{ self.table }.packaging != EXCLUDED.packaging OR
{ self.table }.origin != EXCLUDED.origin OR
{ self.table }.weight != EXCLUDED.weight
{ self.table }.weight != EXCLUDED.weight OR
{ self.table }.height != EXCLUDED.height OR
{ self.table }.depth != EXCLUDED.depth OR
{ self.table }.width != EXCLUDED.width
"""
cursor.execute(sql, list(row.values()))

View File

@@ -1,16 +1,14 @@
#!/usr/bin/env python
"""This Apache Beam pipeline reads rows as elements from a CSV input file,
extracts and parses relevant values, and upserts the elements to a PostgreSQL
database
"""
import logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from helpers.data_io import ReadFromCsv, UpsertProductsToPg
from helpers.parse_row import parse_row
# def __init__(self, hostname, port, username, password, database):
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows
class SustainabilityScoreOptions(PipelineOptions):
@@ -33,18 +31,23 @@ def main():
beam_options = PipelineOptions()
opts = beam_options.view_as(SustainabilityScoreOptions)
pg_connection_details = {
"host": opts.pg_hostname,
"port": opts.pg_port,
"user": opts.pg_username,
"password": opts.pg_password,
"database": opts.pg_database,
}
with beam.Pipeline(options=beam_options) as pipeline:
# fmt: off
# pylint: disable=expression-not-assigned
pipeline \
| beam.Create([opts.input]) \
| beam.ParDo(ReadFromCsv()) \
| beam.Map(parse_row) \
| beam.ParDo(ProcessRows()) \
| beam.ParDo(UpsertProductsToPg(
hostname=opts.pg_hostname,
port=opts.pg_port,
username=opts.pg_username,
password=opts.pg_password,
database=opts.pg_database,
connection_details=pg_connection_details,
table=opts.pg_table,
table_key="tcin",
))
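The body of `SustainabilityScoreOptions` is elided by the hunk above; judging from the flags listed in `etl/README.md`, it presumably registers the custom arguments roughly as follows. This is a sketch under that assumption, not the file's actual contents.

```python
# Sketch only; the help strings and the lack of defaults are assumptions.
from apache_beam.options.pipeline_options import PipelineOptions


class SustainabilityScoreOptions(PipelineOptions):
    """Custom options consumed by the sustainability-score ETL pipeline."""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument("--input", help="location of the CSV input file")
        parser.add_argument("--pg_hostname", help="PostgreSQL hostname")
        parser.add_argument("--pg_port", help="PostgreSQL port")
        parser.add_argument("--pg_username", help="PostgreSQL username")
        parser.add_argument("--pg_password", help="PostgreSQL password")
        parser.add_argument("--pg_database", help="PostgreSQL database name")
        parser.add_argument("--pg_table", help="destination table for upserted products")
```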

View File

@@ -1,8 +1,8 @@
"""Test the `convert_units`"""
from helpers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
from helpers.weight import UNIT_CONVERSIONS as weight_unit_conversions
from helpers.misc import convert_units
from helpers.parsers.dimensions import UNIT_CONVERSIONS as dimension_unit_conversions
from helpers.parsers.weight import UNIT_CONVERSIONS as weight_unit_conversions
from helpers.parsers.misc import convert_units
def test_units_to_cm_inches():

View File

@@ -1,6 +1,6 @@
"""Test the `parse_dimensions` function and its helpers"""
from helpers.dimensions import parse_dimensions, parse_dimensions_measure
from helpers.parsers.dimensions import parse_dimensions, parse_dimensions_measure
def test_none():

View File

@@ -1,6 +1,10 @@
"""Test the `parse_materials` function and its helpers"""
from helpers.materials import parse_materials, clean_material_name, material_classifier
from helpers.parsers.materials import (
parse_materials,
clean_material_name,
material_classifier,
)
def test_none():

View File

@@ -1,6 +1,6 @@
"""Test the `clean_material_name`"""
from helpers.origin import clean_origin_name
from helpers.parsers.origin import clean_origin_name
def test_none():
@@ -46,4 +46,3 @@ def test_clean_origin_name5():
def test_clean_origin_name6():
"""Test a sample input for clean_origin_name"""
assert clean_origin_name(" made in the USA or imported") == "mixed"

View File

@@ -2,7 +2,7 @@
import xml.etree.ElementTree as ET
from helpers.parse_xml import parse_raw_specs, iter_parse
from helpers.parsers.parse_xml import parse_raw_specs, iter_parse
def test_parse_raw_specs0():

View File

@@ -1,6 +1,6 @@
"""Test the `parse_weight` and `dimensional_weight`"""
from helpers.weight import parse_weight, dimensional_weight
from helpers.parsers.weight import parse_weight, dimensional_weight
def test_parse_weight_none():

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long