feat: added pylint and pytest for etl

main
Ricard Illa 2023-06-26 11:53:40 +02:00
parent ac1101ae96
commit 543004b51c
25 changed files with 139 additions and 34 deletions

1
.gitignore vendored
View File

@@ -5,6 +5,7 @@ state
.direnv .direnv
venv venv
.installed_deps .installed_deps
.img_name
*.egg-info *.egg-info
__pycache__ __pycache__

View File

@@ -10,7 +10,7 @@ x-airflow-common:
- ./state/dbt-data/target:/dbt/target - ./state/dbt-data/target:/dbt/target
- ./dags:/home/airflow/airflow/dags - ./dags:/home/airflow/airflow/dags
- ./data:/home/airflow/gcs/data:ro - ./data:/home/airflow/gcs/data:ro
- ./etl:/etl:ro - ./etl/src:/etl:ro
- ./dbt:/dbt - ./dbt:/dbt
environment: environment:
AIRFLOW__CORE__LOAD_EXAMPLES: 'false' AIRFLOW__CORE__LOAD_EXAMPLES: 'false'

9
etl/Dockerfile Normal file
View File

@@ -0,0 +1,9 @@
FROM python:3.8-slim
COPY requirements.txt /requirements.txt
COPY dev-requirements.txt /dev-requirements.txt
RUN pip install --upgrade pip && \
pip install -r /requirements.txt -r /dev-requirements.txt
WORKDIR /src

54
etl/Makefile Normal file
View File

@@ -0,0 +1,54 @@
.PHONY: clean build_img
PYTHON_VERSION = 3.8
PYTHON_SUBVERSION = 3.8.12
PYENV_VERSIONS = $(HOME)/.pyenv/versions
PYTHON_BIN = $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION)
VENV = venv
PYTHON_VENV = $(VENV)/bin/python
PIP = $(PYTHON_VENV) -m pip
PIP_COMPILE = venv/bin/pip-compile
IMG_NAME=etl_beam_env
all: oci_img .installed_deps
build_img: .img_name
.img_name: Dockerfile requirements.txt dev-requirements.txt
docker build -t $(IMG_NAME) .
echo $(IMG_NAME) > $@
.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV)
$(PIP) install \
-r requirements.txt \
-r dev-requirements.txt
touch $@
requirements.txt: pyproject.toml $(PIP_COMPILE)
$(PIP_COMPILE) \
--resolver=backtracking \
--output-file $@ \
$<
dev-requirements.txt: pyproject.toml $(PIP_COMPILE)
$(PIP_COMPILE) \
--extra=dev \
--resolver=backtracking \
--output-file $@ \
$<
$(PIP_COMPILE): $(PYTHON_VENV)
$(PIP) install pip-tools
$(PYTHON_VENV): $(PYTHON_BIN)
virtualenv --python=$^ $(VENV)
$(PIP) install --upgrade pip
$(PYTHON_BIN):
pyenv install $(PYTHON_SUBVERSION)
clean:
rm -rf *.egg-info venv installed_deps

View File

@@ -1,5 +1,5 @@
# #
# This file is autogenerated by pip-compile with Python 3.10 # This file is autogenerated by pip-compile with Python 3.8
# by the following command: # by the following command:
# #
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml # pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
@@ -208,7 +208,7 @@ protobuf==4.23.3
# grpc-google-iam-v1 # grpc-google-iam-v1
# grpcio-status # grpcio-status
# proto-plus # proto-plus
psycopg2==2.9.6 psycopg2-binary==2.9.6
# via beam-etl (pyproject.toml) # via beam-etl (pyproject.toml)
pyarrow==11.0.0 pyarrow==11.0.0
# via apache-beam # via apache-beam
@@ -275,6 +275,8 @@ typing-extensions==4.6.3
# via # via
# apache-beam # apache-beam
# astroid # astroid
# black
# pylint
tzdata==2023.3 tzdata==2023.3
# via pandas # via pandas
urllib3==1.26.16 urllib3==1.26.16

16
etl/justfile Normal file
View File

@@ -0,0 +1,16 @@
build_img:
make build_img
test: build_img
docker run \
-v $(pwd)/src:/src \
--rm \
-it $(cat .img_name) \
python -m pytest
lint: build_img
docker run \
-v $(pwd)/src:/src \
--rm \
-it $(cat .img_name) \
pylint --init-hook "import sys; sys.path.append('/src')" /src

View File

@@ -1,7 +1,7 @@
[project] [project]
name = "beam_etl" name = "beam_etl"
version = "0.1" version = "0.1"
dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2"] dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2-binary"]
[project.optional-dependencies] [project.optional-dependencies]
dev = ["pytest", "pylint", "black"] dev = ["pytest", "pylint", "black"]

View File

@@ -1,5 +1,5 @@
# #
# This file is autogenerated by pip-compile with Python 3.10 # This file is autogenerated by pip-compile with Python 3.8
# by the following command: # by the following command:
# #
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml # pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
@@ -177,7 +177,7 @@ protobuf==4.23.3
# grpc-google-iam-v1 # grpc-google-iam-v1
# grpcio-status # grpcio-status
# proto-plus # proto-plus
psycopg2==2.9.6 psycopg2-binary==2.9.6
# via beam-etl (pyproject.toml) # via beam-etl (pyproject.toml)
pyarrow==11.0.0 pyarrow==11.0.0
# via apache-beam # via apache-beam

0
etl/src/__init__.py Normal file
View File

View File

View File

@@ -5,6 +5,7 @@
import io import io
import logging import logging
import csv import csv
from typing import Dict
import apache_beam as beam import apache_beam as beam
import psycopg2 import psycopg2
@@ -17,9 +18,10 @@ class ReadFromCsv(beam.DoFn):
dictionary where the row names are the keys and the cells are the values dictionary where the row names are the keys and the cells are the values
""" """
def process(self, in_file): # pylint: disable=abstract-method,arguments-differ
logging.info("reading from input file: %s", in_file) def process(self, element):
with FileSystems.open(in_file) as file: logging.info("reading from input file: %s", element)
with FileSystems.open(element) as file:
text_wrapper = io.TextIOWrapper(file) text_wrapper = io.TextIOWrapper(file)
reader = csv.reader(text_wrapper) reader = csv.reader(text_wrapper)
try: try:
@@ -33,24 +35,25 @@
class WriteToPostgreSQL(beam.DoFn): class WriteToPostgreSQL(beam.DoFn):
"""DoFn to write elements to a PostgreSQL database""" """DoFn to write elements to a PostgreSQL database"""
def __init__( # pylint: disable=abstract-method
self, hostname, port, username, password, database, table, table_key=None def __init__(self, connection_details: Dict[str, str], table, table_key=None):
): # pylint: disable=super-init-not-called
self.connection_details = { self.connection_details = connection_details
"host": hostname,
"port": port,
"user": username,
"password": password,
"database": database,
}
self.table = table self.table = table
self.table_key = table_key self.table_key = table_key
self.connection = None
def setup(self): def setup(self):
self.connection = psycopg2.connect(**self.connection_details) self.connection = psycopg2.connect(**self.connection_details)
self.connection.autocommit = True self.connection.autocommit = True
def execute_insert(self, row, cursor): def execute_insert(self, row: Dict, cursor):
"""Given a dictionary representing a row and a postgresql cursor, """Given a dictionary representing a row and a postgresql cursor,
insert the row into the database so that the dict keys are the column insert the row into the database so that the dict keys are the column
names and the dict values the cell values.
If a table_key is specified (`self.table_key` is set) handle conflicts
by doing nothing"""
colnames = ",".join(row.keys()) colnames = ",".join(row.keys())
values = ",".join(["%s"] * len(row)) values = ",".join(["%s"] * len(row))
sql = f""" sql = f"""
@@ -61,19 +64,28 @@ class WriteToPostgreSQL(beam.DoFn):
sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING" sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
cursor.execute(sql, list(row.values())) cursor.execute(sql, list(row.values()))
# pylint: disable=arguments-differ
def process(self, element): def process(self, element):
cursor = self.connection.cursor() if self.connection is not None:
self.execute_insert(element, cursor) cursor = self.connection.cursor()
cursor.close() self.execute_insert(element, cursor)
cursor.close()
def teardown(self): def teardown(self):
self.connection.close() if self.connection is not None:
self.connection.close()
class UpsertProductsToPg(WriteToPostgreSQL): class UpsertProductsToPg(WriteToPostgreSQL):
"""DoFn to write products to PostgreSQL with our upsert logic""" """DoFn to write products to PostgreSQL with our upsert logic"""
# pylint: disable=abstract-method
def execute_insert(self, row, cursor): def execute_insert(self, row, cursor):
"""Our upsert logic is the following:
When any of primary_category, packaging, origin, height, depth or width When any of primary_category, packaging, origin, height, depth or width
has changed, update the element and set the ingestion_time value to the
current timestamp
"""
colnames = ",".join(row.keys()) colnames = ",".join(row.keys())
values = ",".join(["%s"] * len(row)) values = ",".join(["%s"] * len(row))
sql = f""" sql = f"""
@@ -87,12 +99,18 @@ class UpsertProductsToPg(WriteToPostgreSQL):
packaging = EXCLUDED.packaging, packaging = EXCLUDED.packaging,
origin = EXCLUDED.origin, origin = EXCLUDED.origin,
weight = EXCLUDED.weight, weight = EXCLUDED.weight,
height = EXCLUDED.height,
depth = EXCLUDED.depth,
width = EXCLUDED.width,
ingestion_time = NOW()::TIMESTAMP ingestion_time = NOW()::TIMESTAMP
WHERE WHERE
{ self.table }.primary_category != EXCLUDED.primary_category OR { self.table }.primary_category != EXCLUDED.primary_category OR
{ self.table }.materials != EXCLUDED.materials OR { self.table }.materials != EXCLUDED.materials OR
{ self.table }.packaging != EXCLUDED.packaging OR { self.table }.packaging != EXCLUDED.packaging OR
{ self.table }.origin != EXCLUDED.origin OR { self.table }.origin != EXCLUDED.origin OR
{ self.table }.weight != EXCLUDED.weight { self.table }.weight != EXCLUDED.weight OR
{ self.table }.height != EXCLUDED.height OR
{ self.table }.depth != EXCLUDED.depth OR
{ self.table }.width != EXCLUDED.width
""" """
cursor.execute(sql, list(row.values())) cursor.execute(sql, list(row.values()))

View File

@@ -1,4 +1,7 @@
#!/usr/bin/env python """This Apache Beam pipeline reads rows as elements from a CSV input file,
extracts and parses relevant values, and upserts the elements to a PostgreSQL
database
"""
import logging import logging
@@ -10,9 +13,6 @@ from helpers.data_io import ReadFromCsv, UpsertProductsToPg
from helpers.parse_row import parse_row from helpers.parse_row import parse_row
# def __init__(self, hostname, port, username, password, database):
class SustainabilityScoreOptions(PipelineOptions): class SustainabilityScoreOptions(PipelineOptions):
"""Options for this pipeline""" """Options for this pipeline"""
@@ -33,18 +33,23 @@
beam_options = PipelineOptions() beam_options = PipelineOptions()
opts = beam_options.view_as(SustainabilityScoreOptions) opts = beam_options.view_as(SustainabilityScoreOptions)
pg_connection_details = {
"host": opts.pg_hostname,
"port": opts.pg_port,
"user": opts.pg_username,
"password": opts.pg_password,
"database": opts.pg_database,
}
with beam.Pipeline(options=beam_options) as pipeline: with beam.Pipeline(options=beam_options) as pipeline:
# fmt: off # fmt: off
# pylint: disable=expression-not-assigned
pipeline \ pipeline \
| beam.Create([opts.input]) \ | beam.Create([opts.input]) \
| beam.ParDo(ReadFromCsv()) \ | beam.ParDo(ReadFromCsv()) \
| beam.Map(parse_row) \ | beam.Map(parse_row) \
| beam.ParDo(UpsertProductsToPg( | beam.ParDo(UpsertProductsToPg(
hostname=opts.pg_hostname, connection_details=pg_connection_details,
port=opts.pg_port,
username=opts.pg_username,
password=opts.pg_password,
database=opts.pg_database,
table=opts.pg_table, table=opts.pg_table,
table_key="tcin", table_key="tcin",
)) ))