feat: added pylint and pytest for etl
parent
ac1101ae96
commit
543004b51c
|
@ -5,6 +5,7 @@ state
|
||||||
.direnv
|
.direnv
|
||||||
venv
|
venv
|
||||||
.installed_deps
|
.installed_deps
|
||||||
|
.img_name
|
||||||
|
|
||||||
*.egg-info
|
*.egg-info
|
||||||
__pycache__
|
__pycache__
|
||||||
|
|
|
@ -10,7 +10,7 @@ x-airflow-common:
|
||||||
- ./state/dbt-data/target:/dbt/target
|
- ./state/dbt-data/target:/dbt/target
|
||||||
- ./dags:/home/airflow/airflow/dags
|
- ./dags:/home/airflow/airflow/dags
|
||||||
- ./data:/home/airflow/gcs/data:ro
|
- ./data:/home/airflow/gcs/data:ro
|
||||||
- ./etl:/etl:ro
|
- ./etl/src:/etl:ro
|
||||||
- ./dbt:/dbt
|
- ./dbt:/dbt
|
||||||
environment:
|
environment:
|
||||||
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
|
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
FROM python:3.8-slim
|
||||||
|
|
||||||
|
COPY requirements.txt /requirements.txt
|
||||||
|
COPY dev-requirements.txt /dev-requirements.txt
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip && \
|
||||||
|
pip install -r /requirements.txt -r /dev-requirements.txt
|
||||||
|
|
||||||
|
WORKDIR /src
|
|
@ -0,0 +1,54 @@
|
||||||
|
.PHONY: clean build_img
|
||||||
|
|
||||||
|
PYTHON_VERSION = 3.8
|
||||||
|
PYTHON_SUBVERSION = 3.8.12
|
||||||
|
|
||||||
|
PYENV_VERSIONS = $(HOME)/.pyenv/versions
|
||||||
|
PYTHON_BIN = $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION)
|
||||||
|
|
||||||
|
VENV = venv
|
||||||
|
PYTHON_VENV = $(VENV)/bin/python
|
||||||
|
PIP = $(PYTHON_VENV) -m pip
|
||||||
|
PIP_COMPILE = venv/bin/pip-compile
|
||||||
|
|
||||||
|
IMG_NAME=etl_beam_env
|
||||||
|
|
||||||
|
all: oci_img .installed_deps
|
||||||
|
|
||||||
|
build_img: .img_name
|
||||||
|
|
||||||
|
.img_name: Dockerfile requirements.txt dev-requirements.txt
|
||||||
|
docker build -t $(IMG_NAME) .
|
||||||
|
echo $(IMG_NAME) > $@
|
||||||
|
|
||||||
|
.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV)
|
||||||
|
$(PIP) install \
|
||||||
|
-r requirements.txt \
|
||||||
|
-r dev-requirements.txt
|
||||||
|
touch $@
|
||||||
|
|
||||||
|
requirements.txt: pyproject.toml $(PIP_COMPILE)
|
||||||
|
$(PIP_COMPILE) \
|
||||||
|
--resolver=backtracking \
|
||||||
|
--output-file $@ \
|
||||||
|
$<
|
||||||
|
|
||||||
|
dev-requirements.txt: pyproject.toml $(PIP_COMPILE)
|
||||||
|
$(PIP_COMPILE) \
|
||||||
|
--extra=dev \
|
||||||
|
--resolver=backtracking \
|
||||||
|
--output-file $@ \
|
||||||
|
$<
|
||||||
|
|
||||||
|
$(PIP_COMPILE): $(PYTHON_VENV)
|
||||||
|
$(PIP) install pip-tools
|
||||||
|
|
||||||
|
$(PYTHON_VENV): $(PYTHON_BIN)
|
||||||
|
virtualenv --python=$^ $(VENV)
|
||||||
|
$(PIP) install --upgrade pip
|
||||||
|
|
||||||
|
$(PYTHON_BIN):
|
||||||
|
pyenv install $(PYTHON_SUBVERSION)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf *.egg-info venv installed_deps
|
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with Python 3.10
|
# This file is autogenerated by pip-compile with Python 3.8
|
||||||
# by the following command:
|
# by the following command:
|
||||||
#
|
#
|
||||||
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
|
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
|
||||||
|
@ -208,7 +208,7 @@ protobuf==4.23.3
|
||||||
# grpc-google-iam-v1
|
# grpc-google-iam-v1
|
||||||
# grpcio-status
|
# grpcio-status
|
||||||
# proto-plus
|
# proto-plus
|
||||||
psycopg2==2.9.6
|
psycopg2-binary==2.9.6
|
||||||
# via beam-etl (pyproject.toml)
|
# via beam-etl (pyproject.toml)
|
||||||
pyarrow==11.0.0
|
pyarrow==11.0.0
|
||||||
# via apache-beam
|
# via apache-beam
|
||||||
|
@ -275,6 +275,8 @@ typing-extensions==4.6.3
|
||||||
# via
|
# via
|
||||||
# apache-beam
|
# apache-beam
|
||||||
# astroid
|
# astroid
|
||||||
|
# black
|
||||||
|
# pylint
|
||||||
tzdata==2023.3
|
tzdata==2023.3
|
||||||
# via pandas
|
# via pandas
|
||||||
urllib3==1.26.16
|
urllib3==1.26.16
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
build_img:
|
||||||
|
make build_img
|
||||||
|
|
||||||
|
test: build_img
|
||||||
|
docker run \
|
||||||
|
-v $(pwd)/src:/src \
|
||||||
|
--rm \
|
||||||
|
-it $(cat .img_name) \
|
||||||
|
python -m pytest
|
||||||
|
|
||||||
|
lint: build_img
|
||||||
|
docker run \
|
||||||
|
-v $(pwd)/src:/src \
|
||||||
|
--rm \
|
||||||
|
-it $(cat .img_name) \
|
||||||
|
pylint --init-hook "import sys; sys.path.append('/src')" /src
|
|
@ -1,7 +1,7 @@
|
||||||
[project]
|
[project]
|
||||||
name = "beam_etl"
|
name = "beam_etl"
|
||||||
version = "0.1"
|
version = "0.1"
|
||||||
dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2"]
|
dependencies = ["wheel", "apache-beam[gcp]", "pandas", "psycopg2-binary"]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
dev = ["pytest", "pylint", "black"]
|
dev = ["pytest", "pylint", "black"]
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with Python 3.10
|
# This file is autogenerated by pip-compile with Python 3.8
|
||||||
# by the following command:
|
# by the following command:
|
||||||
#
|
#
|
||||||
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
|
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
|
||||||
|
@ -177,7 +177,7 @@ protobuf==4.23.3
|
||||||
# grpc-google-iam-v1
|
# grpc-google-iam-v1
|
||||||
# grpcio-status
|
# grpcio-status
|
||||||
# proto-plus
|
# proto-plus
|
||||||
psycopg2==2.9.6
|
psycopg2-binary==2.9.6
|
||||||
# via beam-etl (pyproject.toml)
|
# via beam-etl (pyproject.toml)
|
||||||
pyarrow==11.0.0
|
pyarrow==11.0.0
|
||||||
# via apache-beam
|
# via apache-beam
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import csv
|
import csv
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
import apache_beam as beam
|
import apache_beam as beam
|
||||||
import psycopg2
|
import psycopg2
|
||||||
|
@ -17,9 +18,10 @@ class ReadFromCsv(beam.DoFn):
|
||||||
dictionary where the row names are the keys and the cells are the values
|
dictionary where the row names are the keys and the cells are the values
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def process(self, in_file):
|
# pylint: disable=abstract-method,arguments-differ
|
||||||
logging.info("reading from input file: %s", in_file)
|
def process(self, element):
|
||||||
with FileSystems.open(in_file) as file:
|
logging.info("reading from input file: %s", element)
|
||||||
|
with FileSystems.open(element) as file:
|
||||||
text_wrapper = io.TextIOWrapper(file)
|
text_wrapper = io.TextIOWrapper(file)
|
||||||
reader = csv.reader(text_wrapper)
|
reader = csv.reader(text_wrapper)
|
||||||
try:
|
try:
|
||||||
|
@ -33,24 +35,25 @@ class ReadFromCsv(beam.DoFn):
|
||||||
class WriteToPostgreSQL(beam.DoFn):
|
class WriteToPostgreSQL(beam.DoFn):
|
||||||
"""DoFn to write elements to a PostgreSQL database"""
|
"""DoFn to write elements to a PostgreSQL database"""
|
||||||
|
|
||||||
def __init__(
|
# pylint: disable=abstract-method
|
||||||
self, hostname, port, username, password, database, table, table_key=None
|
def __init__(self, connection_details: Dict[str, str], table, table_key=None):
|
||||||
):
|
# pylint: disable=super-init-not-called
|
||||||
self.connection_details = {
|
self.connection_details = connection_details
|
||||||
"host": hostname,
|
|
||||||
"port": port,
|
|
||||||
"user": username,
|
|
||||||
"password": password,
|
|
||||||
"database": database,
|
|
||||||
}
|
|
||||||
self.table = table
|
self.table = table
|
||||||
self.table_key = table_key
|
self.table_key = table_key
|
||||||
|
self.connection = None
|
||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
self.connection = psycopg2.connect(**self.connection_details)
|
self.connection = psycopg2.connect(**self.connection_details)
|
||||||
self.connection.autocommit = True
|
self.connection.autocommit = True
|
||||||
|
|
||||||
def execute_insert(self, row, cursor):
|
def execute_insert(self, row: Dict, cursor):
|
||||||
|
"""Given a dictionary reporesenting a row and a postgresql cursor,
|
||||||
|
insert the row into the database so that the dict keys are the colum
|
||||||
|
names and the dict values the cell values.
|
||||||
|
If a table_key is specified (`self.table_key` is set) handle conflicts
|
||||||
|
by doing nothing"""
|
||||||
|
|
||||||
colnames = ",".join(row.keys())
|
colnames = ",".join(row.keys())
|
||||||
values = ",".join(["%s"] * len(row))
|
values = ",".join(["%s"] * len(row))
|
||||||
sql = f"""
|
sql = f"""
|
||||||
|
@ -61,19 +64,28 @@ class WriteToPostgreSQL(beam.DoFn):
|
||||||
sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
|
sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
|
||||||
cursor.execute(sql, list(row.values()))
|
cursor.execute(sql, list(row.values()))
|
||||||
|
|
||||||
|
# pylint: disable=arguments-differ
|
||||||
def process(self, element):
|
def process(self, element):
|
||||||
|
if self.connection is not None:
|
||||||
cursor = self.connection.cursor()
|
cursor = self.connection.cursor()
|
||||||
self.execute_insert(element, cursor)
|
self.execute_insert(element, cursor)
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def teardown(self):
|
def teardown(self):
|
||||||
|
if self.connection is not None:
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
|
|
||||||
|
|
||||||
class UpsertProductsToPg(WriteToPostgreSQL):
|
class UpsertProductsToPg(WriteToPostgreSQL):
|
||||||
"""DoFn to write products to PostgreSQL with our upsert logic"""
|
"""DoFn to write products to PostgreSQL with our upsert logic"""
|
||||||
|
|
||||||
|
# pylint: disable=abstract-method
|
||||||
def execute_insert(self, row, cursor):
|
def execute_insert(self, row, cursor):
|
||||||
|
"""Our upsert logic is the following:
|
||||||
|
When any of primary_category, packaging, origin, height, depth or with
|
||||||
|
has changed, update the element and set the ingestion_time value to the
|
||||||
|
current timestamp
|
||||||
|
"""
|
||||||
colnames = ",".join(row.keys())
|
colnames = ",".join(row.keys())
|
||||||
values = ",".join(["%s"] * len(row))
|
values = ",".join(["%s"] * len(row))
|
||||||
sql = f"""
|
sql = f"""
|
||||||
|
@ -87,12 +99,18 @@ class UpsertProductsToPg(WriteToPostgreSQL):
|
||||||
packaging = EXCLUDED.packaging,
|
packaging = EXCLUDED.packaging,
|
||||||
origin = EXCLUDED.origin,
|
origin = EXCLUDED.origin,
|
||||||
weight = EXCLUDED.weight,
|
weight = EXCLUDED.weight,
|
||||||
|
height = EXCLUDED.height,
|
||||||
|
depth = EXCLUDED.depth,
|
||||||
|
width = EXCLUDED.width,
|
||||||
ingestion_time = NOW()::TIMESTAMP
|
ingestion_time = NOW()::TIMESTAMP
|
||||||
WHERE
|
WHERE
|
||||||
{ self.table }.primary_category != EXCLUDED.primary_category OR
|
{ self.table }.primary_category != EXCLUDED.primary_category OR
|
||||||
{ self.table }.materials != EXCLUDED.materials OR
|
{ self.table }.materials != EXCLUDED.materials OR
|
||||||
{ self.table }.packaging != EXCLUDED.packaging OR
|
{ self.table }.packaging != EXCLUDED.packaging OR
|
||||||
{ self.table }.origin != EXCLUDED.origin OR
|
{ self.table }.origin != EXCLUDED.origin OR
|
||||||
{ self.table }.weight != EXCLUDED.weight
|
{ self.table }.weight != EXCLUDED.weight OR
|
||||||
|
{ self.table }.height != EXCLUDED.height OR
|
||||||
|
{ self.table }.depth != EXCLUDED.depth OR
|
||||||
|
{ self.table }.width != EXCLUDED.width
|
||||||
"""
|
"""
|
||||||
cursor.execute(sql, list(row.values()))
|
cursor.execute(sql, list(row.values()))
|
|
@ -1,4 +1,7 @@
|
||||||
#!/usr/bin/env python
|
"""This Apache Beam pipeline reads rows as elements from a CSV input file,
|
||||||
|
extracts and parses relevant values, and upserts the elements to a PostgreSQL
|
||||||
|
database
|
||||||
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
@ -10,9 +13,6 @@ from helpers.data_io import ReadFromCsv, UpsertProductsToPg
|
||||||
from helpers.parse_row import parse_row
|
from helpers.parse_row import parse_row
|
||||||
|
|
||||||
|
|
||||||
# def __init__(self, hostname, port, username, password, database):
|
|
||||||
|
|
||||||
|
|
||||||
class SustainabilityScoreOptions(PipelineOptions):
|
class SustainabilityScoreOptions(PipelineOptions):
|
||||||
"""Options for this pipeline"""
|
"""Options for this pipeline"""
|
||||||
|
|
||||||
|
@ -33,18 +33,23 @@ def main():
|
||||||
beam_options = PipelineOptions()
|
beam_options = PipelineOptions()
|
||||||
opts = beam_options.view_as(SustainabilityScoreOptions)
|
opts = beam_options.view_as(SustainabilityScoreOptions)
|
||||||
|
|
||||||
|
pg_connection_details = {
|
||||||
|
"host": opts.pg_hostname,
|
||||||
|
"port": opts.pg_port,
|
||||||
|
"user": opts.pg_username,
|
||||||
|
"password": opts.pg_password,
|
||||||
|
"database": opts.pg_database,
|
||||||
|
}
|
||||||
|
|
||||||
with beam.Pipeline(options=beam_options) as pipeline:
|
with beam.Pipeline(options=beam_options) as pipeline:
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
# pylint: disable=expression-not-assigned
|
||||||
pipeline \
|
pipeline \
|
||||||
| beam.Create([opts.input]) \
|
| beam.Create([opts.input]) \
|
||||||
| beam.ParDo(ReadFromCsv()) \
|
| beam.ParDo(ReadFromCsv()) \
|
||||||
| beam.Map(parse_row) \
|
| beam.Map(parse_row) \
|
||||||
| beam.ParDo(UpsertProductsToPg(
|
| beam.ParDo(UpsertProductsToPg(
|
||||||
hostname=opts.pg_hostname,
|
connection_details=pg_connection_details,
|
||||||
port=opts.pg_port,
|
|
||||||
username=opts.pg_username,
|
|
||||||
password=opts.pg_password,
|
|
||||||
database=opts.pg_database,
|
|
||||||
table=opts.pg_table,
|
table=opts.pg_table,
|
||||||
table_key="tcin",
|
table_key="tcin",
|
||||||
))
|
))
|
Loading…
Reference in New Issue