refactor: some refactoring of file placements

main
Ricard Illa 2023-06-24 19:07:02 +02:00
parent 55b20bb897
commit 02ad9fab8d
No known key found for this signature in database
GPG Key ID: F69A672B72E54902
37 changed files with 35 additions and 102 deletions

1
.gitignore vendored
View File

@ -2,6 +2,7 @@
pipeline/state pipeline/state
.envrc .envrc
.direnv
venv venv
.installed_deps .installed_deps

View File

@ -21,7 +21,7 @@ CSV_FNAME = (
) )
CONFIG = { CONFIG = {
"input": f"{ HOME }/gcs/data/{ CSV_FNAME }", "input": f"{ HOME }/gcs/data/{ CSV_FNAME }",
"beam_etl_path": "/beam_etl/main.py", "beam_etl_path": "/etl/main.py",
"output_table": "sustainability_score.products", "output_table": "sustainability_score.products",
} }

View File

@ -10,8 +10,8 @@ x-airflow-common:
- ./dags:/home/airflow/airflow/dags - ./dags:/home/airflow/airflow/dags
- ./scripts/airflow-init.sh:/usr/local/bin/airflow-init.sh:ro - ./scripts/airflow-init.sh:/usr/local/bin/airflow-init.sh:ro
- ./scripts/airflow-entrypoint.sh:/usr/local/bin/airflow-entrypoint.sh:ro - ./scripts/airflow-entrypoint.sh:/usr/local/bin/airflow-entrypoint.sh:ro
- ../data:/home/airflow/gcs/data:ro - ./data:/home/airflow/gcs/data:ro
- ./beam_etl:/beam_etl:ro - ./etl:/etl:ro
- ./sql:/sql:ro - ./sql:/sql:ro
environment: environment:
AIRFLOW__CORE__LOAD_EXAMPLES: 'false' AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
@ -68,3 +68,13 @@ services:
- "TF_VAR_pg_port=5432" - "TF_VAR_pg_port=5432"
- "TF_VAR_pg_password=postgres" - "TF_VAR_pg_password=postgres"
- "TF_VAR_pg_username=postgres" - "TF_VAR_pg_username=postgres"
notebook:
image: jupyter/scipy-notebook
ports:
- 8888:8888
volumes:
- ./notebooks:/home/jovyan/work
- ./data:/home/jovyan/data:ro
profiles:
- notebooks

View File

@ -1,5 +1,5 @@
# #
# This file is autogenerated by pip-compile with Python 3.8 # This file is autogenerated by pip-compile with Python 3.10
# by the following command: # by the following command:
# #
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml # pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
@ -208,6 +208,8 @@ protobuf==4.23.3
# grpc-google-iam-v1 # grpc-google-iam-v1
# grpcio-status # grpcio-status
# proto-plus # proto-plus
psycopg2==2.9.6
# via beam-etl (pyproject.toml)
pyarrow==11.0.0 pyarrow==11.0.0
# via apache-beam # via apache-beam
pyasn1==0.5.0 pyasn1==0.5.0
@ -273,8 +275,6 @@ typing-extensions==4.6.3
# via # via
# apache-beam # apache-beam
# astroid # astroid
# black
# pylint
tzdata==2023.3 tzdata==2023.3
# via pandas # via pandas
urllib3==1.26.16 urllib3==1.26.16

View File

@ -36,46 +36,33 @@ class WriteToPostgreSQL(beam.DoFn):
def __init__( def __init__(
self, hostname, port, username, password, database, table, table_key=None self, hostname, port, username, password, database, table, table_key=None
): ):
self.hostname = hostname self.connection_details = {
self.port = port "host": hostname,
self.username = username "port": port,
self.password = password "user": username,
self.database = database "password": password,
"database": database,
}
self.table = table self.table = table
self.table_key = table_key self.table_key = table_key
def setup(self): def setup(self):
self.connection = psycopg2.connect( self.connection = psycopg2.connect(**self.connection_details, autocommit=True)
host=self.hostname,
port=self.port,
user=self.username,
password=self.password,
database=self.database,
)
def process(self, element): def execute_insert(self, row, cursor):
cursor = self.connection.cursor() colnames = ",".join(row.keys())
colnames = ",".join(element.keys()) values = ",".join(["%s"] * len(row))
values = ",".join(["%s"] * len(element))
sql = f""" sql = f"""
INSERT INTO { self.table } ({ colnames }) INSERT INTO { self.table } ({ colnames })
VALUES ({ values }) VALUES ({ values })
""" """
if self.table_key is not None: if self.table_key is not None:
update_statement = ",".join( sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
f"{ col } = EXCLUDED.{ col }" cursor.execute(sql, list(row.values()))
for col in element.keys()
if col != self.table_key def process(self, element):
) cursor = self.connection.cursor()
sql = ( self.execute_insert(element, cursor)
sql
+ f"""
ON CONFLICT ({ self.table_key }) DO UPDATE SET
{ update_statement }
"""
)
cursor.execute(sql, list(element.values()))
self.connection.commit()
cursor.close() cursor.close()
def teardown(self): def teardown(self):

View File

@ -1,5 +1,5 @@
# #
# This file is autogenerated by pip-compile with Python 3.8 # This file is autogenerated by pip-compile with Python 3.10
# by the following command: # by the following command:
# #
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml # pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml

View File

@ -1,8 +0,0 @@
# Compose file defining a single `notebook` service.
services:
notebook:
# Runs the jupyter/scipy-notebook image.
image: jupyter/scipy-notebook
ports:
# Publishes container port 8888 on host port 8888.
- 8888:8888
volumes:
# Local ./work directory is mounted at /home/jovyan/work.
- ./work:/home/jovyan/work
# ../data is mounted read-only (`:ro`) at /home/jovyan/data.
- ../data:/home/jovyan/data:ro

View File

@ -1,50 +0,0 @@
# I use this Makefile to automate setting up my working space.
# As long as `virtualenv` and `pyenv` are installed, running `make` should set
# up the virtual environment with everything needed.
#
# Dependency chain (every step is a real file, so only stale steps re-run):
#   pyenv interpreter -> venv -> pip-compile -> *requirements.txt -> .installed_deps

# `all` and `clean` are commands, not files; declaring them phony keeps a stray
# file with either name from silently disabling the target.
.PHONY: all clean

# PYTHON_VERSION names the interpreter binary (python3.8); PYTHON_SUBVERSION
# names the pyenv install directory and is the version handed to `pyenv install`.
PYTHON_VERSION := 3.8
PYTHON_SUBVERSION := 3.8.12
PYENV_VERSIONS := $(HOME)/.pyenv/versions
PYTHON_BIN := $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION)

VENV := venv
PYTHON_VENV := $(VENV)/bin/python
PIP := $(PYTHON_VENV) -m pip
PIP_COMPILE := $(VENV)/bin/pip-compile

all: .installed_deps

# Stamp file: records that both requirement sets are installed in the venv.
.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV)
	$(PIP) install \
		-r requirements.txt \
		-r dev-requirements.txt
	touch $@

# Pinned runtime dependencies, compiled from pyproject.toml.
requirements.txt: pyproject.toml $(PIP_COMPILE)
	$(PIP_COMPILE) \
		--resolver=backtracking \
		--output-file $@ \
		$<

# Pinned runtime + `dev` extra dependencies, compiled from pyproject.toml.
dev-requirements.txt: pyproject.toml $(PIP_COMPILE)
	$(PIP_COMPILE) \
		--extra=dev \
		--resolver=backtracking \
		--output-file $@ \
		$<

# pip-compile is provided by the pip-tools package.
$(PIP_COMPILE): $(PYTHON_VENV)
	$(PIP) install pip-tools

# Create the virtual environment from the pyenv-managed interpreter.
$(PYTHON_VENV): $(PYTHON_BIN)
	virtualenv --python=$< $(VENV)
	$(PIP) install --upgrade pip

# Install the exact patch release so the resulting directory matches the path
# in $(PYTHON_BIN); installing the bare minor version would not create it.
$(PYTHON_BIN):
	pyenv install $(PYTHON_SUBVERSION)

# Remove everything this Makefile creates, including the `.installed_deps`
# stamp (note the leading dot).
clean:
	$(RM) -r *.egg-info $(VENV) .installed_deps

View File

@ -1,7 +0,0 @@
# CSV file passed to the `--input` option by the `run` recipe.
input := "../../data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv"
# Run the `main` module against the CSV named by `input`.
run:
python -m main --input "{{ input }}"
# Run the test suite with pytest.
test:
python -m pytest