refactor: some refactoring of file placements
parent
55b20bb897
commit
02ad9fab8d
|
@ -2,6 +2,7 @@
|
||||||
pipeline/state
|
pipeline/state
|
||||||
|
|
||||||
.envrc
|
.envrc
|
||||||
|
.direnv
|
||||||
venv
|
venv
|
||||||
.installed_deps
|
.installed_deps
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ CSV_FNAME = (
|
||||||
)
|
)
|
||||||
CONFIG = {
|
CONFIG = {
|
||||||
"input": f"{ HOME }/gcs/data/{ CSV_FNAME }",
|
"input": f"{ HOME }/gcs/data/{ CSV_FNAME }",
|
||||||
"beam_etl_path": "/beam_etl/main.py",
|
"beam_etl_path": "/etl/main.py",
|
||||||
"output_table": "sustainability_score.products",
|
"output_table": "sustainability_score.products",
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,8 +10,8 @@ x-airflow-common:
|
||||||
- ./dags:/home/airflow/airflow/dags
|
- ./dags:/home/airflow/airflow/dags
|
||||||
- ./scripts/airflow-init.sh:/usr/local/bin/airflow-init.sh:ro
|
- ./scripts/airflow-init.sh:/usr/local/bin/airflow-init.sh:ro
|
||||||
- ./scripts/airflow-entrypoint.sh:/usr/local/bin/airflow-entrypoint.sh:ro
|
- ./scripts/airflow-entrypoint.sh:/usr/local/bin/airflow-entrypoint.sh:ro
|
||||||
- ../data:/home/airflow/gcs/data:ro
|
- ./data:/home/airflow/gcs/data:ro
|
||||||
- ./beam_etl:/beam_etl:ro
|
- ./etl:/etl:ro
|
||||||
- ./sql:/sql:ro
|
- ./sql:/sql:ro
|
||||||
environment:
|
environment:
|
||||||
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
|
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
|
||||||
|
@ -68,3 +68,13 @@ services:
|
||||||
- "TF_VAR_pg_port=5432"
|
- "TF_VAR_pg_port=5432"
|
||||||
- "TF_VAR_pg_password=postgres"
|
- "TF_VAR_pg_password=postgres"
|
||||||
- "TF_VAR_pg_username=postgres"
|
- "TF_VAR_pg_username=postgres"
|
||||||
|
|
||||||
|
notebook:
|
||||||
|
image: jupyter/scipy-notebook
|
||||||
|
ports:
|
||||||
|
- 8888:8888
|
||||||
|
volumes:
|
||||||
|
- ./notebooks:/home/jovyan/work
|
||||||
|
- ./data:/home/jovyan/data:ro
|
||||||
|
profiles:
|
||||||
|
- notebooks
|
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with Python 3.8
|
# This file is autogenerated by pip-compile with Python 3.10
|
||||||
# by the following command:
|
# by the following command:
|
||||||
#
|
#
|
||||||
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
|
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
|
||||||
|
@ -208,6 +208,8 @@ protobuf==4.23.3
|
||||||
# grpc-google-iam-v1
|
# grpc-google-iam-v1
|
||||||
# grpcio-status
|
# grpcio-status
|
||||||
# proto-plus
|
# proto-plus
|
||||||
|
psycopg2==2.9.6
|
||||||
|
# via beam-etl (pyproject.toml)
|
||||||
pyarrow==11.0.0
|
pyarrow==11.0.0
|
||||||
# via apache-beam
|
# via apache-beam
|
||||||
pyasn1==0.5.0
|
pyasn1==0.5.0
|
||||||
|
@ -273,8 +275,6 @@ typing-extensions==4.6.3
|
||||||
# via
|
# via
|
||||||
# apache-beam
|
# apache-beam
|
||||||
# astroid
|
# astroid
|
||||||
# black
|
|
||||||
# pylint
|
|
||||||
tzdata==2023.3
|
tzdata==2023.3
|
||||||
# via pandas
|
# via pandas
|
||||||
urllib3==1.26.16
|
urllib3==1.26.16
|
|
@ -36,46 +36,33 @@ class WriteToPostgreSQL(beam.DoFn):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, hostname, port, username, password, database, table, table_key=None
|
self, hostname, port, username, password, database, table, table_key=None
|
||||||
):
|
):
|
||||||
self.hostname = hostname
|
self.connection_details = {
|
||||||
self.port = port
|
"host": hostname,
|
||||||
self.username = username
|
"port": port,
|
||||||
self.password = password
|
"user": username,
|
||||||
self.database = database
|
"password": password,
|
||||||
|
"database": database,
|
||||||
|
}
|
||||||
self.table = table
|
self.table = table
|
||||||
self.table_key = table_key
|
self.table_key = table_key
|
||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
self.connection = psycopg2.connect(
|
self.connection = psycopg2.connect(**self.connection_details, autocommit=True)
|
||||||
host=self.hostname,
|
|
||||||
port=self.port,
|
|
||||||
user=self.username,
|
|
||||||
password=self.password,
|
|
||||||
database=self.database,
|
|
||||||
)
|
|
||||||
|
|
||||||
def process(self, element):
|
def execute_insert(self, row, cursor):
|
||||||
cursor = self.connection.cursor()
|
colnames = ",".join(row.keys())
|
||||||
colnames = ",".join(element.keys())
|
values = ",".join(["%s"] * len(row))
|
||||||
values = ",".join(["%s"] * len(element))
|
|
||||||
sql = f"""
|
sql = f"""
|
||||||
INSERT INTO { self.table } ({ colnames })
|
INSERT INTO { self.table } ({ colnames })
|
||||||
VALUES ({ values })
|
VALUES ({ values })
|
||||||
"""
|
"""
|
||||||
if self.table_key is not None:
|
if self.table_key is not None:
|
||||||
update_statement = ",".join(
|
sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
|
||||||
f"{ col } = EXCLUDED.{ col }"
|
cursor.execute(sql, list(row.values()))
|
||||||
for col in element.keys()
|
|
||||||
if col != self.table_key
|
def process(self, element):
|
||||||
)
|
cursor = self.connection.cursor()
|
||||||
sql = (
|
self.execute_insert(element, cursor)
|
||||||
sql
|
|
||||||
+ f"""
|
|
||||||
ON CONFLICT ({ self.table_key }) DO UPDATE SET
|
|
||||||
{ update_statement }
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
cursor.execute(sql, list(element.values()))
|
|
||||||
self.connection.commit()
|
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def teardown(self):
|
def teardown(self):
|
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with Python 3.8
|
# This file is autogenerated by pip-compile with Python 3.10
|
||||||
# by the following command:
|
# by the following command:
|
||||||
#
|
#
|
||||||
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
|
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
|
|
@ -1,8 +0,0 @@
|
||||||
services:
|
|
||||||
notebook:
|
|
||||||
image: jupyter/scipy-notebook
|
|
||||||
ports:
|
|
||||||
- 8888:8888
|
|
||||||
volumes:
|
|
||||||
- ./work:/home/jovyan/work
|
|
||||||
- ../data:/home/jovyan/data:ro
|
|
|
@ -1,50 +0,0 @@
|
||||||
# I use this Makefile to automate setting up the working space for me
|
|
||||||
# As long as `virtualenv` and `pyenv` are installed, running `make` should set
|
|
||||||
# up the virtual environment with everything needed.
|
|
||||||
|
|
||||||
.PHONY: clean
|
|
||||||
|
|
||||||
PYTHON_VERSION = 3.8
|
|
||||||
PYTHON_SUBVERSION = 3.8.12
|
|
||||||
|
|
||||||
PYENV_VERSIONS = $(HOME)/.pyenv/versions
|
|
||||||
PYTHON_BIN = $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION)
|
|
||||||
|
|
||||||
VENV = venv
|
|
||||||
PYTHON_VENV = $(VENV)/bin/python
|
|
||||||
PIP = $(PYTHON_VENV) -m pip
|
|
||||||
PIP_COMPILE = venv/bin/pip-compile
|
|
||||||
|
|
||||||
all: .installed_deps
|
|
||||||
|
|
||||||
.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV)
|
|
||||||
$(PIP) install \
|
|
||||||
-r requirements.txt \
|
|
||||||
-r dev-requirements.txt
|
|
||||||
touch $@
|
|
||||||
|
|
||||||
requirements.txt: pyproject.toml $(PIP_COMPILE)
|
|
||||||
$(PIP_COMPILE) \
|
|
||||||
--resolver=backtracking \
|
|
||||||
--output-file $@ \
|
|
||||||
$<
|
|
||||||
|
|
||||||
dev-requirements.txt: pyproject.toml $(PIP_COMPILE)
|
|
||||||
$(PIP_COMPILE) \
|
|
||||||
--extra=dev \
|
|
||||||
--resolver=backtracking \
|
|
||||||
--output-file $@ \
|
|
||||||
$<
|
|
||||||
|
|
||||||
$(PIP_COMPILE): $(PYTHON_VENV)
|
|
||||||
$(PIP) install pip-tools
|
|
||||||
|
|
||||||
$(PYTHON_VENV): $(PYTHON_BIN)
|
|
||||||
virtualenv --python=$^ $(VENV)
|
|
||||||
$(PIP) install --upgrade pip
|
|
||||||
|
|
||||||
$(PYTHON_BIN):
|
|
||||||
pyenv install $(PYTHON_VERSION)
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm -rf *.egg-info venv installed_deps
|
|
|
@ -1,7 +0,0 @@
|
||||||
input := "../../data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv"
|
|
||||||
|
|
||||||
run:
|
|
||||||
python -m main --input "{{ input }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
python -m pytest
|
|
Loading…
Reference in New Issue