refactor: some refactoring of file placements
parent
55b20bb897
commit
02ad9fab8d
|
@ -2,6 +2,7 @@
|
|||
pipeline/state
|
||||
|
||||
.envrc
|
||||
.direnv
|
||||
venv
|
||||
.installed_deps
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ CSV_FNAME = (
|
|||
)
|
||||
CONFIG = {
|
||||
"input": f"{ HOME }/gcs/data/{ CSV_FNAME }",
|
||||
"beam_etl_path": "/beam_etl/main.py",
|
||||
"beam_etl_path": "/etl/main.py",
|
||||
"output_table": "sustainability_score.products",
|
||||
}
|
||||
|
|
@ -10,8 +10,8 @@ x-airflow-common:
|
|||
- ./dags:/home/airflow/airflow/dags
|
||||
- ./scripts/airflow-init.sh:/usr/local/bin/airflow-init.sh:ro
|
||||
- ./scripts/airflow-entrypoint.sh:/usr/local/bin/airflow-entrypoint.sh:ro
|
||||
- ../data:/home/airflow/gcs/data:ro
|
||||
- ./beam_etl:/beam_etl:ro
|
||||
- ./data:/home/airflow/gcs/data:ro
|
||||
- ./etl:/etl:ro
|
||||
- ./sql:/sql:ro
|
||||
environment:
|
||||
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
|
||||
|
@ -68,3 +68,13 @@ services:
|
|||
- "TF_VAR_pg_port=5432"
|
||||
- "TF_VAR_pg_password=postgres"
|
||||
- "TF_VAR_pg_username=postgres"
|
||||
|
||||
notebook:
|
||||
image: jupyter/scipy-notebook
|
||||
ports:
|
||||
- 8888:8888
|
||||
volumes:
|
||||
- ./notebooks:/home/jovyan/work
|
||||
- ./data:/home/jovyan/data:ro
|
||||
profiles:
|
||||
- notebooks
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml
|
||||
|
@ -208,6 +208,8 @@ protobuf==4.23.3
|
|||
# grpc-google-iam-v1
|
||||
# grpcio-status
|
||||
# proto-plus
|
||||
psycopg2==2.9.6
|
||||
# via beam-etl (pyproject.toml)
|
||||
pyarrow==11.0.0
|
||||
# via apache-beam
|
||||
pyasn1==0.5.0
|
||||
|
@ -273,8 +275,6 @@ typing-extensions==4.6.3
|
|||
# via
|
||||
# apache-beam
|
||||
# astroid
|
||||
# black
|
||||
# pylint
|
||||
tzdata==2023.3
|
||||
# via pandas
|
||||
urllib3==1.26.16
|
|
@ -36,46 +36,33 @@ class WriteToPostgreSQL(beam.DoFn):
|
|||
def __init__(
|
||||
self, hostname, port, username, password, database, table, table_key=None
|
||||
):
|
||||
self.hostname = hostname
|
||||
self.port = port
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.database = database
|
||||
self.connection_details = {
|
||||
"host": hostname,
|
||||
"port": port,
|
||||
"user": username,
|
||||
"password": password,
|
||||
"database": database,
|
||||
}
|
||||
self.table = table
|
||||
self.table_key = table_key
|
||||
|
||||
def setup(self):
|
||||
self.connection = psycopg2.connect(
|
||||
host=self.hostname,
|
||||
port=self.port,
|
||||
user=self.username,
|
||||
password=self.password,
|
||||
database=self.database,
|
||||
)
|
||||
self.connection = psycopg2.connect(**self.connection_details, autocommit=True)
|
||||
|
||||
def process(self, element):
|
||||
cursor = self.connection.cursor()
|
||||
colnames = ",".join(element.keys())
|
||||
values = ",".join(["%s"] * len(element))
|
||||
def execute_insert(self, row, cursor):
|
||||
colnames = ",".join(row.keys())
|
||||
values = ",".join(["%s"] * len(row))
|
||||
sql = f"""
|
||||
INSERT INTO { self.table } ({ colnames })
|
||||
VALUES ({ values })
|
||||
"""
|
||||
if self.table_key is not None:
|
||||
update_statement = ",".join(
|
||||
f"{ col } = EXCLUDED.{ col }"
|
||||
for col in element.keys()
|
||||
if col != self.table_key
|
||||
)
|
||||
sql = (
|
||||
sql
|
||||
+ f"""
|
||||
ON CONFLICT ({ self.table_key }) DO UPDATE SET
|
||||
{ update_statement }
|
||||
"""
|
||||
)
|
||||
cursor.execute(sql, list(element.values()))
|
||||
self.connection.commit()
|
||||
sql = sql + f" ON CONFLICT ({ self.table_key }) DO NOTHING"
|
||||
cursor.execute(sql, list(row.values()))
|
||||
|
||||
def process(self, element):
|
||||
cursor = self.connection.cursor()
|
||||
self.execute_insert(element, cursor)
|
||||
cursor.close()
|
||||
|
||||
def teardown(self):
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
|
|
@ -1,8 +0,0 @@
|
|||
services:
|
||||
notebook:
|
||||
image: jupyter/scipy-notebook
|
||||
ports:
|
||||
- 8888:8888
|
||||
volumes:
|
||||
- ./work:/home/jovyan/work
|
||||
- ../data:/home/jovyan/data:ro
|
|
@ -1,50 +0,0 @@
|
|||
# I use this Makefile to automate setting up the working space for me
|
||||
# As long as `virtualenv` and `pyenv` are installed, running `make` should set
|
||||
# up the virtual environment with everything needed.
|
||||
|
||||
.PHONY: clean
|
||||
|
||||
PYTHON_VERSION = 3.8
|
||||
PYTHON_SUBVERSION = 3.8.12
|
||||
|
||||
PYENV_VERSIONS = $(HOME)/.pyenv/versions
|
||||
PYTHON_BIN = $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION)
|
||||
|
||||
VENV = venv
|
||||
PYTHON_VENV = $(VENV)/bin/python
|
||||
PIP = $(PYTHON_VENV) -m pip
|
||||
PIP_COMPILE = venv/bin/pip-compile
|
||||
|
||||
all: .installed_deps
|
||||
|
||||
.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV)
|
||||
$(PIP) install \
|
||||
-r requirements.txt \
|
||||
-r dev-requirements.txt
|
||||
touch $@
|
||||
|
||||
requirements.txt: pyproject.toml $(PIP_COMPILE)
|
||||
$(PIP_COMPILE) \
|
||||
--resolver=backtracking \
|
||||
--output-file $@ \
|
||||
$<
|
||||
|
||||
dev-requirements.txt: pyproject.toml $(PIP_COMPILE)
|
||||
$(PIP_COMPILE) \
|
||||
--extra=dev \
|
||||
--resolver=backtracking \
|
||||
--output-file $@ \
|
||||
$<
|
||||
|
||||
$(PIP_COMPILE): $(PYTHON_VENV)
|
||||
$(PIP) install pip-tools
|
||||
|
||||
$(PYTHON_VENV): $(PYTHON_BIN)
|
||||
virtualenv --python=$^ $(VENV)
|
||||
$(PIP) install --upgrade pip
|
||||
|
||||
$(PYTHON_BIN):
|
||||
pyenv install $(PYTHON_VERSION)
|
||||
|
||||
clean:
|
||||
rm -rf *.egg-info venv installed_deps
|
|
@ -1,7 +0,0 @@
|
|||
input := "../../data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv"
|
||||
|
||||
run:
|
||||
python -m main --input "{{ input }}"
|
||||
|
||||
test:
|
||||
python -m pytest
|
Loading…
Reference in New Issue