dayrize-usecase/dags/sustainability_score/__init__.py

"""
DAG ID: sustainability_score
"""
import os
from datetime import datetime
import utils
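# utils is a repo-local helper module; its load_docs (used below) attaches
# this file's companion markdown documentation to the DAG.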
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
from airflow.providers.postgres.operators.postgres import PostgresOperator
HOME = os.environ["HOME"]
CSV_FNAME = (
    "large_target_store_products_dataset_sample - "
    "large_target_store_products_dataset_sample.csv"
)
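# Defaults exposed as DAG params: templated operator fields reference these as
# {{ params.<key> }}. The input path assumes the worker sees the GCS bucket
# mounted under $HOME/gcs (Cloud Composer-style layout).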
CONFIG = {
    "input": f"{HOME}/gcs/data/{CSV_FNAME}",
    "beam_etl_path": "/etl/main.py",
"products_table": "sustainability_score.products",
"scored_table": "sustainability_score.scored_products",
}
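# Helper wrapping a dbt CLI call in a BashOperator. Host, credentials, and
# database are rendered from the "pg_db" Airflow connection at runtime and
# passed to dbt as environment variables (presumably consumed via env_var()
# in the dbt profile); append_env=True merges them into, rather than
# replacing, the worker's environment.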
def dbt(cmd: str, attach_dag: DAG) -> BashOperator:
"""Setup and return an operator to run dbt commands"""
    return BashOperator(
        dag=attach_dag,
        task_id=f"dbt_{cmd}",
        bash_command=f"dbt {cmd}",
        cwd="/dbt",
        env={
            "POSTGRES_HOST": "{{ conn.get('pg_db').host }}",
            "POSTGRES_USER": "{{ conn.get('pg_db').login }}",
            "POSTGRES_PASSWORD": "{{ conn.get('pg_db').password }}",
            "POSTGRES_PORT": "{{ conn.get('pg_db').port }}",
            "POSTGRES_DATABASE": "{{ conn.get('pg_db').schema }}",
        },
        append_env=True,
    )
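# Cron "0 * * * 1-5": run at the top of every hour, Monday through Friday.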
with DAG(
    "sustainability_score",
    schedule_interval="0 * * * 1-5",
    catchup=False,
    max_active_runs=10,
    start_date=datetime(2023, 6, 21),
    doc_md=utils.load_docs(__file__),
    params=CONFIG,
) as dag:
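    # Create the target schema/table (DDL in sql/products_schema.sql) before
    # the Beam job loads data into it.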
    create_products_table = PostgresOperator(
        task_id="create_products_table",
        sql="sql/products_schema.sql",
        postgres_conn_id="pg_db",
    )
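    # Beam pipeline that loads the input CSV into Postgres. Every option is a
    # Jinja template, so connection details are resolved from the "pg_db"
    # connection when the task renders.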
    etl_pipeline = BeamRunPythonPipelineOperator(
        task_id="beam_etl",
        py_file="{{ params.beam_etl_path }}",
        pipeline_options={
            "input": "{{ params.input }}",
            "pg_hostname": "{{ conn.get('pg_db').host }}",
            "pg_port": "{{ conn.get('pg_db').port }}",
            "pg_username": "{{ conn.get('pg_db').login }}",
            "pg_password": "{{ conn.get('pg_db').password }}",
            "pg_database": "{{ conn.get('pg_db').schema }}",
            "pg_table": "{{ params.products_table }}",
        },
    )
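    # dbt run materializes the models (expected to populate
    # {{ params.scored_table }}), then dbt test runs the project's data tests.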
    dbt_run = dbt("run", dag)
    dbt_test = dbt("test", dag)
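    # Schema DDL -> CSV load via Beam -> dbt build -> dbt tests.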
    create_products_table >> etl_pipeline >> dbt_run >> dbt_test