"""
DAG IDs: sustainability_score

Creates the Postgres tables, runs an Apache Beam ETL pipeline to load the
products CSV, and calculates sustainability scores in SQL.
"""

import os
from datetime import datetime

from airflow import DAG
from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
from airflow.providers.postgres.operators.postgres import PostgresOperator

import utils

HOME = os.environ["HOME"]
CSV_FNAME = (
    "large_target_store_products_dataset_sample - "
    "large_target_store_products_dataset_sample.csv"
)
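
# DAG params; values are rendered into the templated operator fields below
# via {{ params.* }}.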
CONFIG = {
    "input": f"{HOME}/gcs/data/{CSV_FNAME}",
    "beam_etl_path": "/etl/main.py",
    "products_table": "sustainability_score.products",
    "scored_table": "sustainability_score.scored_products",
}
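
# Runs at minute 0 of every hour on weekdays (cron "0 * * * 1-5"); catchup is
# disabled so missed intervals are not backfilled.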
with DAG(
    "sustainability_score",
    schedule_interval="0 * * * 1-5",
    catchup=False,
    max_active_runs=10,
    start_date=datetime(2023, 6, 21),
    doc_md=utils.load_docs(__file__),
    params=CONFIG,
    template_searchpath=["/sql"],
) as dag:
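    # Schema setup; the .sql files are resolved through template_searchpath.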
    create_products_table = PostgresOperator(
        task_id="create_products_table",
        sql="products_schema.sql",
        postgres_conn_id="pg_db",
    )

    create_scores_table = PostgresOperator(
        task_id="create_scored_products_table",
        sql="scored_products_schema.sql",
        postgres_conn_id="pg_db",
    )
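
    # Beam pipeline that parses the input CSV and loads product rows into the
    # products table; Postgres connection details are read from the "pg_db"
    # Airflow connection at render time.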
    etl_pipeline = BeamRunPythonPipelineOperator(
        task_id="beam_etl",
        py_file="{{ params.beam_etl_path }}",
        pipeline_options={
            "input": "{{ params.input }}",
            "pg_hostname": "{{ conn.get('pg_db').host }}",
            "pg_port": "{{ conn.get('pg_db').port }}",
            "pg_username": "{{ conn.get('pg_db').login }}",
            "pg_password": "{{ conn.get('pg_db').password }}",
            "pg_database": "{{ conn.get('pg_db').schema }}",
            "pg_table": "{{ params.products_table }}",
        },
    )

    calculate_score = PostgresOperator(
        task_id="calculate_score",
        sql="calculate_score.sql",
        postgres_conn_id="pg_db",
    )
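
    # The ETL load requires the products table; scoring requires both the
    # loaded products and the scored_products table.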
    create_products_table >> etl_pipeline
    [etl_pipeline, create_scores_table] >> calculate_score