dayrize-usecase/etl/src/main.py

"""This Apache Beam pipeline reads rows as elements from a CSV input file,
extracts and parses relevant values, and upserts the elements to a PostgreSQL
database
"""
import logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows
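
# For reference, ReadFromCsv, ProcessRows and UpsertProductsToPg are assumed to
# be plain beam.DoFn subclasses defined in helpers.py. The sketch below is
# illustrative only (not the actual implementation) and shows the element shape
# each stage is assumed to pass downstream: input path -> CSV rows as dicts ->
# parsed product dicts keyed by "tcin".
#
#   import csv
#
#   class ReadFromCsv(beam.DoFn):
#       def process(self, path):
#           with open(path, newline="") as csv_file:
#               yield from csv.DictReader(csv_file)
#
#   class ProcessRows(beam.DoFn):
#       def process(self, row):
#           yield {**row}  # extract/convert the fields needed downstream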


class SustainabilityScoreOptions(PipelineOptions):
    """Options for this pipeline"""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument("--input", help="Input CSV file to process", type=str)
        parser.add_argument("--pg_hostname", help="Postgres hostname", type=str)
        parser.add_argument("--pg_port", help="Postgres port", type=str)
        parser.add_argument("--pg_username", help="Postgres username", type=str)
        parser.add_argument("--pg_password", help="Postgres password", type=str)
        parser.add_argument("--pg_database", help="Postgres database name", type=str)
        parser.add_argument("--pg_table", help="Postgres table name", type=str)


def main():
    """Construct and run the pipeline"""
    beam_options = PipelineOptions()
    opts = beam_options.view_as(SustainabilityScoreOptions)
    pg_connection_details = {
        "host": opts.pg_hostname,
        "port": opts.pg_port,
        "user": opts.pg_username,
        "password": opts.pg_password,
        "database": opts.pg_database,
    }

    with beam.Pipeline(options=beam_options) as pipeline:
        # fmt: off
        # pylint: disable=expression-not-assigned
        # Seed the pipeline with a single element (the input file path), read
        # and parse the CSV rows, then upsert each product into Postgres.
        pipeline \
            | beam.Create([opts.input]) \
            | beam.ParDo(ReadFromCsv()) \
            | beam.ParDo(ProcessRows()) \
            | beam.ParDo(UpsertProductsToPg(
                connection_details=pg_connection_details,
                table=opts.pg_table,
                table_key="tcin",
            ))
        # fmt: on
if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)
main()