60 lines
1.9 KiB
Python
60 lines
1.9 KiB
Python
"""This Apache Beam pipeline reads rows as elements from a CSV input file,
|
|
extracts and parses relevant values, and upserts the elements to a PostgreSQL
|
|
database
|
|
"""
|
|
|
|
import logging
|
|
|
|
import apache_beam as beam
|
|
from apache_beam.options.pipeline_options import PipelineOptions
|
|
|
|
from helpers import UpsertProductsToPg, ReadFromCsv, ProcessRows
|
|
|
|
|
|
class SustainabilityScoreOptions(PipelineOptions):
    """Command-line options understood by this pipeline.

    Adds the input CSV flag and the Postgres connection flags to the
    pipeline's argument parser. Every flag is parsed as a string.
    """

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register each pipeline flag on ``parser`` as a string argument."""
        # (flag, help text) pairs, registered in this order.
        flag_help = (
            ("--input", "Input CSV file to process"),
            ("--pg_hostname", "Postgres hostname"),
            ("--pg_port", "Postgres port"),
            ("--pg_username", "Postgres username"),
            ("--pg_password", "Postgres password"),
            ("--pg_database", "Postgres database name"),
            ("--pg_table", "Postgres table name"),
        )
        for flag, description in flag_help:
            parser.add_argument(flag, help=description, type=str)
|
|
|
|
|
|
def main():
    """Construct and run the pipeline.

    Pipeline options are built with ``PipelineOptions`` and viewed as
    ``SustainabilityScoreOptions``. The ``--pg_*`` values are gathered into
    the connection details handed to ``UpsertProductsToPg``, and ``--input``
    seeds the pipeline as its single initial element.
    """
    beam_options = PipelineOptions()
    opts = beam_options.view_as(SustainabilityScoreOptions)

    # Connection settings forwarded unchanged to the upsert step.
    pg_connection_details = dict(
        host=opts.pg_hostname,
        port=opts.pg_port,
        user=opts.pg_username,
        password=opts.pg_password,
        database=opts.pg_database,
    )

    with beam.Pipeline(options=beam_options) as pipeline:
        # Each stage is a ParDo applied to the previous stage's output.
        input_paths = pipeline | beam.Create([opts.input])
        csv_rows = input_paths | beam.ParDo(ReadFromCsv())
        parsed_rows = csv_rows | beam.ParDo(ProcessRows())

        # The final stage's output is intentionally not kept.
        # pylint: disable=expression-not-assigned
        parsed_rows | beam.ParDo(
            UpsertProductsToPg(
                connection_details=pg_connection_details,
                table=opts.pg_table,
                table_key="tcin",
            )
        )
|
|
|
|
|
|
if __name__ == "__main__":
    # Set the root logger to INFO before starting the pipeline.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    main()
|