dayrize-usecase/pipeline/beam_etl/main.py

#!/usr/bin/env python
import logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from helpers.data_io import ReadFromCsv, WriteToPostgreSQL
from helpers.parse_row import parse_row
# ReadFromCsv and WriteToPostgreSQL are project-local DoFns; WriteToPostgreSQL
# is constructed with the connection details parsed from the options below.
class SustainabilityScoreOptions(PipelineOptions):
    """Options for this pipeline"""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument("--input", help="Input CSV file to process", type=str)
        parser.add_argument("--pg_hostname", help="Postgres hostname", type=str)
        parser.add_argument("--pg_port", help="Postgres port", type=str)
        parser.add_argument("--pg_username", help="Postgres username", type=str)
        parser.add_argument("--pg_password", help="Postgres password", type=str)
        parser.add_argument("--pg_database", help="Postgres database name", type=str)
        parser.add_argument("--pg_table", help="Postgres table name", type=str)
def main():
    """Construct and run the pipeline"""
    beam_options = PipelineOptions()
    opts = beam_options.view_as(SustainabilityScoreOptions)

    with beam.Pipeline(options=beam_options) as pipeline:
        # Seed with the input path -> read CSV rows -> parse -> write to Postgres.
        # fmt: off
        pipeline \
            | beam.Create([opts.input]) \
            | beam.ParDo(ReadFromCsv()) \
            | beam.Map(parse_row) \
            | beam.ParDo(WriteToPostgreSQL(
                hostname=opts.pg_hostname,
                port=opts.pg_port,
                username=opts.pg_username,
                password=opts.pg_password,
                database=opts.pg_database,
                table=opts.pg_table,
            ))
        # fmt: on

if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    main()
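
# For orientation only: a minimal sketch of the shape WriteToPostgreSQL is
# assumed to have (the real implementation lives in helpers/data_io.py and
# may differ):
#
#   class WriteToPostgreSQL(beam.DoFn):
#       def __init__(self, hostname, port, username, password, database, table):
#           self.config = dict(host=hostname, port=port, user=username,
#                              password=password, dbname=database)
#           self.table = table
#
#       def setup(self):
#           # One connection per DoFn instance, reused across bundles.
#           import psycopg2
#           self.connection = psycopg2.connect(**self.config)
#
#       def process(self, element):
#           # Insert one parsed row (a dict of column -> value).
#           with self.connection.cursor() as cursor:
#               columns = ", ".join(element.keys())
#               values = ", ".join(["%s"] * len(element))
#               cursor.execute(
#                   f"INSERT INTO {self.table} ({columns}) VALUES ({values})",
#                   list(element.values()),
#               )
#           self.connection.commit()
#
#       def teardown(self):
#           self.connection.close()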