dayrize-usecase/pipeline/beam_etl/main.py

#!/usr/bin/env python
import io
import logging
import csv

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions


class SustainabilityScoreOptions(PipelineOptions):
    """Options for this pipeline."""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            "--input", help="Input CSV file to process", type=str
        )
        parser.add_value_provider_argument(
            "--output", help="Destination table for the pipeline output", type=str
        )
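
# A minimal usage sketch (bucket, project, and table names are hypothetical):
#
#   python main.py \
#       --input=gs://my-bucket/products.csv \
#       --output=my-project:my_dataset.scores \
#       --runner=DataflowRunner
#
# Both flags are ValueProviders, so they may also be deferred until template
# execution time rather than supplied when the pipeline is constructed.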


class ReadFromCsv(beam.DoFn):
    """Custom DoFn that reads a CSV file and yields each row as a
    dictionary, mapping the header fields to the corresponding cell values.
    """

    def process(self, in_file):
        with FileSystems.open(in_file.get()) as file:
            # FileSystems.open returns a binary stream; wrap it so the csv
            # module can consume it as text.
            text_wrapper = io.TextIOWrapper(file)
            reader = csv.reader(text_wrapper)
            header = next(reader)
            for row in reader:
                yield dict(zip(header, row))
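
# Illustrative example: a CSV whose header line is "id,name" and whose first
# data row is "1,Widget" would be emitted as {"id": "1", "name": "Widget"}.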


def main():
    beam_options = PipelineOptions()
    opts = beam_options.view_as(SustainabilityScoreOptions)
    with beam.Pipeline(options=beam_options) as pipeline:
        # fmt: off
        pipeline \
            | beam.Create([opts.input]) \
            | beam.ParDo(ReadFromCsv())
        # fmt: on
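        # NOTE: opts.output is declared but not consumed yet; a sink (for
        # example beam.io.WriteToBigQuery, assuming the destination is a
        # BigQuery table) would be appended to this chain to persist results.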


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    main()