dayrize-usecase/pipeline/beam_etl/main.py

#!/usr/bin/env python
import io
import logging
import csv

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions


class SustainabilityScoreOptions(PipelineOptions):
    """Options for this pipeline."""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            "--input", help="Input CSV file to process", type=str
        )
        parser.add_value_provider_argument(
            "--output", help="Destination table for the pipeline output", type=str
        )
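
# A minimal usage sketch (bucket, project, and table names are hypothetical):
#
#   python main.py \
#       --input=gs://my-bucket/products.csv \
#       --output=my-project:my_dataset.scores \
#       --runner=DataflowRunner
#
# Both flags are ValueProviders, so they may also be deferred until template
# execution time rather than supplied when the pipeline is constructed.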


class ReadFromCsv(beam.DoFn):
    """Custom DoFn that reads a CSV file and yields each row as a
    dictionary, mapping the header fields to the corresponding cell values.
    """

    def process(self, in_file):
        with FileSystems.open(in_file.get()) as file:
            # FileSystems.open returns a binary stream; wrap it so the csv
            # module can consume it as text.
            text_wrapper = io.TextIOWrapper(file)
            reader = csv.reader(text_wrapper)
            header = next(reader)
            for row in reader:
                yield dict(zip(header, row))
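
# Illustrative example: a CSV whose header line is "id,name" and whose first
# data row is "1,Widget" would be emitted as {"id": "1", "name": "Widget"}.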


def main():
    beam_options = PipelineOptions()
    opts = beam_options.view_as(SustainabilityScoreOptions)
    with beam.Pipeline(options=beam_options) as pipeline:
        # fmt: off
        pipeline \
            | beam.Create([opts.input]) \
            | beam.ParDo(ReadFromCsv())
        # fmt: on
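        # NOTE: opts.output is declared but not consumed yet; a sink (for
        # example beam.io.WriteToBigQuery, assuming the destination is a
        # BigQuery table) would be appended to this chain to persist results.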


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    main()