# dayrize-usecase/etl/src/helpers/read_from_csv.py
#
# 29 lines
# 882 B
# Python

"""Module containing ReadFromCsv DoFn to create a PTransform to read from a CSV
input file"""
import io
import logging
import csv
import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
class ReadFromCsv(beam.DoFn):
    """DoFn that reads a CSV file and yields each data row as a dictionary.

    The first row of the file is treated as the header. Every following row
    is emitted as a dict whose keys are the header's column names and whose
    values are the corresponding cells (always strings, as produced by
    ``csv.reader``). When a row and the header differ in length, the surplus
    cells or column names are dropped because ``zip`` stops at the shorter
    of the two. An empty file yields nothing.

    Args:
        encoding: Text encoding used to decode the file. Defaults to
            ``"utf-8"`` so that decoding does not depend on the locale of
            the worker the DoFn happens to run on. Pass ``"utf-8-sig"`` for
            files that start with a byte-order mark.
    """

    # pylint: disable=abstract-method,arguments-differ
    def __init__(self, encoding="utf-8"):
        super().__init__()
        self._encoding = encoding

    def process(self, element):
        """Read the CSV file at ``element`` and yield one dict per data row.

        Args:
            element: Path of the CSV file; it is passed unchanged to
                ``FileSystems.open``.

        Yields:
            dict: Mapping of header column name to cell value for each row
            after the header.
        """
        logging.info("reading from input file: %s", element)
        with FileSystems.open(element) as file:
            # newline="" is what the csv module requires of its input: it
            # disables newline translation so line breaks embedded in quoted
            # fields reach the parser unchanged.
            text_wrapper = io.TextIOWrapper(
                file, encoding=self._encoding, newline=""
            )
            reader = csv.reader(text_wrapper)
            header = next(reader, None)
            if header is None:
                # Empty file: there is no header and therefore no rows.
                return
            for row in reader:
                yield dict(zip(header, row))