diff --git a/.gitignore b/.gitignore index 50a29b0..2dfffff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,9 @@ .ipynb_checkpoints pipeline/state + +.envrc +venv +.installed_deps + +*.egg-info +__pycache__ diff --git a/pipeline/beam_etl/Makefile b/pipeline/beam_etl/Makefile new file mode 100644 index 0000000..d5370a6 --- /dev/null +++ b/pipeline/beam_etl/Makefile @@ -0,0 +1,50 @@ +# I use this Makefile to automatize setting up the working space for me +# As long as `virtualenv` and `pyenv` are installed, running `make` should set +# up the virtual envrionment with everything needed. + +.PHONY: clean + +PYTHON_VERSION = 3.8 +PYTHON_SUBVERSION = 3.8.12 + +PYENV_VERSIONS = $(HOME)/.pyenv/versions +PYTHON_BIN = $(PYENV_VERSIONS)/$(PYTHON_SUBVERSION)/bin/python$(PYTHON_VERSION) + +VENV = venv +PYTHON_VENV = $(VENV)/bin/python +PIP = $(PYTHON_VENV) -m pip +PIP_COMPILE = venv/bin/pip-compile + +all: .installed_deps + +.installed_deps: requirements.txt dev-requirements.txt $(PYTHON_VENV) + $(PIP) install \ + -r requirements.txt \ + -r dev-requirements.txt + touch $@ + +requirements.txt: pyproject.toml $(PIP_COMPILE) + $(PIP_COMPILE) \ + --resolver=backtracking \ + --output-file $@ \ + $< + +dev-requirements.txt: pyproject.toml $(PIP_COMPILE) + $(PIP_COMPILE) \ + --extra=dev \ + --resolver=backtracking \ + --output-file $@ \ + $< + +$(PIP_COMPILE): $(PYTHON_VENV) + $(PIP) install pip-tools + +$(PYTHON_VENV): $(PYTHON_BIN) + virtualenv --python=$^ $(VENV) + $(PIP) install --upgrade pip + +$(PYTHON_BIN): + pyenv install $(PYTHON_VERSION) + +clean: + rm -rf *.egg-info venv installed_deps diff --git a/pipeline/beam_etl/dev-requirements.txt b/pipeline/beam_etl/dev-requirements.txt new file mode 100644 index 0000000..dc6f2bc --- /dev/null +++ b/pipeline/beam_etl/dev-requirements.txt @@ -0,0 +1,289 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile --extra=dev --output-file=dev-requirements.txt --resolver=backtracking pyproject.toml +# +apache-beam[gcp]==2.48.0 + # via beam-etl (pyproject.toml) +astroid==2.15.5 + # via pylint +black==23.3.0 + # via beam-etl (pyproject.toml) +cachetools==5.3.1 + # via + # apache-beam + # google-auth +certifi==2023.5.7 + # via requests +charset-normalizer==3.1.0 + # via requests +click==8.1.3 + # via black +cloudpickle==2.2.1 + # via apache-beam +crcmod==1.7 + # via apache-beam +dill==0.3.1.1 + # via + # apache-beam + # pylint +dnspython==2.3.0 + # via pymongo +docopt==0.6.2 + # via hdfs +exceptiongroup==1.1.1 + # via pytest +fastavro==1.7.4 + # via apache-beam +fasteners==0.18 + # via + # apache-beam + # google-apitools +google-api-core[grpc]==2.11.1 + # via + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-bigtable + # google-cloud-core + # google-cloud-datastore + # google-cloud-dlp + # google-cloud-language + # google-cloud-pubsub + # google-cloud-pubsublite + # google-cloud-recommendations-ai + # google-cloud-spanner + # google-cloud-videointelligence + # google-cloud-vision +google-apitools==0.5.31 + # via apache-beam +google-auth==2.20.0 + # via + # apache-beam + # google-api-core + # google-auth-httplib2 + # google-cloud-core +google-auth-httplib2==0.1.0 + # via apache-beam +google-cloud-bigquery==3.11.1 + # via apache-beam +google-cloud-bigquery-storage==2.20.0 + # via apache-beam +google-cloud-bigtable==2.17.0 + # via apache-beam +google-cloud-core==2.3.2 + # via + # apache-beam + # google-cloud-bigquery + # google-cloud-bigtable + # google-cloud-datastore + # google-cloud-spanner +google-cloud-datastore==2.15.2 + # via apache-beam +google-cloud-dlp==3.12.1 + # via apache-beam +google-cloud-language==2.10.0 + # via apache-beam +google-cloud-pubsub==2.17.1 + # via + # apache-beam + # google-cloud-pubsublite +google-cloud-pubsublite==1.8.2 + # via apache-beam +google-cloud-recommendations-ai==0.10.3 + # via apache-beam +google-cloud-spanner==3.36.0 + # via apache-beam +google-cloud-videointelligence==2.11.2 + # via apache-beam +google-cloud-vision==3.4.3 + # via apache-beam +google-crc32c==1.5.0 + # via google-resumable-media +google-resumable-media==2.5.0 + # via google-cloud-bigquery +googleapis-common-protos[grpc]==1.59.1 + # via + # google-api-core + # grpc-google-iam-v1 + # grpcio-status +grpc-google-iam-v1==0.12.6 + # via + # google-cloud-bigtable + # google-cloud-pubsub + # google-cloud-spanner +grpcio==1.54.2 + # via + # apache-beam + # google-api-core + # google-cloud-bigquery + # google-cloud-pubsub + # google-cloud-pubsublite + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status +grpcio-status==1.54.2 + # via + # google-api-core + # google-cloud-pubsub + # google-cloud-pubsublite +hdfs==2.7.0 + # via apache-beam +httplib2==0.22.0 + # via + # apache-beam + # google-apitools + # google-auth-httplib2 + # oauth2client +idna==3.4 + # via requests +iniconfig==2.0.0 + # via pytest +isort==5.12.0 + # via pylint +lazy-object-proxy==1.9.0 + # via astroid +mccabe==0.7.0 + # via pylint +mypy-extensions==1.0.0 + # via black +numpy==1.24.3 + # via + # apache-beam + # pandas + # pyarrow +oauth2client==4.1.3 + # via google-apitools +objsize==0.6.1 + # via apache-beam +orjson==3.9.1 + # via apache-beam +overrides==6.5.0 + # via google-cloud-pubsublite +packaging==23.1 + # via + # black + # google-cloud-bigquery + # pytest +pandas==2.0.2 + # via beam-etl (pyproject.toml) +pathspec==0.11.1 + # via black +platformdirs==3.7.0 + # via + # black + # pylint +pluggy==1.2.0 + # via pytest +proto-plus==1.22.2 + # via + # apache-beam + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-bigtable + # google-cloud-datastore + # google-cloud-dlp + # google-cloud-language + # google-cloud-pubsub + # google-cloud-recommendations-ai + # google-cloud-spanner + # google-cloud-videointelligence + # google-cloud-vision +protobuf==4.23.3 + # via + # apache-beam + # google-api-core + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-bigtable + # google-cloud-datastore + # google-cloud-dlp + # google-cloud-language + # google-cloud-pubsub + # google-cloud-recommendations-ai + # google-cloud-spanner + # google-cloud-videointelligence + # google-cloud-vision + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status + # proto-plus +pyarrow==11.0.0 + # via apache-beam +pyasn1==0.5.0 + # via + # oauth2client + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 + # via + # google-auth + # oauth2client +pydot==1.4.2 + # via apache-beam +pylint==2.17.4 + # via beam-etl (pyproject.toml) +pymongo==4.3.3 + # via apache-beam +pyparsing==3.1.0 + # via + # httplib2 + # pydot +pytest==7.3.2 + # via beam-etl (pyproject.toml) +python-dateutil==2.8.2 + # via + # apache-beam + # google-cloud-bigquery + # pandas +pytz==2023.3 + # via + # apache-beam + # pandas +regex==2023.6.3 + # via apache-beam +requests==2.31.0 + # via + # apache-beam + # google-api-core + # google-cloud-bigquery + # hdfs +rsa==4.9 + # via + # google-auth + # oauth2client +six==1.16.0 + # via + # google-apitools + # google-auth + # google-auth-httplib2 + # hdfs + # oauth2client + # python-dateutil +sqlparse==0.4.4 + # via google-cloud-spanner +tomli==2.0.1 + # via + # black + # pylint + # pytest +tomlkit==0.11.8 + # via pylint +typing-extensions==4.6.3 + # via + # apache-beam + # astroid + # black + # pylint +tzdata==2023.3 + # via pandas +urllib3==1.26.16 + # via + # google-auth + # requests +wheel==0.40.0 + # via beam-etl (pyproject.toml) +wrapt==1.15.0 + # via astroid +zstandard==0.21.0 + # via apache-beam diff --git a/pipeline/beam_etl/justfile b/pipeline/beam_etl/justfile new file mode 100644 index 0000000..cfb0c3a --- /dev/null +++ b/pipeline/beam_etl/justfile @@ -0,0 +1,4 @@ +input := "../../data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv" + +run: + python -m main --input "{{ input }}" diff --git a/pipeline/beam_etl/main.py b/pipeline/beam_etl/main.py new file mode 100644 index 0000000..86638f3 --- /dev/null +++ b/pipeline/beam_etl/main.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import io +import logging +import csv + +import apache_beam as beam + +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import PipelineOptions + + +class SustainabilityScoreOptions(PipelineOptions): + """Options for this pipeline""" + + @classmethod + def _add_argparse_args(cls, parser): + parser.add_value_provider_argument( + "--input", help="Input CSV file to process", type=str + ) + parser.add_value_provider_argument( + "--output", help="Destination destination table", type=str + ) + + +class ReadFromCsv(beam.DoFn): + """This custom DoFn will read from a CSV file and yield each row as a + dictionary where the row names are the keys and the cells are the values + """ + + def process(self, in_file): + with FileSystems.open(in_file.get()) as file: + text_wrapper = io.TextIOWrapper(file) + reader = csv.reader(text_wrapper) + header = next(reader) + for row in reader: + yield dict(zip(header, row)) + + +def main(): + beam_options = PipelineOptions() + opts = beam_options.view_as(SustainabilityScoreOptions) + + with beam.Pipeline(options=beam_options) as pipeline: + # fmt: off + pipeline \ + | beam.Create([opts.input]) \ + | beam.ParDo(ReadFromCsv()) + # fmt: on + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + main() diff --git a/pipeline/beam_etl/pyproject.toml b/pipeline/beam_etl/pyproject.toml new file mode 100644 index 0000000..f27730d --- /dev/null +++ b/pipeline/beam_etl/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "beam_etl" +version = "0.1" +dependencies = ["wheel", "apache-beam[gcp]", "pandas"] + +[project.optional-dependencies] +dev = ["pytest", "pylint", "black"] diff --git a/pipeline/beam_etl/requirements.txt b/pipeline/beam_etl/requirements.txt new file mode 100644 index 0000000..c8e9371 --- /dev/null +++ b/pipeline/beam_etl/requirements.txt @@ -0,0 +1,241 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml +# +apache-beam[gcp]==2.48.0 + # via beam-etl (pyproject.toml) +cachetools==5.3.1 + # via + # apache-beam + # google-auth +certifi==2023.5.7 + # via requests +charset-normalizer==3.1.0 + # via requests +cloudpickle==2.2.1 + # via apache-beam +crcmod==1.7 + # via apache-beam +dill==0.3.1.1 + # via apache-beam +dnspython==2.3.0 + # via pymongo +docopt==0.6.2 + # via hdfs +fastavro==1.7.4 + # via apache-beam +fasteners==0.18 + # via + # apache-beam + # google-apitools +google-api-core[grpc]==2.11.1 + # via + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-bigtable + # google-cloud-core + # google-cloud-datastore + # google-cloud-dlp + # google-cloud-language + # google-cloud-pubsub + # google-cloud-pubsublite + # google-cloud-recommendations-ai + # google-cloud-spanner + # google-cloud-videointelligence + # google-cloud-vision +google-apitools==0.5.31 + # via apache-beam +google-auth==2.20.0 + # via + # apache-beam + # google-api-core + # google-auth-httplib2 + # google-cloud-core +google-auth-httplib2==0.1.0 + # via apache-beam +google-cloud-bigquery==3.11.1 + # via apache-beam +google-cloud-bigquery-storage==2.20.0 + # via apache-beam +google-cloud-bigtable==2.17.0 + # via apache-beam +google-cloud-core==2.3.2 + # via + # apache-beam + # google-cloud-bigquery + # google-cloud-bigtable + # google-cloud-datastore + # google-cloud-spanner +google-cloud-datastore==2.15.2 + # via apache-beam +google-cloud-dlp==3.12.1 + # via apache-beam +google-cloud-language==2.10.0 + # via apache-beam +google-cloud-pubsub==2.17.1 + # via + # apache-beam + # google-cloud-pubsublite +google-cloud-pubsublite==1.8.2 + # via apache-beam +google-cloud-recommendations-ai==0.10.3 + # via apache-beam +google-cloud-spanner==3.36.0 + # via apache-beam +google-cloud-videointelligence==2.11.2 + # via apache-beam +google-cloud-vision==3.4.3 + # via apache-beam +google-crc32c==1.5.0 + # via google-resumable-media +google-resumable-media==2.5.0 + # via google-cloud-bigquery +googleapis-common-protos[grpc]==1.59.1 + # via + # google-api-core + # grpc-google-iam-v1 + # grpcio-status +grpc-google-iam-v1==0.12.6 + # via + # google-cloud-bigtable + # google-cloud-pubsub + # google-cloud-spanner +grpcio==1.54.2 + # via + # apache-beam + # google-api-core + # google-cloud-bigquery + # google-cloud-pubsub + # google-cloud-pubsublite + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status +grpcio-status==1.54.2 + # via + # google-api-core + # google-cloud-pubsub + # google-cloud-pubsublite +hdfs==2.7.0 + # via apache-beam +httplib2==0.22.0 + # via + # apache-beam + # google-apitools + # google-auth-httplib2 + # oauth2client +idna==3.4 + # via requests +numpy==1.24.3 + # via + # apache-beam + # pandas + # pyarrow +oauth2client==4.1.3 + # via google-apitools +objsize==0.6.1 + # via apache-beam +orjson==3.9.1 + # via apache-beam +overrides==6.5.0 + # via google-cloud-pubsublite +packaging==23.1 + # via google-cloud-bigquery +pandas==2.0.2 + # via beam-etl (pyproject.toml) +proto-plus==1.22.2 + # via + # apache-beam + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-bigtable + # google-cloud-datastore + # google-cloud-dlp + # google-cloud-language + # google-cloud-pubsub + # google-cloud-recommendations-ai + # google-cloud-spanner + # google-cloud-videointelligence + # google-cloud-vision +protobuf==4.23.3 + # via + # apache-beam + # google-api-core + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-bigtable + # google-cloud-datastore + # google-cloud-dlp + # google-cloud-language + # google-cloud-pubsub + # google-cloud-recommendations-ai + # google-cloud-spanner + # google-cloud-videointelligence + # google-cloud-vision + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status + # proto-plus +pyarrow==11.0.0 + # via apache-beam +pyasn1==0.5.0 + # via + # oauth2client + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 + # via + # google-auth + # oauth2client +pydot==1.4.2 + # via apache-beam +pymongo==4.3.3 + # via apache-beam +pyparsing==3.1.0 + # via + # httplib2 + # pydot +python-dateutil==2.8.2 + # via + # apache-beam + # google-cloud-bigquery + # pandas +pytz==2023.3 + # via + # apache-beam + # pandas +regex==2023.6.3 + # via apache-beam +requests==2.31.0 + # via + # apache-beam + # google-api-core + # google-cloud-bigquery + # hdfs +rsa==4.9 + # via + # google-auth + # oauth2client +six==1.16.0 + # via + # google-apitools + # google-auth + # google-auth-httplib2 + # hdfs + # oauth2client + # python-dateutil +sqlparse==0.4.4 + # via google-cloud-spanner +typing-extensions==4.6.3 + # via apache-beam +tzdata==2023.3 + # via pandas +urllib3==1.26.16 + # via + # google-auth + # requests +wheel==0.40.0 + # via beam-etl (pyproject.toml) +zstandard==0.21.0 + # via apache-beam