diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2f9b59abb..3dfb6f913 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,34 +1,48 @@ repos: - - repo: https://github.com/timothycrosley/isort - rev: 5.9.3 - hooks: - - id: isort - additional_dependencies: [toml] - exclude: examples/* - - repo: https://github.com/python/black - rev: 22.3.0 - hooks: - - id: black - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 - hooks: - - id: flake8 - - repo: https://github.com/pycqa/pylint - rev: pylint-2.7.4 - hooks: - - id: pylint - - repo: https://github.com/econchick/interrogate - rev: 1.5.0 - hooks: - - id: interrogate - exclude: ^(build|docs|merlin/io|tests|setup.py|versioneer.py) - args: [--config=pyproject.toml] - - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 - hooks: - - id: codespell - - repo: https://github.com/PyCQA/bandit - rev: 1.7.0 - hooks: - - id: bandit - args: [--verbose, -ll, -x, tests,examples,bench] + # imports + - repo: https://github.com/MarcoGorelli/absolufy-imports + rev: v0.3.1 + hooks: + - id: absolufy-imports + - repo: https://github.com/timothycrosley/isort + rev: 5.9.3 + hooks: + - id: isort + additional_dependencies: [toml] + exclude: examples/* + # code style + - repo: https://github.com/python/black + rev: 22.3.0 + hooks: + - id: black + - repo: https://github.com/pycqa/pylint + rev: pylint-2.7.4 + hooks: + - id: pylint + - repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + # notebooks + - repo: https://github.com/s-weigand/flake8-nb + rev: v0.3.0 + hooks: + - id: flake8-nb + files: \.ipynb$ + # documentation + - repo: https://github.com/econchick/interrogate + rev: 1.5.0 + hooks: + - id: interrogate + exclude: ^(build|docs|merlin/io|tests|setup.py|versioneer.py) + args: [--config=pyproject.toml] + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + # security + - repo: https://github.com/PyCQA/bandit + rev: 1.7.0 + hooks: + - id: bandit + args: [--verbose, -ll, -x, tests,examples,bench] diff --git a/merlin/core/__init__.py b/merlin/core/__init__.py index 7102ab501..f6b332d52 100644 --- a/merlin/core/__init__.py +++ b/merlin/core/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. # -from ._version import get_versions +from merlin.core._version import get_versions __version__ = get_versions()["version"] del get_versions diff --git a/merlin/dag/__init__.py b/merlin/dag/__init__.py index 86db9c353..49006e417 100644 --- a/merlin/dag/__init__.py +++ b/merlin/dag/__init__.py @@ -15,7 +15,7 @@ # # flake8: noqa -from .base_operator import BaseOperator, Supports -from .graph import Graph -from .node import Node, iter_nodes, postorder_iter_nodes, preorder_iter_nodes -from .selector import ColumnSelector +from merlin.dag.base_operator import BaseOperator, Supports +from merlin.dag.graph import Graph +from merlin.dag.node import Node, iter_nodes, postorder_iter_nodes, preorder_iter_nodes +from merlin.dag.selector import ColumnSelector diff --git a/merlin/dag/ops/__init__.py b/merlin/dag/ops/__init__.py index 58b236bdb..b9e2f60c6 100644 --- a/merlin/dag/ops/__init__.py +++ b/merlin/dag/ops/__init__.py @@ -15,7 +15,7 @@ # # alias submodules here to avoid breaking everything with moving to submodules # flake8: noqa -from .concat_columns import ConcatColumns -from .selection import SelectionOp -from .subset_columns import SubsetColumns -from .subtraction import SubtractionOp +from merlin.dag.ops.concat_columns import ConcatColumns +from merlin.dag.ops.selection import SelectionOp +from merlin.dag.ops.subset_columns import SubsetColumns +from merlin.dag.ops.subtraction import SubtractionOp diff --git a/merlin/io/__init__.py b/merlin/io/__init__.py index d89e26197..0a03a751e 100644 --- a/merlin/io/__init__.py +++ b/merlin/io/__init__.py @@ -15,7 +15,7 @@ # # flake8: noqa -from . import dataframe_iter, dataset, shuffle -from .dataframe_iter import DataFrameIter -from .dataset import Dataset -from .shuffle import Shuffle, shuffle_df +from merlin.io import dataframe_iter, dataset, shuffle +from merlin.io.dataframe_iter import DataFrameIter +from merlin.io.dataset import Dataset +from merlin.io.shuffle import Shuffle, shuffle_df diff --git a/merlin/io/avro.py b/merlin/io/avro.py index d3f591434..97341e07c 100644 --- a/merlin/io/avro.py +++ b/merlin/io/avro.py @@ -20,7 +20,7 @@ from dask.base import tokenize from dask.dataframe.core import new_dd_object -from .dataset_engine import DatasetEngine +from merlin.io.dataset_engine import DatasetEngine class AvroDatasetEngine(DatasetEngine): @@ -31,6 +31,7 @@ class AvroDatasetEngine(DatasetEngine): """ def __init__(self, paths, part_size, storage_options=None, cpu=False, **kwargs): + # pylint: disable=access-member-before-definition super().__init__(paths, part_size, storage_options=storage_options, cpu=cpu) if kwargs != {}: raise ValueError("Unexpected AvroDatasetEngine argument(s).") diff --git a/merlin/io/csv.py b/merlin/io/csv.py index ee4d0b695..5fc819352 100644 --- a/merlin/io/csv.py +++ b/merlin/io/csv.py @@ -27,7 +27,7 @@ from fsspec.core import get_fs_token_paths from fsspec.utils import infer_compression -from .dataset_engine import DatasetEngine +from merlin.io.dataset_engine import DatasetEngine class CSVDatasetEngine(DatasetEngine): @@ -37,6 +37,7 @@ class CSVDatasetEngine(DatasetEngine): """ def __init__(self, paths, part_size, storage_options=None, cpu=False, **kwargs): + # pylint: disable=access-member-before-definition super().__init__(paths, part_size, cpu=cpu, storage_options=storage_options) self._meta = {} self.csv_kwargs = kwargs diff --git a/merlin/io/dask.py b/merlin/io/dask.py index c9203a5e8..86277eca3 100644 --- a/merlin/io/dask.py +++ b/merlin/io/dask.py @@ -24,10 +24,9 @@ from merlin.core.dispatch import annotate from merlin.core.utils import ensure_optimize_dataframe_graph, global_dask_client +from merlin.io.shuffle import Shuffle from merlin.io.worker import clean_worker_cache, get_worker_cache - -from .shuffle import Shuffle -from .writer_factory import _writer_cls_factory, writer_factory +from merlin.io.writer_factory import _writer_cls_factory, writer_factory class DaskSubgraph: diff --git a/merlin/io/dataframe_engine.py b/merlin/io/dataframe_engine.py index 7825770d4..f63e1283e 100644 --- a/merlin/io/dataframe_engine.py +++ b/merlin/io/dataframe_engine.py @@ -21,7 +21,7 @@ from dask.dataframe.core import new_dd_object from dask.highlevelgraph import HighLevelGraph -from .dataset_engine import DatasetEngine +from merlin.io.dataset_engine import DatasetEngine class DataFrameDatasetEngine(DatasetEngine): diff --git a/merlin/io/dataset.py b/merlin/io/dataset.py index 77fdf4d15..d55ed0b90 100644 --- a/merlin/io/dataset.py +++ b/merlin/io/dataset.py @@ -33,16 +33,15 @@ import merlin.core.dispatch as dispatch from merlin.core.dispatch import convert_data, hex_to_int, is_dataframe_object from merlin.core.utils import device_mem_size, global_dask_client, set_client_deprecated +from merlin.io.csv import CSVDatasetEngine +from merlin.io.dask import _ddf_to_dataset, _simple_shuffle +from merlin.io.dataframe_engine import DataFrameDatasetEngine from merlin.io.dataframe_iter import DataFrameIter +from merlin.io.parquet import ParquetDatasetEngine from merlin.io.shuffle import _check_shuffle_arg from merlin.schema import ColumnSchema, Schema from merlin.schema.io.tensorflow_metadata import TensorflowMetadata -from .csv import CSVDatasetEngine -from .dask import _ddf_to_dataset, _simple_shuffle -from .dataframe_engine import DataFrameDatasetEngine -from .parquet import ParquetDatasetEngine - try: import cudf except ImportError: @@ -310,7 +309,7 @@ def __init__( ) elif engine == "avro": try: - from .avro import AvroDatasetEngine + from merlin.io.avro import AvroDatasetEngine except ImportError as e: raise RuntimeError( "Failed to import AvroDatasetEngine. Make sure uavro is installed." diff --git a/merlin/io/hugectr.py b/merlin/io/hugectr.py index 1206de091..a0f81cbb3 100644 --- a/merlin/io/hugectr.py +++ b/merlin/io/hugectr.py @@ -18,7 +18,7 @@ import numpy as np -from .writer import ThreadedWriter +from merlin.io.writer import ThreadedWriter class HugeCTRWriter(ThreadedWriter): diff --git a/merlin/io/parquet.py b/merlin/io/parquet.py index 310d95203..4b07e355d 100644 --- a/merlin/io/parquet.py +++ b/merlin/io/parquet.py @@ -59,11 +59,10 @@ aggregate_row_groups = None from merlin.core.utils import run_on_worker - -from .dataset_engine import DatasetEngine -from .fsspec_utils import _optimized_read_partition_remote, _optimized_read_remote -from .shuffle import Shuffle, shuffle_df -from .writer import ThreadedWriter +from merlin.io.dataset_engine import DatasetEngine +from merlin.io.fsspec_utils import _optimized_read_partition_remote, _optimized_read_remote +from merlin.io.shuffle import Shuffle, shuffle_df +from merlin.io.writer import ThreadedWriter LOG = logging.getLogger("merlin") diff --git a/merlin/io/writer.py b/merlin/io/writer.py index 397229026..9e4b7a452 100644 --- a/merlin/io/writer.py +++ b/merlin/io/writer.py @@ -28,8 +28,7 @@ from fsspec.core import get_fs_token_paths from merlin.core.dispatch import annotate - -from .shuffle import shuffle_df +from merlin.io.shuffle import shuffle_df class Writer: diff --git a/merlin/io/writer_factory.py b/merlin/io/writer_factory.py index 7c144b833..c7ed4489f 100644 --- a/merlin/io/writer_factory.py +++ b/merlin/io/writer_factory.py @@ -15,8 +15,8 @@ # from fsspec.core import get_fs_token_paths -from .hugectr import HugeCTRWriter -from .parquet import CPUParquetWriter, GPUParquetWriter +from merlin.io.hugectr import HugeCTRWriter +from merlin.io.parquet import CPUParquetWriter, GPUParquetWriter def writer_factory( diff --git a/merlin/schema/__init__.py b/merlin/schema/__init__.py index 256b2a26f..3bdee8aca 100644 --- a/merlin/schema/__init__.py +++ b/merlin/schema/__init__.py @@ -15,5 +15,5 @@ # # flake8: noqa -from .schema import ColumnSchema, Schema -from .tags import Tags, TagSet, TagsType +from merlin.schema.schema import ColumnSchema, Schema +from merlin.schema.tags import Tags, TagSet, TagsType diff --git a/merlin/schema/io/tensorflow_metadata.py b/merlin/schema/io/tensorflow_metadata.py index cbe496ac5..2d07aacf6 100644 --- a/merlin/schema/io/tensorflow_metadata.py +++ b/merlin/schema/io/tensorflow_metadata.py @@ -18,13 +18,13 @@ import fsspec import numpy -from ..schema import ColumnSchema -from ..schema import Schema as MerlinSchema -from ..tags import Tags -from . import proto_utils, schema_bp -from .schema_bp import Feature, FeatureType, FixedShape, FloatDomain, IntDomain -from .schema_bp import Schema as ProtoSchema -from .schema_bp import ValueCount +from merlin.schema.io import proto_utils, schema_bp +from merlin.schema.io.schema_bp import Feature, FeatureType, FixedShape, FloatDomain, IntDomain +from merlin.schema.io.schema_bp import Schema as ProtoSchema +from merlin.schema.io.schema_bp import ValueCount +from merlin.schema.schema import ColumnSchema +from merlin.schema.schema import Schema as MerlinSchema +from merlin.schema.tags import Tags DOMAIN_ATTRS = {FeatureType.INT: "int_domain", FeatureType.FLOAT: "float_domain"} FEATURE_TYPES = { diff --git a/merlin/schema/schema.py b/merlin/schema/schema.py index a60524c89..9497efe0d 100644 --- a/merlin/schema/schema.py +++ b/merlin/schema/schema.py @@ -19,7 +19,7 @@ import numpy as np import pandas as pd -from .tags import Tags, TagSet +from merlin.schema.tags import Tags, TagSet @dataclass(frozen=True)