# Source code for dagster_pandas.constraints

import sys
from collections import defaultdict
from datetime import datetime
from functools import wraps

import pandas as pd
from pandas import DataFrame

from dagster import DagsterType, MetadataEntry, TypeCheck
from dagster import _check as check
from dagster._utils.backcompat import experimental_class_warning


class ConstraintViolationException(Exception):
    """Raised when a dagster-pandas constraint check fails."""


class ConstraintWithMetadataException(Exception):
    """
    Response generated when a pandas DF fails validation -- it can be rendered as
    either a raised exception or a failed typecheck.

    Args:
        constraint_name (str):  the name of the violated constraint
        constraint_description (Optional[str]): the description of the violated constraint
        expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string
        offending (Optional[Union[dict,list, str, set]]):  which pieces of the dataframe violated the expectation, typically list or string
        actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike
    """

    def __init__(
        self,
        constraint_name,
        constraint_description="",
        expectation=None,
        offending=None,
        actual=None,
    ):
        self.constraint_name = constraint_name
        self.constraint_description = constraint_description
        # Each metadata payload must be a json-like container or a string.
        self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))
        self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))
        self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))
        message = "Violated {} - {}, {} was/were expected, but we received {} which was/were {}".format(
            constraint_name,
            constraint_description,
            expectation,
            offending,
            actual,
        )
        super().__init__(message)

    def normalize_metadata_json_value(self, val):
        # Sets are not json-serializable; coerce them to lists.
        return list(val) if isinstance(val, set) else val

    def convert_to_metadata(self):
        """Package the violation details into a MetadataEntry."""
        payload = {
            "constraint_name": self.constraint_name,
            "constraint_description": self.constraint_description,
            "expected": self.normalize_metadata_json_value(self.expectation),
            "offending": self.normalize_metadata_json_value(self.offending),
            "actual": self.normalize_metadata_json_value(self.actual),
        }
        return MetadataEntry("constraint-metadata", value=payload)

    def return_as_typecheck(self):
        """Render this failure as a failed TypeCheck carrying the metadata."""
        return TypeCheck(
            success=False, description=self.args[0], metadata_entries=[self.convert_to_metadata()]
        )


class DataFrameConstraintViolationException(ConstraintViolationException):
    """Indicates a dataframe-level constraint has been violated."""

    def __init__(self, constraint_name, constraint_description):
        message = f"Violated {constraint_name} - {constraint_description}"
        super().__init__(message)


class DataFrameWithMetadataException(ConstraintWithMetadataException):
    """Constraint failure for checks that apply to a dataframe as a whole."""

    def __init__(self, constraint_name, constraint_description, expectation, actual):
        # For frame-level checks the offending entity is always the dataframe itself.
        super().__init__(
            constraint_name, constraint_description, expectation, "a malformed dataframe", actual
        )


class ColumnConstraintViolationException(ConstraintViolationException):
    """Indicates that a column constraint has been violated.

    Args:
        constraint_name (str): name of the violated constraint
        constraint_description (str): description of the violated constraint
        column_name (str): the column that failed the check
        offending_rows (Optional): the (index, row value) pairs that violated the constraint
    """

    def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):
        self.constraint_name = constraint_name
        self.constraint_description = constraint_description
        self.column_name = column_name
        self.offending_rows = offending_rows
        super(ColumnConstraintViolationException, self).__init__(self.construct_message())

    def construct_message(self):
        """Build the human-readable violation message."""
        base_message = 'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'.format(
            constraint_name=self.constraint_name,
            constraint_description=self.constraint_description,
            column_name=self.column_name,
        )
        if self.offending_rows is not None:
            # Fix: a separating space was missing, producing a run-together message
            # like "...descriptionThe offending...".
            base_message += " The offending (index, row values) are the following: {}".format(
                self.offending_rows
            )
        return base_message


class ColumnWithMetadataException(ConstraintWithMetadataException):
    """Constraint failure for checks that apply to individual columns."""

    def __init__(self, constraint_name, constraint_description, expectation, offending, actual):
        prefixed_name = "the column constraint " + constraint_name
        super().__init__(
            prefixed_name,
            constraint_description,
            expectation,
            offending,
            actual,
        )


class Constraint:
    """
    Base constraint object that all constraints inherit from.

    Args:
        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.
        markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.
    """

    def __init__(self, error_description=None, markdown_description=None):
        # The constraint's name defaults to the concrete subclass name, for error reporting.
        self.name = type(self).__name__
        self.markdown_description = check.str_param(markdown_description, "markdown_description")
        self.error_description = check.str_param(error_description, "error_description")


class ConstraintWithMetadata:
    """
    This class defines a base constraint over pandas DFs with organized metadata

    args:
        description (str): description of the constraint
        validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:
                    the validation function to run over inputted data
                    This function should return a tuple of a boolean for success or failure, and a dict containing
                    metadata about the test -- this metadata will be passed to the resulting exception if validation
                    fails.
        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce
        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event
                    (if set to False) when validation fails
        name (Optional[str]): what to call the constraint, defaults to the class name.
    """

    # TODO:  validation_fn returning metadata is sorta broken.  maybe have it yield typecheck events and grab metadata?

    def __init__(
        self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None
    ):
        experimental_class_warning(self.__class__.__name__)
        if name is None:
            self.name = self.__class__.__name__
        else:
            self.name = name
        self.description = description
        # should return a tuple of (bool, and either an empty dict or a dict of extra params)
        self.validation_fn = validation_fn
        self.resulting_exception = resulting_exception
        self.raise_or_typecheck = raise_or_typecheck

    def validate(self, data, *args, **kwargs):
        res = self.validation_fn(data, *args, **kwargs)
        if not res[0]:
            exc = self.resulting_exception(
                constraint_name=self.name, constraint_description=self.description, **res[1]
            )

            if self.raise_or_typecheck:
                raise exc
            else:
                return exc.return_as_typecheck()

        else:
            if res[0]:
                return TypeCheck(success=True)

    # TODO:  composition of validations
    def as_dagster_type(self, *args, **kwargs):
        if self.raise_or_typecheck:
            raise Exception(
                "Dagster types can only be constructed from constraints that return typechecks"
            )
        return DagsterType(
            name=self.name,
            description="A Pandas DataFrame with the following validation: {}".format(
                self.description
            ),
            type_check_fn=lambda x: self.validate(x, *args),
            **kwargs,
        )


class MultiConstraintWithMetadata(ConstraintWithMetadata):
    """
    Use this class if you have multiple constraints to check over the entire dataframe.

    Args:
        description (str): description of the constraint
        validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):
                    a list of the validation functions to run over inputted data
                    Each function should return a tuple of a boolean for success or failure, and a dict containing
                    metadata about the test -- this metadata will be passed to the resulting exception if validation
                    fails.
        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce
        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event
                    (if set to False) when validation fails
        name (Optional[str]): what to call the constraint, defaults to the class name.
    """

    def __init__(
        self,
        description,
        validation_fn_arr,
        resulting_exception,
        raise_or_typecheck=True,
        name=None,
    ):
        validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")

        def validation_fn(data, *args, **kwargs):
            # Run every validation function; overall success requires all to pass.
            results = [f(data, *args, **kwargs) for f in validation_fn_arr]
            truthparam = all(item[0] for item in results)
            # Aggregate per-key metadata, tagging each entry with the name of the
            # validation function that produced it.
            metadict = defaultdict(dict)
            for fn, (_, meta) in zip(validation_fn_arr, results):
                for key, value in meta.items():
                    metadict[key][fn.__name__] = value
            return (truthparam, metadict)

        super(MultiConstraintWithMetadata, self).__init__(
            description,
            validation_fn,
            resulting_exception,
            raise_or_typecheck=raise_or_typecheck,
            name=name,
        )


class StrictColumnsWithMetadata(ConstraintWithMetadata):
    """Checks that a dataframe has exactly the given columns, optionally in the given order."""

    def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):
        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")
        self.column_list = check.list_param(column_list, "strict_column_list", of_type=str)

        def validation_fcn(inframe):
            # Exact match (including order) always passes.
            if list(inframe.columns) == column_list:
                return (True, {})
            if self.enforce_ordering:
                return (False, {"expectation": self.column_list, "actual": list(inframe.columns)})
            # Order doesn't matter: compare as sets.
            if set(inframe.columns) == set(column_list):
                return (True, {})
            extra = [x for x in inframe.columns if x not in set(column_list)]
            missing = [x for x in set(column_list) if x not in inframe.columns]
            resdict = {
                "expectation": self.column_list,
                "actual": {"extra_columns": extra, "missing_columns": missing},
            }
            return (False, resdict)

        basestr = "ensuring that the right columns, {} were present".format(self.column_list)
        if enforce_ordering:
            basestr += " in the right order"
        super(StrictColumnsWithMetadata, self).__init__(
            basestr,
            validation_fcn,
            DataFrameWithMetadataException,
            raise_or_typecheck=raise_or_typecheck,
            name=name,
        )


class DataFrameConstraint(Constraint):
    """
    Base constraint object that represents dataframe shape constraints.

    Args:
        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.
        markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.
    """

    def __init__(self, error_description=None, markdown_description=None):
        super().__init__(
            error_description=error_description, markdown_description=markdown_description
        )

    def validate(self, dataframe):
        """Subclasses must implement validation against a dataframe."""
        raise NotImplementedError()


class StrictColumnsConstraint(DataFrameConstraint):
    """
    A dataframe constraint that validates column existence and ordering.

    Args:
        strict_column_list (List[str]): The exact list of columns that your dataframe must have.
        enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names
            must match. Default is False.
    """

    def __init__(self, strict_column_list, enforce_ordering=False):
        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")
        self.strict_column_list = check.list_param(
            strict_column_list, "strict_column_list", of_type=str
        )
        description = "No columns outside of {cols} allowed. ".format(cols=self.strict_column_list)
        if enforce_ordering:
            description += "Columns must be in that order."
        super(StrictColumnsConstraint, self).__init__(
            error_description=description, markdown_description=description
        )

    def validate(self, dataframe):
        """Raise DataFrameConstraintViolationException on unexpected or misordered columns."""
        check.inst_param(dataframe, "dataframe", DataFrame)
        columns_received = list(dataframe.columns)
        if self.enforce_ordering:
            if self.strict_column_list != columns_received:
                raise DataFrameConstraintViolationException(
                    constraint_name=self.name,
                    constraint_description="Expected the following ordering of columns {expected}. Received: {received}".format(
                        expected=self.strict_column_list, received=columns_received
                    ),
                )
        for column in columns_received:
            if column not in self.strict_column_list:
                # Fix: corrected "Recevied" typo in the error message.
                raise DataFrameConstraintViolationException(
                    constraint_name=self.name,
                    constraint_description="Expected {}. Received {}.".format(
                        self.strict_column_list, columns_received
                    ),
                )
class RowCountConstraint(DataFrameConstraint):
    """
    A dataframe constraint that validates the expected count of rows.

    Args:
        num_allowed_rows (int): The number of allowed rows in your dataframe.
        error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain.
            Defaults to 0.
    """

    def __init__(self, num_allowed_rows, error_tolerance=0):
        self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")
        # Tolerance is taken as an absolute value so negative inputs behave symmetrically.
        self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))
        if self.error_tolerance > self.num_allowed_rows:
            raise ValueError("Tolerance can't be greater than the number of rows you expect.")
        description = "Dataframe must have {} +- {} rows.".format(
            self.num_allowed_rows, self.error_tolerance
        )
        super(RowCountConstraint, self).__init__(
            error_description=description, markdown_description=description
        )

    def validate(self, dataframe):
        """Raise unless the row count is within num_allowed_rows +- error_tolerance."""
        check.inst_param(dataframe, "dataframe", DataFrame)
        n_rows = len(dataframe)
        lower_bound = self.num_allowed_rows - self.error_tolerance
        upper_bound = self.num_allowed_rows + self.error_tolerance
        if not lower_bound <= n_rows <= upper_bound:
            raise DataFrameConstraintViolationException(
                constraint_name=self.name,
                constraint_description="Expected {expected} +- {tolerance} rows. Got {received}".format(
                    expected=self.num_allowed_rows,
                    tolerance=self.error_tolerance,
                    received=n_rows,
                ),
            )
def apply_ignore_missing_data_to_mask(mask, column):
    # Null entries are exempt from the constraint: remove them from the violation mask
    # so that only non-null offenders remain flagged.
    return mask & ~column.isnull()


class ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):
    """
    Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.

    args:
        description (str): description of the constraint
        validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:
                    the validation function to run over inputted data
                    This function should return a tuple of a boolean for success or failure, and a dict containing
                    metadata about the test -- this metadata will be passed to the resulting exception if validation
                    fails.
        resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce
        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck
                    event (if set to False) when validation fails
        name (Optional[str]): what to call the constraint, defaults to the class name.
    """

    def validate(self, data, *columns, **kwargs):
        # No explicit columns -> validate every column; silently drop names not in the frame.
        if len(columns) == 0:
            columns = data.columns
        columns = [column for column in columns if column in data.columns]
        relevant_data = data[list(columns)]

        offending_columns = set()
        offending_values = {}
        for column in columns:
            # TODO: grab extra metadata
            res = self.validation_fn(relevant_data[column])
            if not res[0]:
                offending_columns.add(column)
                # Prefer the validator-reported offending values ("actual"); fall back to
                # the whole column. `.item()` converts numpy scalars to native Python values.
                if not res[1].get("actual") is None:
                    offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]
                else:
                    offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]
        if len(offending_columns) == 0 and not self.raise_or_typecheck:
            return TypeCheck(success=True)
        elif len(offending_columns) > 0:
            # NOTE(review): on success with raise_or_typecheck=True this method implicitly
            # returns None -- confirm callers rely only on the failure paths.
            metadict = {
                "expectation": self.description.replace("Confirms", ""),
                "actual": offending_values,
                "offending": offending_columns,
            }
            exc = self.resulting_exception(
                constraint_name=self.name, constraint_description=self.description, **metadict
            )

            if self.raise_or_typecheck:
                raise exc
            else:
                return exc.return_as_typecheck()


class ColumnConstraintWithMetadata(ConstraintWithMetadata):
    """
    This class is useful for constructing single constraints that
    you want to apply to multiple columns of your dataframe

    The main difference from the base class in terms of construction is that now, your validation_fns should
    operate on individual values.

    args:
        description (str): description of the constraint
        validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:
                    the validation function to run over inputted data
                    This function should return a tuple of a boolean for success or failure, and a dict containing
                    metadata about the test -- this metadata will be passed to the resulting exception if validation
                    fails.
        resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce
        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck
                    event (if set to False) when validation fails
        name (Optional[str]): what to call the constraint, defaults to the class name.
    """

    def validate(self, data, *columns, **kwargs):
        # No explicit columns -> validate every column; silently drop names not in the frame.
        if len(columns) == 0:
            columns = data.columns
        columns = [column for column in columns if column in data.columns]
        relevant_data = data[list(columns)]
        offending = {}
        offending_values = {}
        # TODO: grab metadata from here
        # Select rows where the per-value validator reports failure.
        inverse_validation = lambda x: not self.validation_fn(x)[0]
        for column in columns:
            results = relevant_data[relevant_data[column].apply(inverse_validation)]
            if len(results.index.tolist()) > 0:
                offending[column] = ["row " + str(i) for i in (results.index.tolist())]
                offending_values[column] = results[column].tolist()
        if len(offending) == 0:
            # NOTE(review): when validation succeeds and raise_or_typecheck=True, this
            # implicitly returns None -- confirm callers rely only on the failure paths.
            if not self.raise_or_typecheck:
                return TypeCheck(success=True)
        else:
            metadict = {
                # The validator's docstring doubles as the expectation text at runtime.
                "expectation": self.validation_fn.__doc__,
                "actual": offending_values,
                "offending": offending,
            }
            exc = self.resulting_exception(
                constraint_name=self.name, constraint_description=self.description, **metadict
            )

            if self.raise_or_typecheck:
                raise exc
            else:
                return exc.return_as_typecheck()


class MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):
    """
    This class is useful for constructing more complicated relationships between columns
    and expectations -- i.e. you want some validations on column A, others on column B, etc.
    This lets you package up the metadata neatly, and also allows for cases like
    'fail if any one of these constraints fails but still run all of them'

    Args:
        description (str): description of the overall set of validations
        fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):
                            while this is a relatively complex type,
                            what it amounts to is 'a dict mapping columns to the functions to
                            run on them'
        resulting_exception (type): the response to generate if validation fails. Subclass of
                            ConstraintWithMetadataException
        raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)
        type_for_internal (Optional[type]): what type to use for internal validators.  Subclass of
                            ConstraintWithMetadata
        name (Optional[str]): what to call the constraint, defaults to the class name.
    """

    def __init__(
        self,
        description,
        fn_and_columns_dict,
        resulting_exception,
        raise_or_typecheck=True,
        type_for_internal=ColumnConstraintWithMetadata,
        name=None,
    ):
        # TODO:  support multiple descriptions
        self.column_to_fn_dict = check.dict_param(
            fn_and_columns_dict, "fn_and_columns_dict", key_type=str
        )

        def validation_fn(data, *args, **kwargs):
            # Aggregates per-column, per-validator failure metadata into a nested dict:
            # metadict[meta_key][column][fn_name] = value.
            metadict = defaultdict(dict)
            truthparam = True
            for column, fn_arr in self.column_to_fn_dict.items():
                if column not in data.columns:
                    continue
                for fn in fn_arr:
                    # TODO: do this more effectively
                    # Each validator runs wrapped in a non-raising internal constraint so all
                    # validators execute even when earlier ones fail.
                    new_validator = type_for_internal(
                        fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False
                    )
                    result = new_validator.validate(
                        DataFrame(data[column]), column, *args, **kwargs
                    )
                    result_val = result.success
                    if result_val:
                        continue
                    result_dict = result.metadata_entries[0].entry_data.data
                    truthparam = truthparam and result_val
                    for key in result_dict.keys():
                        if "constraint" not in key:
                            if key == "expected":
                                # Normalize the docstring-derived expectation text.
                                new_key = "expectation"
                                result_dict[key] = result_dict[key].replace("returns", "").strip()
                                if column not in metadict[new_key] or new_key not in metadict:
                                    metadict[new_key][column] = dict()
                                metadict[new_key][column][fn.__name__] = result_dict[key]
                            else:
                                if column not in metadict[key] or key not in metadict:
                                    metadict[key][column] = dict()
                                if isinstance(result_dict[key], dict):
                                    metadict[key][column][fn.__name__] = result_dict[key][column]
                                else:
                                    # Non-dict metadata (e.g. a pandas object) is summarized.
                                    metadict[key][column][fn.__name__] = "a violation"
            return truthparam, metadict

        super(MultiColumnConstraintWithMetadata, self).__init__(
            description,
            validation_fn,
            resulting_exception,
            raise_or_typecheck=raise_or_typecheck,
            name=name,
        )

    def validate(self, data, *args, **kwargs):
        # Bypass ColumnConstraintWithMetadata.validate: the composed validation_fn above
        # already handles column dispatch, so defer straight to the base implementation.
        return ConstraintWithMetadata.validate(self, data, *args, **kwargs)


class MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):
    """
    This class is similar to multicolumn, but takes in functions that operate on the whole column at once
    rather than ones that operate on each value --
    consider this similar to the difference between apply-map and apply aggregate.

    Args:
        description (str): description of the overall set of validations (TODO:  support multiple descriptions)
        fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):
                            while this is a relatively complex type,
                            what it amounts to is a dict mapping columns to the functions to
                            run on them'
        resulting_exception (type): the response to generate if validation fails. Subclass of
                            ConstraintWithMetadataException
        raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)
        type_for_internal (Optional[type]): what type to use for internal validators.  Subclass of
                            ConstraintWithMetadata
        name (Optional[str]): what to call the constraint, defaults to the class name.
    """

    def __init__(
        self,
        description,
        fn_and_columns_dict,
        resulting_exception,
        raise_or_typecheck=True,
        name=None,
    ):
        # Same machinery as MultiColumnConstraintWithMetadata, but the internal validators
        # receive whole pd.Series rather than individual values.
        super(MultiAggregateConstraintWithMetadata, self).__init__(
            description,
            fn_and_columns_dict,
            resulting_exception,
            raise_or_typecheck=raise_or_typecheck,
            type_for_internal=ColumnAggregateConstraintWithMetadata,
            name=name,
        )


def non_null_validation(x):
    """
    validates that a particular value in a column is not null

    Usage:
        pass this as a column validator to
        :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'
        or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'
        Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this
        directly.
    """
    return not pd.isnull(x), {}


def all_unique_validator(column, ignore_missing_vals=False):
    """
    validates that all values in an iterable are unique
    Returns duplicated values as metadata

    Usage:
        As a validation function for a
        :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'
        or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'
    Example:
        .. code-block:: python
            aggregate_validator = MultiAggregateConstraintWithMetadata(
                "confirms all values are unique",
                {'bar': [all_unique_validator]},
                ConstraintWithMetadataException,
                raise_or_typecheck=False,
            )
            ntype = create_structured_dataframe_type(
                "NumericType",
                columns_aggregate_validator=aggregate_validator
            )
            @op(out={'basic_dataframe': Out(dagster_type=ntype)})
            def create_dataframe(_):
                yield Output(
                    DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}),
                    output_name='basic_dataframe',
                )
            #will fail with
            metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}
            metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}
    """
    column = pd.Series(column)
    duplicated = column.duplicated()
    if ignore_missing_vals:
        duplicated = apply_ignore_missing_data_to_mask(duplicated, column)
    # Failure metadata carries the duplicated entries themselves.
    return not duplicated.any(), {"actual": column[duplicated]}


def nonnull(func):
    """
    decorator for column validation functions to make them error on nulls

    Usage:
        pass decorated functions as column validators to
        :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'
        or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'
    Args:
        func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):
            the column validator you want to error on nulls
    """

    @wraps(func)
    def nvalidator(val):
        origval = func(val)
        nval = non_null_validation(val)
        # NOTE(review): the wrapped function's metadata dict is discarded here -- confirm
        # that is intentional before relying on metadata from nonnull-wrapped validators.
        return origval[0] and nval[0], {}

    # @wraps copied func.__doc__, which is surfaced to users as the expectation text.
    nvalidator.__doc__ += " and ensures no values are null"

    return nvalidator


def column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):
    """
    factory for validators testing if column values are within a range

    Args:
        minim(Optional[Comparable]): the low end of the range
        maxim(Optional[Comparable]): the high end of the range
        ignore_missing_vals(Optional[bool]): whether to ignore nulls

    Returns: a validation function for this constraint
    Usage:
        pass returned functions as column validators to
        :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'
        or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'
    Examples:
        .. code-block:: python
            in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)
            column_validator = MultiColumnConstraintWithMetadata(
                "confirms values are numbers in a range",
                {'foo': [in_range_validator]},
                ColumnWithMetadataException,
                raise_or_typecheck=False,
            )
            ntype = create_structured_dataframe_type(
                "NumericType",
                columns_validator=column_validator
            )
            @op(out={'basic_dataframe': Out(dagster_type=ntype)})
            def create_dataframe(_):
                yield Output(
                    DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),
                    output_name='basic_dataframe',
                )
            #will fail with
            metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}
            metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}
    """
    # Fill in open-ended bounds with extreme sentinels, matching the type of the
    # bound that was provided (datetime vs numeric).
    if minim is None:
        if isinstance(maxim, datetime):
            minim = datetime.min
        else:
            minim = -1 * (sys.maxsize - 1)
    if maxim is None:
        if isinstance(minim, datetime):
            maxim = datetime.max
        else:
            maxim = sys.maxsize

    def in_range_validation_fn(x):
        if ignore_missing_vals and pd.isnull(x):
            return True, {}
        # Type check guards against comparing incompatible types (e.g. str vs int).
        return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}

    # The docstring is surfaced to users as the expectation text at runtime.
    in_range_validation_fn.__doc__ = "checks whether values are between {} and {}".format(
        minim, maxim
    )
    if ignore_missing_vals:
        in_range_validation_fn.__doc__ += ", ignoring nulls"

    return in_range_validation_fn


def categorical_column_validator_factory(categories, ignore_missing_vals=False):
    """
    factory for validators testing if all values are in some set

    Args:
        categories(Union[Sequence, set]): the set of allowed values
        ignore_missing_vals(Optional[bool]): whether to ignore nulls

    Returns: a validation function for this constraint

    Usage:
        pass returned functions as column validators to
        :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'
        or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'

    Example:
        .. code-block:: python
            categorical_validation_fn = categorical_column_validator_factory([1, 2])
            column_validator = MultiColumnConstraintWithMetadata(
                "confirms values are numbers in a range",
                {'foo': [categorical_validation_fn]},
                ColumnWithMetadataException,
                raise_or_typecheck=False,
            )
            ntype = create_structured_dataframe_type(
                "NumericType",
                columns_validator=column_validator
            )
            @op(out={'basic_dataframe': Out(dagster_type=ntype)})
            def create_dataframe(_):
                yield Output(
                    DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),
                    output_name='basic_dataframe',
                )
            #will fail with
            metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}
            metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}
    """
    # Copy into a set for O(1) membership tests.
    categories = set(categories)

    def categorical_validation_fn(x):
        if ignore_missing_vals and pd.isnull(x):
            return True, {}
        return (x in categories), {}

    # The docstring is surfaced to users as the expectation text at runtime.
    categorical_validation_fn.__doc__ = (
        "checks whether values are within this set of values: {}".format(categories)
    )
    if ignore_missing_vals:
        categorical_validation_fn.__doc__ += ", ignoring nulls"

    return categorical_validation_fn


def dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):
    """
    factory for testing if the dtype of a val falls within some allowed set

    Args:
        datatypes(Union[set[type], type]): which datatype/datatypes are allowed
        ignore_missing_vals(Optional[bool]): whether to ignore nulls

    Returns: a validation function for this constraint

    Usage:
        pass returned functions as column validators to
        :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'
        or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'

    Examples:
        .. code-block:: python
            dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))
            column_validator = MultiColumnConstraintWithMetadata(
                "confirms values are numbers in a range",
                {'foo': [dtype_is_num_validator]},
                ColumnWithMetadataException,
                raise_or_typecheck=False,
            )
            ntype = create_structured_dataframe_type(
                "NumericType",
                columns_validator=column_validator
            )
            @op(out={'basic_dataframe': Out(dagster_type=ntype)})
            def create_dataframe(_):
                yield Output(
                    DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}),
                    output_name='basic_dataframe',
                )
            #will fail with
            metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}
            metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}
    """

    def dtype_in_set_validation_fn(x):
        if ignore_missing_vals and pd.isnull(x):
            return True, {}
        # Despite the name, this checks each value's Python type, not the column dtype.
        return isinstance(x, datatypes), {}

    # The docstring is surfaced to users as the expectation text at runtime.
    dtype_in_set_validation_fn.__doc__ = "checks whether values are this type/types: {}".format(
        datatypes
    )
    if ignore_missing_vals:
        dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"

    return dtype_in_set_validation_fn


class ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):
    # Convenience constraint: range validation over a fixed (or discovered) set of columns.
    def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):
        self.name = self.__class__.__name__

        description = "Confirms values are between {} and {}".format(minim, maxim)
        super(ColumnRangeConstraintWithMetadata, self).__init__(
            description=description,
            validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),
            resulting_exception=ColumnWithMetadataException,
            raise_or_typecheck=raise_or_typecheck,
        )
        self.columns = columns

    def validate(self, data, *args, **kwargs):
        # Lazily default to all columns of the first validated frame.
        # NOTE(review): self.columns is mutated here (extend), so repeated validate calls
        # accumulate column names -- confirm this instance is not meant to be reused.
        if self.columns is None:
            self.columns = list(data.columns)
        self.columns.extend(args)
        return super(ColumnRangeConstraintWithMetadata, self).validate(
            data, *self.columns, **kwargs
        )


class ColumnConstraint(Constraint):
    """
    Base constraint object that represent dataframe column shape constraints.

    Args:
        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.
        markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.
    """

    def __init__(self, error_description=None, markdown_description=None):
        super(ColumnConstraint, self).__init__(
            error_description=error_description, markdown_description=markdown_description
        )

    def validate(self, dataframe, column_name):
        # Subclasses override with the actual column check.
        pass

    @staticmethod
    def get_offending_row_pairs(dataframe, column_name):
        # Pairs of (row index, offending value) for error reporting.
        return zip(dataframe.index.tolist(), dataframe[column_name].tolist())


class ColumnDTypeFnConstraint(ColumnConstraint):
    """
    A column constraint that applies a pandas dtype validation function to a columns dtype.

    Args:
        type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and
            returns if those dtypes match the types it expects. See pandas.core.dtypes.common for examples.
    """

    def __init__(self, type_fn):
        self.type_fn = check.callable_param(type_fn, "type_fn")
        description = f'Dtype must satisfy "{self.type_fn.__name__}"'
        super(ColumnDTypeFnConstraint, self).__init__(
            error_description=description, markdown_description=description
        )

    def validate(self, dataframe, column_name):
        column_dtype = dataframe[column_name].dtype
        if not self.type_fn(column_dtype):
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=f'{self.error_description}, but was "{column_dtype}"',
                column_name=column_name,
            )


class ColumnDTypeInSetConstraint(ColumnConstraint):
    """
    A column constraint that validates the pandas column dtypes based on the expected set of dtypes.

    Args:
        expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.
    """

    def __init__(self, expected_dtype_set):
        self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")
        description = "Column dtype must be in the following set {}.".format(
            self.expected_dtype_set
        )
        super(ColumnDTypeInSetConstraint, self).__init__(
            error_description=description, markdown_description=description
        )

    def validate(self, dataframe, column_name):
        received_dtypes = dataframe[column_name].dtype
        # Membership is tested on the dtype's string form (e.g. "int64").
        if str(received_dtypes) not in self.expected_dtype_set:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description="{base_error_message}. DTypes received: {received_dtypes}".format(
                    base_error_message=self.error_description, received_dtypes=received_dtypes
                ),
                column_name=column_name,
            )


class NonNullableColumnConstraint(ColumnConstraint):
    """
    A column constraint that ensures all values in a pandas column are not null.
    """

    def __init__(self):
        description = "No Null values allowed."
        super(NonNullableColumnConstraint, self).__init__(
            error_description=description, markdown_description=description
        )

    def validate(self, dataframe, column_name):
        rows_with_null_columns = dataframe[dataframe[column_name].isna()]
        if not rows_with_null_columns.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),
            )


class UniqueColumnConstraint(ColumnConstraint):
    """
    A column constraint that ensures all values in a pandas column are unique.

    Args:
        ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.
    """

    def __init__(self, ignore_missing_vals):
        description = "Column must be unique."
        self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
        super(UniqueColumnConstraint, self).__init__(
            error_description=description, markdown_description=description
        )

    def validate(self, dataframe, column_name):
        # duplicated() marks every occurrence after the first as a duplicate.
        invalid = dataframe[column_name].duplicated()
        if self.ignore_missing_vals:
            invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])
        rows_with_duplicated_values = dataframe[invalid]
        if not rows_with_duplicated_values.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=rows_with_duplicated_values,
            )


class CategoricalColumnConstraint(ColumnConstraint):
    """
    A column constraint that ensures all values in a pandas column are a valid category.

    Args:
        categories (Set[str]): Set of categories that values in your pandas column must match.
        ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.
    """

    def __init__(self, categories, ignore_missing_vals):
        self.categories = list(check.set_param(categories, "categories", of_type=str))
        self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
        super(CategoricalColumnConstraint, self).__init__(
            error_description="Expected Categories are {}".format(self.categories),
            # Only a sample of categories goes into the markdown to keep it short.
            markdown_description="Category examples are {}...".format(self.categories[:5]),
        )

    def validate(self, dataframe, column_name):
        invalid = ~dataframe[column_name].isin(self.categories)
        if self.ignore_missing_vals:
            invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])
        rows_with_unexpected_buckets = dataframe[invalid]
        if not rows_with_unexpected_buckets.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=rows_with_unexpected_buckets,
            )


class MinValueColumnConstraint(ColumnConstraint):
    """
    A column constraint that ensures all values in a pandas column are greater than the provided
    lower bound [inclusive].

    Args:
        min_value (Union[int, float, datetime.datetime]): The lower bound.
        ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.
    """

    def __init__(self, min_value, ignore_missing_vals):
        self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))
        self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
        super(MinValueColumnConstraint, self).__init__(
            markdown_description="values > {}".format(self.min_value),
            error_description="Column must have values > {}".format(self.min_value),
        )

    def validate(self, dataframe, column_name):
        # Strict less-than, so values equal to min_value pass (bound is inclusive).
        invalid = dataframe[column_name] < self.min_value
        if self.ignore_missing_vals:
            invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])
        out_of_bounds_rows = dataframe[invalid]
        if not out_of_bounds_rows.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=out_of_bounds_rows,
            )


class MaxValueColumnConstraint(ColumnConstraint):
    """
    A column constraint that ensures all values in a pandas column are less than the provided
    upper bound [inclusive].

    Args:
        max_value (Union[int, float, datetime.datetime]): The upper bound.
        ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.
    """

    def __init__(self, max_value, ignore_missing_vals):
        self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))
        self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
        super(MaxValueColumnConstraint, self).__init__(
            markdown_description="values < {}".format(self.max_value),
            error_description="Column must have values < {}".format(self.max_value),
        )

    def validate(self, dataframe, column_name):
        # Strict greater-than, so values equal to max_value pass (bound is inclusive).
        invalid = dataframe[column_name] > self.max_value
        if self.ignore_missing_vals:
            invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])
        out_of_bounds_rows = dataframe[invalid]
        if not out_of_bounds_rows.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=out_of_bounds_rows,
            )


class InRangeColumnConstraint(ColumnConstraint):
    """
    A column constraint that ensures all values in a pandas column are between the lower and upper
    bound [inclusive].

    Args:
        min_value (Union[int, float, datetime.datetime]): The lower bound.
        max_value (Union[int, float, datetime.datetime]): The upper bound.
        ignore_missing_vals (bool): If true, this constraint will enforce the
            constraint on non missing values.
    """

    def __init__(self, min_value, max_value, ignore_missing_vals):
        self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))
        self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))
        self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
        super(InRangeColumnConstraint, self).__init__(
            markdown_description="{} < values < {}".format(self.min_value, self.max_value),
            error_description="Column must have values between {} and {} inclusive.".format(
                self.min_value, self.max_value
            ),
        )

    def validate(self, dataframe, column_name):
        # Series.between is inclusive on both bounds by default.
        invalid = ~dataframe[column_name].between(self.min_value, self.max_value)
        if self.ignore_missing_vals:
            invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])
        out_of_bounds_rows = dataframe[invalid]
        if not out_of_bounds_rows.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=out_of_bounds_rows,
            )