Spark Expectations
Koheesio step for running data quality rules with the Spark Expectations engine.
koheesio.integrations.spark.dq.spark_expectations.SparkExpectationsTransformation
Run DQ rules for an input dataframe with the Spark Expectations engine.
References
Spark Expectations: https://engineering.nike.com/spark-expectations/1.0.0/
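The step is configured like any other Koheesio transformation, using the fields documented below. A minimal sketch, assuming the field names (or their aliases) can be passed as constructor keywords; all table names are illustrative placeholders:

```python
from koheesio.integrations.spark.dq.spark_expectations import (
    SparkExpectationsTransformation,
)

dq_step = SparkExpectationsTransformation(
    product_id="my_product",                 # Spark Expectations product identifier
    rules_table="catalog.dq.dq_rules",       # table holding the DQ rules
    statistics_table="catalog.dq.dq_stats",  # table receiving DQ run statistics
    target_table="catalog.silver.orders",    # "good" records; errors land in ..._err
)
```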
drop_meta_column
class-attribute
instance-attribute
drop_meta_column: bool = Field(
default=False,
alias="drop_meta_columns",
description="Whether to drop meta columns added by spark expectations on the output df",
)
enable_debugger
class-attribute
instance-attribute
enable_debugger: bool = Field(
default=False, alias="debugger", description="..."
)
error_writer_format
class-attribute
instance-attribute
error_writer_format: Optional[str] = Field(
default="delta",
alias="dataframe_writer_format",
description="The format used to write to the err table",
)
error_writer_mode
class-attribute
instance-attribute
error_writer_mode: Optional[Union[str, BatchOutputMode]] = Field(
    default=APPEND,
    alias="dataframe_writer_mode",
    description="The write mode that will be used to write to the err table",
)
error_writing_options
class-attribute
instance-attribute
error_writing_options: Optional[Dict[str, str]] = Field(
default_factory=dict,
alias="error_writing_options",
description="Options for writing to the error table",
)
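These options are passed through to the writer for the error table. A hedged example, assuming a Delta error table; mergeSchema is a standard Delta writer option, not something this step defines:

```python
# Illustrative: allow schema evolution when appending to the Delta _err table.
error_writing_options = {"mergeSchema": "true"}
```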
format
class-attribute
instance-attribute
format: str = Field(
default="delta",
alias="dataframe_writer_format",
description="The format used to write to the stats and err table. Separate output formats can be specified for each table using the error_writer_format and stats_writer_format params",
)
mode
class-attribute
instance-attribute
mode: Union[str, BatchOutputMode] = Field(
default=APPEND,
alias="dataframe_writer_mode",
description="The write mode that will be used to write to the err and stats table. Separate output modes can be specified for each table using the error_writer_mode and stats_writer_mode params",
)
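format and mode set the defaults for both the stats and err tables, while the per-table fields (error_writer_format, error_writer_mode, stats_writer_format, stats_writer_mode) override them individually. A sketch with illustrative values:

```python
dq_step = SparkExpectationsTransformation(
    product_id="my_product",
    rules_table="catalog.dq.dq_rules",
    statistics_table="catalog.dq.dq_stats",
    target_table="catalog.silver.orders",
    format="delta",                 # default format for both tables
    mode="append",                  # default write mode for both tables
    stats_writer_format="parquet",  # override just the stats table format
    stats_writer_mode="overwrite",  # override just the stats table mode
)
```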
product_id
class-attribute
instance-attribute
product_id: str = Field(
default=...,
description="Spark Expectations product identifier",
)
rules_table
class-attribute
instance-attribute
rules_table: str = Field(
default=...,
alias="product_rules_table",
description="DQ rules table",
)
se_user_conf
class-attribute
instance-attribute
se_user_conf: Dict[str, Any] = Field(
    default={
        user_config.se_notifications_enable_email: False,
        user_config.se_notifications_enable_slack: False,
    },
    alias="user_conf",
    description="SE user provided confs",
    validate_default=False,
)
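The dictionary keys are the constants exposed by Spark Expectations itself. A sketch enabling Slack notifications; the exact set of supported keys depends on the installed spark-expectations version, and the webhook URL is a placeholder:

```python
from spark_expectations.config.user_config import Constants as user_config

user_conf = {
    user_config.se_notifications_enable_email: False,
    user_config.se_notifications_enable_slack: True,
    user_config.se_notifications_slack_webhook_url: "https://hooks.slack.com/services/...",  # placeholder
}

dq_step = SparkExpectationsTransformation(
    product_id="my_product",
    rules_table="catalog.dq.dq_rules",
    statistics_table="catalog.dq.dq_stats",
    target_table="catalog.silver.orders",
    user_conf=user_conf,
)
```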
statistics_streaming
class-attribute
instance-attribute
statistics_streaming: Dict[str, Any] = Field(
    default={user_config.se_enable_streaming: False},
    alias="stats_streaming_options",
    description="SE stats streaming options",
    validate_default=False,
)
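The default simply disables streaming, again keyed on a Spark Expectations constant. Enabling it requires the additional streaming (e.g. Kafka) settings documented by Spark Expectations, which are omitted here:

```python
from spark_expectations.config.user_config import Constants as user_config

# Explicit equivalent of the default: do not stream DQ stats anywhere.
stats_streaming_options = {user_config.se_enable_streaming: False}
```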
statistics_table
class-attribute
instance-attribute
statistics_table: str = Field(
default=...,
alias="dq_stats_table_name",
description="DQ stats table",
)
stats_writer_format
class-attribute
instance-attribute
stats_writer_format: Optional[str] = Field(
default="delta",
alias="stats_writer_format",
description="The format used to write to the stats table",
)
stats_writer_mode
class-attribute
instance-attribute
stats_writer_mode: Optional[Union[str, BatchOutputMode]] = Field(
    default=APPEND,
    alias="stats_writer_mode",
    description="The write mode that will be used to write to the stats table",
)
target_table
class-attribute
instance-attribute
target_table: str = Field(
    default=...,
    alias="target_table_name",
    description="The table that will contain the good records. The step does not write to it directly; Spark Expectations writes failing records to a table with the same name plus an _err suffix",
)
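For example, with target_table="catalog.silver.orders", Spark Expectations writes the failing records to catalog.silver.orders_err, while the good records come back on the step's output DataFrame for a downstream writer to persist.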
Output
Output of the SparkExpectationsTransformation step.
error_table_writer
class-attribute
instance-attribute
error_table_writer: WrappedDataFrameWriter = Field(
default=...,
description="Spark Expectations error table writer",
)
rules_df
class-attribute
instance-attribute
se
class-attribute
instance-attribute
stats_table_writer
class-attribute
instance-attribute
stats_table_writer: WrappedDataFrameWriter = Field(
default=...,
description="Spark Expectations stats table writer",
)
execute
execute() -> Output
Apply data quality rules to a dataframe using the out-of-the-box SE decorator.
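End to end, the step behaves like any Koheesio transformation. A sketch, assuming the standard Transformation entry point transform(df), which executes the step and returns the output DataFrame; the source table is illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
input_df = spark.read.table("catalog.bronze.orders")  # illustrative source

good_df = dq_step.transform(input_df)  # applies the rules via the SE decorator
# good_df holds the records that passed; the SE meta columns are kept
# unless drop_meta_columns=True was set.
```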