Dummy

Module for the DummyWriter class.

koheesio.spark.writers.dummy.DummyWriter #

A simple DummyWriter that performs the equivalent of a df.show() on the given DataFrame and returns the first row of data as a dict.

This Writer does not actually write anything to a source/destination, but is useful for debugging or testing purposes.

Parameters:

Name	Type	Description	Default
`n`	`PositiveInt`	Number of rows to show.	`20`
`truncate`	`bool \| PositiveInt`	If set to `True`, strings longer than 20 characters are truncated. If set to a number greater than one, long strings are truncated to length `truncate` and cells are right-aligned.	`True`
`vertical`	`bool`	If set to `True`, print output rows vertically (one line per column value).	`False`

n `class-attribute` `instance-attribute` #

# Row count forwarded to show_string() as `n` when the DataFrame is rendered.
n: PositiveInt = Field(gt=0, default=20, description="Number of rows to show.")

truncate `class-attribute` `instance-attribute` #

# Truncation setting forwarded to show_string() as `truncate`.
# The description previously lacked a space between its two sentences ("by default.If set").
truncate: Union[bool, PositiveInt] = Field(
    default=True,
    description="If set to ``True``, truncate strings longer than 20 chars by default. If set to a number greater than one, truncates long strings to length ``truncate`` and align cells right.",
)

vertical `class-attribute` `instance-attribute` #

# Layout flag forwarded to show_string() as `vertical`.
vertical: bool = Field(
    description="If set to ``True``, print output rows vertically (one line per column value).",
    default=False,
)

Output #

DummyWriter output

df_content `class-attribute` `instance-attribute` #

# Text rendering of the DataFrame; assigned by execute().
df_content: str = Field(description="The content of the DataFrame as a string", default=...)

head `class-attribute` `instance-attribute` #

# First row of the DataFrame converted to a dict; assigned by execute().
head: Dict[str, Any] = Field(description="The first row of the DataFrame as a dict", default=...)

execute #

execute() -> Output

Execute the DummyWriter

Source code in src/koheesio/spark/writers/dummy.py

def execute(self) -> Output:
    """Execute the DummyWriter

    Renders ``self.df`` as text via ``show_string`` (using ``self.n``,
    ``self.truncate`` and ``self.vertical``), logs that text at INFO level,
    and stores the results on ``self.output``:

    - ``head``: the first row of the DataFrame as a dict, or an empty dict
      when the DataFrame has no rows.
    - ``df_content``: the rendered text.
    """
    # logs the equivalent of doing df.show()
    df_content = show_string(df=self.df, n=self.n, truncate=self.truncate, vertical=self.vertical)
    self.log.info(f"content of df that was passed to DummyWriter:\n{df_content}")

    # DataFrame.head() gives None for an empty DataFrame; calling .asDict() on
    # that would raise AttributeError, so fall back to an empty dict instead.
    first_row = self.df.head()
    self.output.head = first_row.asDict() if first_row is not None else {}
    self.output.df_content = df_content

int_truncate #

int_truncate(truncate_value: Union[int, bool]) -> int

Truncate is either a bool or an int.

Parameters:

truncate_value (int | bool, optional, default=True): If int, specifies the maximum length of the string. If bool and True, defaults to a maximum length of 20 characters.

Returns:

int: The maximum length of the string.

Source code in src/koheesio/spark/writers/dummy.py

@field_validator("truncate")
def int_truncate(cls, truncate_value: Union[int, bool]) -> int:
    """
    Normalize the ``truncate`` setting to an integer length.

    Parameters:
    -----------
    truncate_value : int | bool
        The literal ``True`` maps to a length of 20 characters.
        Anything else is converted with ``int()``: ``False`` becomes 0 and
        an integer is returned as given.

    Returns:
    --------
    int
        The maximum length of the string.

    """
    # Same logic as what is inside DataFrame.show()
    if truncate_value is not True:
        # False turns into 0; a user-supplied number passes through unchanged
        return int(truncate_value)
    # a literal True selects the default width of 20 chars
    return 20

Dummy

koheesio.spark.writers.dummy.DummyWriter #

n class-attribute instance-attribute #

truncate class-attribute instance-attribute #

vertical class-attribute instance-attribute #