File loader

Generic file Readers for different file formats.

Supported file formats:

- CSV
- Parquet
- Avro
- JSON
- ORC
- Text

Examples:

from koheesio.spark.readers import (
    CsvReader,
    ParquetReader,
    AvroReader,
    JsonReader,
    OrcReader,
)

csv_reader = CsvReader(path="path/to/file.csv", header=True)
parquet_reader = ParquetReader(path="path/to/file.parquet")
avro_reader = AvroReader(path="path/to/file.avro")
json_reader = JsonReader(path="path/to/file.json")
orc_reader = OrcReader(path="path/to/file.orc")

For more information about the available options, see Spark's official documentation.

koheesio.spark.readers.file_loader.AvroReader #

Reads an Avro file.

This class is a convenience class that sets the format field to FileFormat.avro.

Extra parameters can be passed to the reader using the extra_params attribute or as keyword arguments.

Example:

reader = AvroReader(path="path/to/file.avro", mergeSchema=True)

Make sure to have the spark-avro package installed in your environment.

For more information about the available options, see the official documentation.

format `class-attribute` `instance-attribute` #

format: FileFormat = avro

koheesio.spark.readers.file_loader.CsvReader #

Reads a CSV file.

This class is a convenience class that sets the format field to FileFormat.csv.

Extra parameters can be passed to the reader using the extra_params attribute or as keyword arguments.

Example:

reader = CsvReader(path="path/to/file.csv", header=True)

For more information about the available options, see the official pyspark documentation and read about CSV data source.

Also see the data sources generic options.

format `class-attribute` `instance-attribute` #

format: FileFormat = csv

koheesio.spark.readers.file_loader.FileFormat #

Supported file formats.

This enum represents the supported file formats that can be used with the FileLoader class. The available file formats are:

- csv: Comma-separated values format
- parquet: Apache Parquet format
- avro: Apache Avro format
- json: JavaScript Object Notation format
- orc: Apache ORC format
- text: Plain text format

avro `class-attribute` `instance-attribute` #

avro = 'avro'

csv `class-attribute` `instance-attribute` #

csv = 'csv'

json `class-attribute` `instance-attribute` #

json = 'json'

orc `class-attribute` `instance-attribute` #

orc = 'orc'

parquet `class-attribute` `instance-attribute` #

parquet = 'parquet'

text `class-attribute` `instance-attribute` #

text = 'text'

koheesio.spark.readers.file_loader.FileLoader #

Generic file reader.

Available file formats:
- CSV
- Parquet
- Avro
- JSON
- ORC
- Text (default)

Extra parameters can be passed to the reader using the `extra_params` attribute or as keyword arguments.

Example:
```python
reader = FileLoader(
    path="path/to/textfile.txt",
    format="text",
    header=True,
    lineSep="\n",
)
```

For more information about the available options, see Spark's
[official pyspark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.text.html)
and [read about text data source](https://spark.apache.org/docs/latest/sql-data-sources-text.html).

Also see the [data sources generic options](https://spark.apache.org/docs/3.5.0/sql-data-sources-generic-options.html).

format `class-attribute` `instance-attribute` #

format: FileFormat = Field(
    default=text, description="File format to read"
)

path `class-attribute` `instance-attribute` #

path: Union[Path, str] = Field(
    default=..., description="Path to the file to read"
)

schema_ `class-attribute` `instance-attribute` #

schema_: Optional[Union[StructType, str]] = Field(
    default=None,
    description="Schema to use when reading the file",
    validate_default=False,
    alias="schema",
)

streaming `class-attribute` `instance-attribute` #

streaming: Optional[bool] = Field(
    default=False,
    description="Whether to read the files as a Stream or not",
)

ensure_path_is_str #

ensure_path_is_str(
    path: Union[Path, str],
) -> Union[Path, str]

Ensure that the path is a string as required by Spark.

Source code in src/koheesio/spark/readers/file_loader.py

@field_validator("path")
def ensure_path_is_str(cls, path: Union[Path, str]) -> Union[Path, str]:
    """Convert a ``Path`` into its absolute, POSIX-style string form.

    Any value that is not a ``Path`` instance is returned untouched.
    """
    if not isinstance(path, Path):
        return path
    # `as_posix()` already produces a `str`, so no further conversion is needed.
    return path.absolute().as_posix()

execute #

execute() -> Output

Reads the file, in batch or as a stream, using the specified format and schema, while applying any extra parameters.

Source code in src/koheesio/spark/readers/file_loader.py

def execute(self) -> Reader.Output:
    """Read the file, in batch or as a stream, and store the result in ``self.output.df``.

    Uses ``self.spark.readStream`` when ``self.streaming`` is truthy and ``self.spark.read``
    otherwise, sets the format from ``self.format``, applies ``self.schema_`` and
    ``self.extra_params`` when they are truthy, and finally loads ``self.path``.
    """
    reader = self.spark.readStream if self.streaming else self.spark.read
    reader = reader.format(self.format)

    if self.schema_:
        # Keep the reader returned by `schema(...)`, exactly as is done for `format(...)`
        # and `options(...)`, instead of relying on the call mutating `reader` in place.
        reader = reader.schema(self.schema_)

    if self.extra_params:
        reader = reader.options(**self.extra_params)

    self.output.df = reader.load(self.path)  # type: ignore

koheesio.spark.readers.file_loader.JsonReader #

Reads a JSON file.

This class is a convenience class that sets the format field to FileFormat.json.

Extra parameters can be passed to the reader using the extra_params attribute or as keyword arguments.

Example:

reader = JsonReader(path="path/to/file.json", allowComments=True)

For more information about the available options, see the official pyspark documentation and read about JSON data source.

Also see the data sources generic options.

format `class-attribute` `instance-attribute` #

format: FileFormat = json

koheesio.spark.readers.file_loader.OrcReader #

Reads an ORC file.

This class is a convenience class that sets the format field to FileFormat.orc.

Extra parameters can be passed to the reader using the extra_params attribute or as keyword arguments.

Example:

reader = OrcReader(path="path/to/file.orc", mergeSchema=True)

For more information about the available options, see the official documentation and read about ORC data source.

Also see the data sources generic options.

format `class-attribute` `instance-attribute` #

format: FileFormat = orc

koheesio.spark.readers.file_loader.ParquetReader #

Reads a Parquet file.

This class is a convenience class that sets the format field to FileFormat.parquet.

Extra parameters can be passed to the reader using the extra_params attribute or as keyword arguments.

Example:

reader = ParquetReader(path="path/to/file.parquet", mergeSchema=True)

For more information about the available options, see the official pyspark documentation and read about Parquet data source.

Also see the data sources generic options.

format `class-attribute` `instance-attribute` #

format: FileFormat = parquet

File loader

koheesio.spark.readers.file_loader.AvroReader #

format class-attribute instance-attribute #

koheesio.spark.readers.file_loader.CsvReader #

format class-attribute instance-attribute #

koheesio.spark.readers.file_loader.FileFormat #

avro class-attribute instance-attribute #

csv class-attribute instance-attribute #

json class-attribute instance-attribute #

orc class-attribute instance-attribute #

parquet class-attribute instance-attribute #

text class-attribute instance-attribute #

koheesio.spark.readers.file_loader.FileLoader #

format class-attribute instance-attribute #

path class-attribute instance-attribute #

schema_ class-attribute instance-attribute #

streaming class-attribute instance-attribute #

ensure_path_is_str #

execute #

koheesio.spark.readers.file_loader.JsonReader #

format class-attribute instance-attribute #

koheesio.spark.readers.file_loader.OrcReader #

format class-attribute instance-attribute #

koheesio.spark.readers.file_loader.ParquetReader #

format class-attribute instance-attribute #

format `class-attribute` `instance-attribute` #

format `class-attribute` `instance-attribute` #

avro `class-attribute` `instance-attribute` #

csv `class-attribute` `instance-attribute` #

json `class-attribute` `instance-attribute` #

orc `class-attribute` `instance-attribute` #

parquet `class-attribute` `instance-attribute` #

text `class-attribute` `instance-attribute` #

format `class-attribute` `instance-attribute` #

path `class-attribute` `instance-attribute` #

schema_ `class-attribute` `instance-attribute` #

streaming `class-attribute` `instance-attribute` #

format `class-attribute` `instance-attribute` #

format `class-attribute` `instance-attribute` #

format `class-attribute` `instance-attribute` #