Skip to content

Download file

This module contains the implementation of the DownloadFileStep class, which is responsible for downloading files from a given URL and saving them to a specified local directory path.

It supports various file write modes such as overwrite, append, ignore, exclusive, and backup.

Classes:

Name Description
FileWriteMode

Enum representing different file write modes.

DownloadFileStep

Class for downloading files with support for different write modes.

koheesio.steps.download_file.DownloadFileStep #

Downloads a file from the given URL and saves it to the specified download path.

Example
URL = "http://example.com/testfile.txt"
download_path = Path("downloads")
step = DownloadFileStep(
    url=URL, download_path=download_path, mode="ignore"
)
step.execute()

In the above example, the file testfile.txt will be downloaded from the URL http://example.com/testfile.txt to the downloads directory.

Parameters:

Name Type Description Default
url str

The URL to download the file from.

required
download_path str

The local directory path where the file will be downloaded.

required
chunk_size int

The size (in bytes) of the chunks to download the file in, must be greater than or equal to 16.

8192
mode FileWriteMode

Write mode: overwrite, append, ignore, exclusive, or backup. See the docstring of FileWriteMode for more details.

FileWriteMode.OVERWRITE

chunk_size class-attribute instance-attribute #

# Size in bytes of each chunk requested from the response stream while downloading.
# Defaults to 8192; `ge=16` makes 16 the smallest accepted value (inclusive lower bound).
chunk_size: int = Field(
    8192,
    ge=16,
    description="The size (in bytes) of the chunks to download the file in, must be greater than or equal to 16.",
)

download_path class-attribute instance-attribute #

# Directory that receives the downloaded file; required (`...` means there is no default).
# NOTE(review): DirectoryPath is presumably pydantic's type that only accepts an existing directory — confirm.
download_path: DirectoryPath = Field(
    ...,
    description="The local directory path where the file will be downloaded to.",
)

mode class-attribute instance-attribute #

# Strategy applied when the target file already exists; defaults to overwriting it.
# The description lists exactly the members FileWriteMode defines
# (OVERWRITE, APPEND, IGNORE, EXCLUSIVE, BACKUP) — there is no "update" mode.
mode: FileWriteMode = Field(
    default=OVERWRITE,
    description="Write mode: overwrite, append, ignore, exclusive, or backup.",
)

Output #

download_file_path class-attribute instance-attribute #

# Full path of the downloaded file; required (`...` means there is no default).
# Both `execute` and `should_write_file` assign this value on the step's output.
# NOTE(review): FilePath is presumably pydantic's type that only accepts an existing file — confirm.
download_file_path: FilePath = Field(
    ...,
    description="The full path where the file was downloaded to.",
)

execute #

execute() -> Output

Executes the file download process, handling different write modes, and saving the file to the specified path.

Returns:

Type Description
Output

An instance of the Output class containing the path where the file was downloaded.

Source code in src/koheesio/steps/download_file.py
def execute(self) -> Output:
    """
    Download the file behind ``self.url`` into ``self.download_path``, honouring the configured write mode.

    The target filename is the last segment of the URL's *path* component, so a query string or
    fragment (``?token=...``, ``#section``) never ends up in the name of the file on disk.

    Returns
    -------
    Output
        The step's output. ``download_file_path`` holds the path the file was downloaded to, or the
        path of the already existing file when ``should_write_file`` decides nothing has to be written.
    """
    # Function-scope import keeps the fix self-contained (no change to the module's import block needed)
    from urllib.parse import urlparse

    _url = str(self.url)
    # Fall back to the raw URL's last segment when the URL has no path component (e.g. "http://host")
    _filename = Path(urlparse(_url).path).name or Path(_url).name
    _filepath = self.download_path / _filename
    _write_mode = self.mode.write_mode

    # Check if the file should be written based on the given mode
    if not self.should_write_file(_filepath, _filename):
        return self.output

    # Make sure the target file exists before it is opened with the mode-specific flags
    self.output.download_file_path = _filepath
    self.output.download_file_path.touch(exist_ok=True)

    with self._request(stream=True) as response:  # type: ignore
        with self.output.download_file_path.open(mode=_write_mode) as f:  # type: ignore
            self.log.debug(f"Writing to file {self.output.download_file_path}")
            for chunk in response.iter_content(chunk_size=self.chunk_size):
                self.log.debug(f"Downloading chunk of size {len(chunk)}")
                f.write(chunk)
                # Logged after the write so the byte count includes the current chunk
                self.log.debug(f"Downloaded {f.tell()} bytes")

    # Return the output on the success path too, matching the early-exit path and the annotation
    return self.output

should_write_file #

should_write_file(_filepath: Path, _filename: str) -> bool

Determine if the file should be written based on the write mode.

Parameters:

Name Type Description Default
_filepath Path

The path of the file to be written.

required
_filename str

The name of the file to be written.

required

Returns:

Type Description
bool

True if the file should be written, False otherwise.

Source code in src/koheesio/steps/download_file.py
def should_write_file(self, _filepath: Path, _filename: str) -> bool:
    """
    Decide, from the configured write mode, whether the download should be written to disk.

    Parameters
    ----------
    _filepath : Path
        Location the file would be written to.
    _filename : str
        Name of the file, used in log and error messages.

    Returns
    -------
    bool
        False only when the file already exists and the mode is IGNORE; True in every other
        non-raising case.

    Raises
    ------
    FileExistsError
        If the file already exists and the mode is EXCLUSIVE.
    """
    selected_mode = self.mode

    # Nothing on disk yet: every mode is allowed to create the file.
    if not _filepath.exists():
        return True

    # OVERWRITE and APPEND write to an existing file without any further checks.
    if selected_mode in {FileWriteMode.OVERWRITE, FileWriteMode.APPEND}:
        return True

    if selected_mode == FileWriteMode.IGNORE:
        # Keep the existing file untouched, but still report where it lives.
        self.log.info(f"File {_filepath} already exists. Ignoring {_filename} based on IGNORE mode.")
        self.output.download_file_path = _filepath
        return False

    if selected_mode == FileWriteMode.EXCLUSIVE:
        raise FileExistsError(
            f"File {_filepath} already exists. Cannot write to {_filename} based on EXCLUSIVE mode."
        )

    if selected_mode == FileWriteMode.BACKUP:
        # Move the existing file aside under a timestamped ".bak" name so a fresh file can be written.
        timestamp = int(time.time())
        backup_path = _filepath.with_suffix(f"{_filepath.suffix}.{timestamp}.bak")
        self.log.info(f"Creating backup of {_filename} as {backup_path}...")
        _filepath.rename(backup_path)

    return True

validate_mode #

validate_mode(
    v: Union[str, FileWriteMode],
) -> FileWriteMode

Ensure that the mode is a valid FileWriteMode.

Source code in src/koheesio/steps/download_file.py
@field_validator("mode")
def validate_mode(cls, v: Union[str, FileWriteMode]) -> FileWriteMode:
    """Normalise the ``mode`` field to a FileWriteMode.

    String values are resolved through ``FileWriteMode.from_string``; any other value is
    returned unchanged.
    """
    if isinstance(v, str):
        return FileWriteMode.from_string(v)
    return v

koheesio.steps.download_file.FileWriteMode #

The different write modes for the DownloadFileStep.

OVERWRITE:#
  • If the file exists, it will be overwritten.
  • If it does not exist, a new file will be created.
APPEND:#
  • If the file exists, the new data will be appended to it.
  • If it does not exist, a new file will be created.
IGNORE:#
  • If the file exists, the method will return without writing anything.
  • If it does not exist, a new file will be created.
EXCLUSIVE:#
  • If the file exists, an error will be raised.
  • If it does not exist, a new file will be created.
BACKUP:#
  • If the file exists, a backup will be created and the original file will be overwritten.
  • If it does not exist, a new file will be created.

APPEND class-attribute instance-attribute #

APPEND = 'append'

BACKUP class-attribute instance-attribute #

BACKUP = 'backup'

EXCLUSIVE class-attribute instance-attribute #

EXCLUSIVE = 'exclusive'

IGNORE class-attribute instance-attribute #

IGNORE = 'ignore'

OVERWRITE class-attribute instance-attribute #

OVERWRITE = 'overwrite'

write_mode property #

write_mode: str

Return the write mode for the given FileWriteMode.

from_string classmethod #

from_string(mode: str) -> FileWriteMode

Return the FileWriteMode for the given string.

Parameters:

Name Type Description Default
mode str

The string representation of the FileWriteMode.

required

Returns:

Type Description
FileWriteMode

The FileWriteMode enum corresponding to the given string

Source code in src/koheesio/steps/download_file.py
@classmethod
def from_string(cls, mode: str) -> FileWriteMode:
    """Look up the FileWriteMode whose member name matches ``mode``, ignoring letter case.

    Parameters
    ----------
    mode : str
        Name of the desired write mode in any letter case, e.g. ``"append"`` or ``"APPEND"``.

    Returns
    -------
    FileWriteMode
        The member whose name equals the upper-cased ``mode``.

    Raises
    ------
    KeyError
        If no member carries that name.
    """
    # Member names are upper-case, so normalise before the by-name lookup.
    member_name = mode.upper()
    return cls[member_name]