Kerchunking

reference

create_kerchunk_reference

create_kerchunk_reference(
    source_directory: Path,
    output_directory: Path,
    pattern: str = "*.nc",
    workers: int = 4,
    dry_run: bool = False,
    verbose: int = VERBOSE_LEVEL_DEFAULT,
)

Reference local NetCDF files using Kerchunk

Source code in rekx/reference.py
def create_kerchunk_reference(
    source_directory: Annotated[Path, typer_argument_source_directory],
    output_directory: Annotated[Path, typer_argument_output_directory],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.nc",
    workers: Annotated[int, typer_option_number_of_workers] = 4,
    dry_run: Annotated[bool, typer_option_dry_run] = False,
    verbose: Annotated[int, typer_option_verbose] = VERBOSE_LEVEL_DEFAULT,
):
    """Reference local NetCDF files using Kerchunk"""
    # import cProfile
    # import pstats
    # profiler = cProfile.Profile()
    # profiler.enable()

    file_paths = list(source_directory.glob(pattern))
    if not file_paths:
        logger.info("No files found in the source directory matching the pattern.")
        return
    if dry_run:
        print(
            f"[bold]Dry run[/bold] of [bold]operations that would be performed[/bold]:"
        )
        print(
            f"> Reading files in [code]{source_directory}[/code] matching the pattern [code]{pattern}[/code]"
        )
        print(f"> Number of files matched: {len(file_paths)}")
        print(f"> Creating single reference files to [code]{output_directory}[/code]")
        return  # Exit for a dry run
    output_directory.mkdir(parents=True, exist_ok=True)

    # Map verbosity level to display mode
    mode = DisplayMode(verbose)
    with display_context[mode]:
        with multiprocessing.Pool(processes=workers) as pool:
            from functools import partial

            partial_create_single_reference = partial(
                create_single_reference, output_directory=output_directory
            )
            results = pool.map(partial_create_single_reference, file_paths)
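
Example usage, as a minimal sketch of calling the function directly from Python, assuming hypothetical netcdf/ and references/ directories:

from pathlib import Path

from rekx.reference import create_kerchunk_reference

# Scan ./netcdf for files matching *.nc and write one JSON reference per file
create_kerchunk_reference(
    source_directory=Path("netcdf"),
    output_directory=Path("references"),
    pattern="*.nc",
    workers=4,
)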

create_single_reference

create_single_reference(
    file_path: Path,
    output_directory: Path,
    verbose: int = 0,
)

Helper function for create_kerchunk_reference()

Notes

Creates an MD5 hash for each new reference file in order to avoid regenerating the same reference on a renewed attempt to reference the same file. This is useful in the context of explorative massive processing.

Source code in rekx/reference.py
def create_single_reference(
    file_path: Path,
    output_directory: Path,
    # md5: bool = True,
    verbose: int = 0,
):
    """Helper function for create_kerchunk_reference()

    Notes
    -----

    Creates an MD5 hash for each new reference file in order to avoid
    regenerating the same reference on a renewed attempt to reference the
    same file.  This is useful in the context of explorative massive
    processing.

    """
    filename = file_path.stem
    output_file = f"{output_directory}/{filename}.json"
    hash_file = output_file + ".hash"
    generated_hash = generate_file_md5(file_path)
    local_fs = fsspec.filesystem("file")
    if local_fs.exists(output_file) and local_fs.exists(hash_file):
        logger.debug(f"Found a reference file '{output_file}' and a hash '{hash_file}'")
        with local_fs.open(hash_file, "r") as hf:
            existing_hash = hf.read().strip()

        if existing_hash == generated_hash:
            pass  # reference and hash already exist and match : skip regeneration
    else:
        logger.debug(
            f"Creating reference file '{output_file}' with hash '{generated_hash}'"
        )
        file_url = f"file://{file_path}"
        with fsspec.open(file_url, mode="rb") as input_file:
            h5chunks = SingleHdf5ToZarr(input_file, file_url, inline_threshold=0)
            json = ujson.dumps(h5chunks.translate()).encode()
            with local_fs.open(output_file, "wb") as f:
                f.write(json)
            with local_fs.open(hash_file, "w") as hf:
                hf.write(generated_hash)
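
A resulting single-file JSON reference can be opened lazily through fsspec's reference filesystem, mirroring the pattern used elsewhere in this module; the reference file path below is hypothetical:

import fsspec
import xarray as xr

# Build a reference filesystem over the JSON and open the dataset lazily
filesystem = fsspec.filesystem(
    "reference",
    fo="references/example.json",  # hypothetical single-file reference
    remote_protocol="file",
)
dataset = xr.open_dataset(
    filesystem.get_mapper(""),
    engine="zarr",
    chunks={},
    backend_kwargs={"consolidated": False},
)
print(dataset)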

combine

combine_kerchunk_references

combine_kerchunk_references(
    source_directory: Path,
    pattern: str = "*.json",
    combined_reference: Path = "combined_kerchunk.json",
    dry_run: bool = False,
    verbose: int = VERBOSE_LEVEL_DEFAULT,
)

Combine multiple JSON references into a single logical aggregate dataset using Kerchunk's MultiZarrToZarr function

Source code in rekx/combine.py
def combine_kerchunk_references(
    source_directory: Annotated[Path, typer_argument_source_directory],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.json",
    combined_reference: Annotated[
        Path, typer_argument_kerchunk_combined_reference
    ] = "combined_kerchunk.json",
    dry_run: Annotated[bool, typer_option_dry_run] = False,
    verbose: Annotated[int, typer_option_verbose] = VERBOSE_LEVEL_DEFAULT,
):
    """Combine multiple JSON references into a single logical aggregate
    dataset using Kerchunk's `MultiZarrToZarr` function"""

    mode = DisplayMode(verbose)
    with display_context[mode]:
        source_directory = Path(source_directory)
        reference_file_paths = list(source_directory.glob(pattern))
        reference_file_paths = list(map(str, reference_file_paths))

        if dry_run:
            print(
                f"[bold]Dry run[/bold] of [bold]operations that would be performed[/bold]:"
            )
            print(
                f"> Reading files in [code]{source_directory}[/code] matching the pattern [code]{pattern}[/code]"
            )
            print(f"> Number of files matched: {len(reference_file_paths)}")
            print(
                f"> Writing combined reference file to [code]{combined_reference}[/code]"
            )
            return  # Exit for a dry run

        from kerchunk.combine import MultiZarrToZarr

        mzz = MultiZarrToZarr(
            reference_file_paths,
            concat_dims=["time"],
            identical_dims=["lat", "lon"],
        )
        multifile_kerchunk = mzz.translate()

        combined_reference_filename = Path(combined_reference)
        local_fs = fsspec.filesystem("file")
        with local_fs.open(combined_reference_filename, "wb") as f:
            f.write(ujson.dumps(multifile_kerchunk).encode())
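
Example usage, as a minimal sketch assuming a hypothetical references/ directory of single-file JSON references; the references are concatenated along the time dimension, with lat and lon treated as identical dimensions:

from pathlib import Path

from rekx.combine import combine_kerchunk_references

# Aggregate all per-file JSON references into one logical dataset
combine_kerchunk_references(
    source_directory=Path("references"),
    pattern="*.json",
    combined_reference=Path("combined_kerchunk.json"),
)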

combine_kerchunk_references_to_parquet

combine_kerchunk_references_to_parquet(
    source_directory: Path,
    pattern: str = "*.json",
    combined_reference: Path = "combined_kerchunk.parq",
    dry_run: bool = False,
    verbose: int = VERBOSE_LEVEL_DEFAULT,
)

Combine multiple JSON references into a single Parquet store using Kerchunk's MultiZarrToZarr function

Source code in rekx/combine.py
def combine_kerchunk_references_to_parquet(
    source_directory: Annotated[Path, typer_argument_source_directory],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.json",
    combined_reference: Annotated[
        Path, typer_argument_kerchunk_combined_reference
    ] = "combined_kerchunk.parq",
    dry_run: Annotated[bool, typer_option_dry_run] = False,
    verbose: Annotated[int, typer_option_verbose] = VERBOSE_LEVEL_DEFAULT,
):
    """Combine multiple JSON references into a single Parquet store using Kerchunk's `MultiZarrToZarr` function"""

    mode = DisplayMode(verbose)
    with display_context[mode]:
        source_directory = Path(source_directory)
        reference_file_paths = list(source_directory.glob(pattern))
        reference_file_paths = list(map(str, reference_file_paths))

        if dry_run:
            print(
                f"[bold]Dry run[/bold] of [bold]operations that would be performed[/bold]:"
            )
            print(
                f"> Reading files in [code]{source_directory}[/code] matching the pattern [code]{pattern}[/code]"
            )
            print(f"> Number of files matched: {len(reference_file_paths)}")
            print(
                f"> Writing combined reference file to [code]{combined_reference}[/code]"
            )
            return  # Exit for a dry run

        # Create LazyReferenceMapper to pass to MultiZarrToZarr
        filesystem = fsspec.filesystem("file")
        import os

        combined_reference.mkdir(parents=True, exist_ok=True)
        from fsspec.implementations.reference import LazyReferenceMapper

        output_lazy = LazyReferenceMapper(
            root=str(combined_reference),
            fs=filesystem,
            cache_size=1000,
        )

        from kerchunk.combine import MultiZarrToZarr

        # Combine single references
        mzz = MultiZarrToZarr(
            reference_file_paths,
            remote_protocol="file",
            concat_dims=["time"],
            identical_dims=["lat", "lon"],
            out=output_lazy,
        )
        multifile_kerchunk = mzz.translate()

        output_lazy.flush()  # Write all non-full reference batches

        # Read from the Parquet storage
        kerchunk.df.refs_to_dataframe(multifile_kerchunk, str(combined_reference))

        filesystem = fsspec.implementations.reference.ReferenceFileSystem(
            fo=str(combined_reference),
            target_protocol="file",
            remote_protocol="file",
            lazy=True,
        )
        ds = xr.open_dataset(
            filesystem.get_mapper(""),
            engine="zarr",
            chunks={},
            backend_kwargs={"consolidated": False},
        )
        print(ds)
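
Example usage, as a minimal sketch with hypothetical paths; the combined output is a Parquet reference store rather than a single JSON file:

from pathlib import Path

from rekx.combine import combine_kerchunk_references_to_parquet

# Aggregate per-file JSON references into a Parquet reference store
combine_kerchunk_references_to_parquet(
    source_directory=Path("references"),
    pattern="*.json",
    combined_reference=Path("combined_kerchunk.parq"),
)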

parquet

combine_parquet_stores_to_parquet

combine_parquet_stores_to_parquet(
    source_directory: Path,
    pattern: str = "*.parquet",
    combined_reference: Path = "combined_kerchunk.parquet",
    record_size: int = DEFAULT_RECORD_SIZE,
    dry_run: bool = False,
    verbose: int = VERBOSE_LEVEL_DEFAULT,
)

Combine multiple Parquet stores into a single aggregate dataset using Kerchunk's MultiZarrToZarr function

Source code in rekx/parquet.py
def combine_parquet_stores_to_parquet(
    source_directory: Annotated[Path, typer_argument_source_directory],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.parquet",
    combined_reference: Annotated[
        Path, typer_argument_kerchunk_combined_reference
    ] = "combined_kerchunk.parquet",
    record_size: int = DEFAULT_RECORD_SIZE,
    dry_run: Annotated[
        bool,
        typer.Option("--dry-run", help="Run the command without making any changes."),
    ] = False,
    verbose: Annotated[int, typer_option_verbose] = VERBOSE_LEVEL_DEFAULT,
):
    """Combine multiple Parquet stores into a single aggregate dataset using Kerchunk's `MultiZarrToZarr` function"""

    mode = DisplayMode(verbose)
    with display_context[mode]:
        source_directory = Path(source_directory)
        reference_file_paths = list(source_directory.glob(pattern))
        reference_file_paths = list(map(str, reference_file_paths))
        reference_file_paths.sort()

        if dry_run:
            print(
                f"[bold]Dry run[/bold] of [bold]operations that would be performed[/bold]:"
            )
            print(
                f"> Reading files in [code]{source_directory}[/code] matching the pattern [code]{pattern}[/code]"
            )
            print(f"> Number of files matched: {len(reference_file_paths)}")
            print(
                f"> Writing combined reference file to [code]{combined_reference}[/code]"
            )
            return  # Exit for a dry run

        try:
            # Create LazyReferenceMapper to pass to MultiZarrToZarr
            combined_reference.mkdir(parents=True, exist_ok=True)
            print(f"Combined reference name : {combined_reference}")
            filesystem = fsspec.filesystem("file")
            from fsspec.implementations.reference import LazyReferenceMapper

            output_lazy = LazyReferenceMapper.create(
                root=str(combined_reference),
                fs=filesystem,
                record_size=record_size,
            )

            # Combine single references
            from kerchunk.combine import MultiZarrToZarr

            multi_zarr_to_zarr = MultiZarrToZarr(
                reference_file_paths,
                remote_protocol="file",
                concat_dims=["time"],
                identical_dims=["lat", "lon"],
                coo_map={"time": "cf:time"},
                out=output_lazy,
            )
            multifile_kerchunk = multi_zarr_to_zarr.translate()
            output_lazy.flush()  # Write all non-full reference batches

        except Exception as e:
            print(f"Failed creating the [code]{combined_reference}[/code] : {e}!")
            import traceback

            traceback.print_exc()

        if verbose > 1:
            # Read from the Parquet storage
            # kerchunk.df.refs_to_dataframe(multifile_kerchunk, str(combined_reference))

            # filesystem = fsspec.implementations.reference.ReferenceFileSystem(
            #     fo=str(combined_reference),
            #     target_protocol='file',
            #     remote_protocol='file',
            #     lazy=True
            # )
            # ds = xr.open_dataset(
            #     filesystem.get_mapper(''),
            #     engine="zarr",
            #     chunks={},
            #     backend_kwargs={"consolidated": False},
            # )
            # print(ds)
            dataset = xr.open_dataset(
                str(combined_reference),  # does not handle Path
                engine="kerchunk",
                storage_options=dict(remote_protocol="file")
                # storage_options=dict(skip_instance_cache=True, remote_protocol="file"),
            )
            print(dataset)
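
Example usage, as a minimal sketch assuming a hypothetical parquet_stores/ directory of single-file Parquet stores:

from pathlib import Path

from rekx.parquet import combine_parquet_stores_to_parquet

# Combine per-file Parquet stores into a single aggregate Parquet store
combine_parquet_stores_to_parquet(
    source_directory=Path("parquet_stores"),
    pattern="*.parquet",
    combined_reference=Path("combined_kerchunk.parquet"),
)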

create_multiple_parquet_stores

create_multiple_parquet_stores(
    source_directory: Path,
    output_directory: Path,
    pattern: str = "*.nc",
    record_size: int = DEFAULT_RECORD_SIZE,
    workers: int = 4,
    verbose: int = 0,
)
Source code in rekx/parquet.py
def create_multiple_parquet_stores(
    source_directory: Path,
    output_directory: Path,
    pattern: str = "*.nc",
    record_size: int = DEFAULT_RECORD_SIZE,
    workers: int = 4,
    verbose: int = 0,
):
    """ """
    input_file_paths = list(source_directory.glob(pattern))
    # if verbose:
    #     print(f'Input file paths : {input_file_paths}')
    if not input_file_paths:
        print(
            "No files found in [code]{source_directory}[/code] matching the pattern [code]{pattern}[/code]!"
        )
        return
    output_directory.mkdir(parents=True, exist_ok=True)
    with multiprocessing.Pool(processes=workers) as pool:
        print(
            f"Creating the following Parquet stores in [code]{output_directory}[/code] : "
        )
        partial_create_parquet_references = partial(
            create_single_parquet_store,
            output_directory=output_directory,
            record_size=record_size,
            verbose=verbose,
        )
        pool.map(partial_create_parquet_references, input_file_paths)
    if verbose:
        print(f"Done!")

create_parquet_store

create_parquet_store(
    input_file: Path,
    output_parquet_store: Path,
    record_size: int = DEFAULT_RECORD_SIZE,
)
Source code in rekx/parquet.py
def create_parquet_store(
    input_file: Path,
    output_parquet_store: Path,
    record_size: int = DEFAULT_RECORD_SIZE,
):
    """ """
    log_messages = []
    log_messages.append("Logging execution of create_parquet_store()")
    output_parquet_store.mkdir(parents=True, exist_ok=True)

    try:
        log_messages.append(f"Creating a filesystem mapper for {output_parquet_store}")
        filesystem = fsspec.filesystem("file")
        output = LazyReferenceMapper.create(
            root=str(output_parquet_store),  # does not handle Path
            fs=filesystem,
            record_size=record_size,
        )
        log_messages.append(f"Created the filesystem mapper {output}")

        log_messages.append(f"Kerchunking the file {input_file}")
        single_zarr = SingleHdf5ToZarr(str(input_file), out=output)
        single_zarr.translate()
        log_messages.append(f"Kerchunked the file {input_file}")

    except Exception as e:
        print(f"Failed processing file [code]{input_file}[/code] : {e}")
        log_messages.append(f"Exception occurred: {e}")
        log_messages.append("Traceback (most recent call last):")

        tb_lines = traceback.format_exc().splitlines()
        for line in tb_lines:
            log_messages.append(line)

        raise

    finally:
        logger.info("\n".join(log_messages))

    logger.info(f"Returning a Parquet store : {output_parquet_store}")
    return output_parquet_store
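
Example usage, as a minimal sketch with a hypothetical input file and output store:

from pathlib import Path

from rekx.parquet import create_parquet_store

# Reference a single NetCDF file into its own Parquet store
store = create_parquet_store(
    input_file=Path("netcdf/example.nc"),
    output_parquet_store=Path("parquet_stores/example.parquet"),
)
print(store)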

create_single_parquet_store

create_single_parquet_store(
    input_file_path,
    output_directory,
    record_size: int = DEFAULT_RECORD_SIZE,
    verbose: int = 0,
)

Helper function for create_multiple_parquet_stores()

Source code in rekx/parquet.py
def create_single_parquet_store(
    input_file_path,
    output_directory,
    record_size: int = DEFAULT_RECORD_SIZE,
    verbose: int = 0,
):
    """Helper function for create_multiple_parquet_stores()"""
    filename = input_file_path.stem
    single_parquet_store = output_directory / f"{filename}.parquet"
    create_parquet_store(
        input_file_path,
        output_parquet_store=single_parquet_store,
        record_size=record_size,
    )
    if verbose > 0:
        print(f"  [code]{single_parquet_store}[/code]")

    if verbose > 1:
        dataset = xr.open_dataset(
            str(single_parquet_store),
            engine="kerchunk",
            storage_options=dict(remote_protocol="file"),
        )
        print(dataset)

parquet_multi_reference

parquet_multi_reference(
    source_directory: Path,
    output_directory: Optional[Path] = ".",
    pattern: str = "*.nc",
    record_size: int = DEFAULT_RECORD_SIZE,
    workers: int = 4,
    dry_run: bool = False,
    verbose: int = VERBOSE_LEVEL_DEFAULT,
)

Create Parquet references from an HDF5/NetCDF file

Source code in rekx/parquet.py
def parquet_multi_reference(
    source_directory: Path,
    output_directory: Optional[Path] = ".",
    pattern: str = "*.nc",
    record_size: int = DEFAULT_RECORD_SIZE,
    workers: int = 4,
    dry_run: Annotated[bool, typer_option_dry_run] = False,
    verbose: Annotated[int, typer_option_verbose] = VERBOSE_LEVEL_DEFAULT,
):
    """Create Parquet references from an HDF5/NetCDF file"""
    input_file_paths = list(source_directory.glob(pattern))

    if not input_file_paths:
        print("No files found in the source directory matching the pattern.")
        return

    if dry_run:
        print(f"[bold]Dry running operations that would be performed[/bold]:")
        print(
            f"> Reading files in [code]{source_directory}[/code] matching the pattern [code]{pattern}[/code]"
        )
        print(f"> Number of files matched : {len(input_file_paths)}")
        print(f"> Creating Parquet stores in [code]{output_directory}[/code]")
        return  # Exit for a dry run

    create_multiple_parquet_stores(
        source_directory=source_directory,
        output_directory=output_directory,
        pattern=pattern,
        record_size=record_size,
        workers=workers,
        verbose=verbose,
    )

parquet_reference

parquet_reference(
    input_file: Path,
    output_directory: Optional[Path] = ".",
    record_size: int = DEFAULT_RECORD_SIZE,
    dry_run: bool = False,
    verbose: int = VERBOSE_LEVEL_DEFAULT,
)

Create Parquet references from an HDF5/NetCDF file

Source code in rekx/parquet.py
def parquet_reference(
    input_file: Path,
    output_directory: Optional[Path] = ".",
    record_size: int = DEFAULT_RECORD_SIZE,
    dry_run: Annotated[bool, typer_option_dry_run] = False,
    verbose: Annotated[int, typer_option_verbose] = VERBOSE_LEVEL_DEFAULT,
):
    """Create Parquet references from an HDF5/NetCDF file"""
    filename = input_file.stem
    output_parquet_store = output_directory / f"{filename}.parquet"

    if dry_run:
        print(f"[bold]Dry running operations that would be performed[/bold]:")
        print(
            f"> Creating Parquet references to [code]{input_file}[/code] in [code]{output_parquet_store}[/code]"
        )
        return  # Exit for a dry run

    create_parquet_store(
        input_file=input_file,
        output_parquet_store=output_parquet_store,
        record_size=record_size,
    )

select_from_parquet

select_from_parquet(
    parquet_store: Path,
    variable: str,
    longitude: float,
    latitude: float,
    timestamps: Optional[Any] = None,
    start_time: Optional[datetime] = None,
    end_time: Optional[datetime] = None,
    time: Optional[int] = None,
    lat: Optional[int] = None,
    lon: Optional[int] = None,
    mask_and_scale: bool = False,
    neighbor_lookup: MethodForInexactMatches = None,
    tolerance: Optional[float] = 0.1,
    in_memory: bool = False,
    statistics: bool = False,
    csv: Path = None,
    variable_name_as_suffix: bool = True,
    rounding_places: Optional[
        int
    ] = ROUNDING_PLACES_DEFAULT,
    verbose: int = VERBOSE_LEVEL_DEFAULT,
) -> None

Select data from a Parquet store

Source code in rekx/parquet.py
def select_from_parquet(
    parquet_store: Annotated[Path, typer.Argument(..., help="Path to Parquet store")],
    variable: Annotated[str, typer.Argument(..., help="Variable name to select from")],
    longitude: Annotated[float, typer_argument_longitude_in_degrees],
    latitude: Annotated[float, typer_argument_latitude_in_degrees],
    timestamps: Annotated[Optional[Any], typer_argument_timestamps] = None,
    start_time: Annotated[Optional[datetime], typer_option_start_time] = None,
    end_time: Annotated[Optional[datetime], typer_option_end_time] = None,
    time: Annotated[
        Optional[int], typer.Option(help="New chunk size for the 'time' dimension")
    ] = None,
    lat: Annotated[
        Optional[int], typer.Option(help="New chunk size for the 'lat' dimension")
    ] = None,
    lon: Annotated[
        Optional[int], typer.Option(help="New chunk size for the 'lon' dimension")
    ] = None,
    # convert_longitude_360: Annotated[bool, typer_option_convert_longitude_360] = False,
    mask_and_scale: Annotated[bool, typer_option_mask_and_scale] = False,
    neighbor_lookup: Annotated[
        MethodForInexactMatches, typer_option_neighbor_lookup
    ] = None,
    tolerance: Annotated[
        Optional[float], typer_option_tolerance
    ] = 0.1,  # Customize default if needed
    in_memory: Annotated[bool, typer_option_in_memory] = False,
    statistics: Annotated[bool, typer_option_statistics] = False,
    csv: Annotated[Path, typer_option_csv] = None,
    # output_filename: Annotated[Path, typer_option_output_filename] = 'series_in',  #Path(),
    variable_name_as_suffix: Annotated[
        bool, typer_option_variable_name_as_suffix
    ] = True,
    rounding_places: Annotated[
        Optional[int], typer_option_rounding_places
    ] = ROUNDING_PLACES_DEFAULT,
    verbose: Annotated[int, typer_option_verbose] = VERBOSE_LEVEL_DEFAULT,
) -> None:
    """Select data from a Parquet store"""

    # if convert_longitude_360:
    #     longitude = longitude % 360
    # warn_for_negative_longitude(longitude)

    logger.debug(f"Command context : {typer.Context}")

    data_retrieval_start_time = timer.time()
    logger.debug(f"Starting data retrieval... {data_retrieval_start_time}")

    # timer_start = timer.time()
    # mapper = fsspec.get_mapper(
    #     "reference://",
    #     fo=str(reference_file),
    #     remote_protocol="file",
    #     remote_options={"skip_instance_cache": True},
    # )
    # timer_end = timer.time()
    # logger.debug(f"Mapper creation took {timer_end - timer_start:.2f} seconds")
    timer_start = timer.perf_counter()
    dataset = xr.open_dataset(
        str(parquet_store),  # does not handle Path
        engine="kerchunk",
        storage_options=dict(skip_instance_cache=True, remote_protocol="file"),
        # backend_kwargs={"consolidated": False},
        # chunks=None,
        # mask_and_scale=mask_and_scale,
    )
    timer_end = timer.perf_counter()
    logger.debug(
        f"Dataset opening via Xarray took {timer_end - timer_start:.2f} seconds"
    )

    available_variables = list(dataset.data_vars)
    if variable not in available_variables:
        print(
            f"The requested variable `{variable}` does not exist! Please select one among the available variables : {available_variables}."
        )
        raise typer.Exit(code=0)
    else:
        timer_start = timer.time()
        time_series = dataset[variable]
        timer_end = timer.time()
        logger.debug(
            f"Data array variable selection took {timer_end - timer_start:.2f} seconds"
        )

        timer_start = timer.time()
        chunks = {"time": time, "lat": lat, "lon": lon}
        time_series = time_series.chunk(chunks=chunks)
        timer_end = timer.time()
        logger.debug(
            f"Data array rechunking took {timer_end - timer_start:.2f} seconds"
        )

    timer_start = timer.time()
    indexers = set_location_indexers(
        data_array=time_series,
        longitude=longitude,
        latitude=latitude,
        verbose=verbose,
    )
    timer_end = timer.time()
    logger.debug(
        f"Data array indexers setting took {timer_end - timer_start:.2f} seconds"
    )

    try:
        timer_start = timer.time()
        location_time_series = time_series.sel(
            **indexers,
            method=neighbor_lookup,
            tolerance=tolerance,
        )
        timer_end = timer.time()
        logger.debug(f"Location selection took {timer_end - timer_start:.2f} seconds")

        if in_memory:
            timer_start = timer.time()
            location_time_series.load()  # load into memory for faster subsequent operations
            timer_end = timer.time()
            logger.debug(
                f"Location selection loading in memory took {timer_end - timer_start:.2f} seconds"
            )

    except Exception as exception:
        print(f"{ERROR_IN_SELECTING_DATA} : {exception}")
        raise SystemExit(33)
    # ------------------------------------------------------------------------

    if start_time or end_time:
        timestamps = None  # we don't need a timestamp anymore!

        if start_time and not end_time:  # set `end_time` to end of series
            end_time = location_time_series.time.values[-1]

        elif end_time and not start_time:  # set `start_time` to beginning of series
            start_time = location_time_series.time.values[0]

        else:  # Convert `start_time` & `end_time` to the correct string format
            start_time = start_time.strftime("%Y-%m-%d %H:%M:%S")
            end_time = end_time.strftime("%Y-%m-%d %H:%M:%S")

        timer_start = timer.time()
        location_time_series = location_time_series.sel(
            time=slice(start_time, end_time)
        )
        timer_end = timer.time()
        logger.debug(
            f"Time slicing with `start_time` and `end_time` took {timer_end - timer_start:.2f} seconds"
        )

    if timestamps is not None and not start_time and not end_time:
        if len(timestamps) == 1:
            start_time = end_time = timestamps[0]

        try:
            timer_start = timer.time()
            location_time_series = location_time_series.sel(
                time=timestamps, method=neighbor_lookup
            )
            timer_end = timer.time()
            logger.debug(
                f"Time selection with `timestamps` took {timer_end - timer_start:.2f} seconds"
            )

        except KeyError:
            print(f"No data found for one or more of the given {timestamps}.")

    if location_time_series.size == 1:
        timer_start = timer.time()
        single_value = float(location_time_series.values)
        warning = (
            f"{exclamation_mark} The selected timestamp "
            + f"{location_time_series.time.values}"
            + f" matches the single value "
            + f"{single_value}"
        )
        timer_end = timer.time()
        logger.debug(
            f"Single value conversion to float took {timer_end - timer_start:.2f} seconds"
        )
        logger.warning(warning)
        if verbose > 0:
            print(warning)

    data_retrieval_end_time = timer.time()
    logger.debug(
        f"Data retrieval took {data_retrieval_end_time - data_retrieval_start_time:.2f} seconds"
    )

    timer_start = timer.time()
    results = {
        location_time_series.name: location_time_series.to_numpy(),
    }
    timer_end = timer.time()
    logger.debug(
        f"Data series conversion to NumPy took {timer_end - timer_start:.2f} seconds"
    )

    title = "Location time series"

    # special case!
    if location_time_series is not None and timestamps is None:
        timer_start = timer.time()
        timestamps = location_time_series.time.to_numpy()
        timer_end = timer.time()
        logger.debug(
            f"Timestamps conversion to NumPy from Xarray's _time_ coordinate took {timer_end - timer_start:.2f} seconds"
        )

    if not verbose and not (statistics or csv):
        flat_array = location_time_series.values.flatten()
        print(*flat_array, sep=", ")
    if verbose > 0:
        print(location_time_series)

    if statistics:  # after echoing series which might be Long!
        timer_start = timer.time()
        print_series_statistics(
            data_array=location_time_series,
            timestamps=timestamps,
            title="Selected series",
        )
        timer_end = timer.time()
        logger.debug(
            f"Printing statistics in the console took {timer_end - timer_start:.2f} seconds"
        )

    if csv:
        timer_start = timer.time()
        to_csv(
            x=location_time_series,
            path=csv,
        )
        timer_end = timer.time()
        logger.debug(f"Exporting to CSV took {timer_end - timer_start:.2f} seconds")