IIIF Tile Generation with Dagster¶
Complete guide to generating IIIF (International Image Interoperability Framework) tiles from high-resolution images using pyvips in Dagster pipelines.
What is IIIF?¶
IIIF is a set of open standards for delivering high-quality, attributed digital objects online at scale. It's used by:
- Museums and galleries
- Libraries and archives
- Digital humanities projects
- Cultural heritage institutions
Key Benefits:
- Zoom into gigapixel images smoothly
- Deliver images efficiently (only the visible tiles are loaded)
- Consistent viewer experience across institutions
- Deep zoom without downloading the full image
Architecture Overview¶
High-Res Image (TIFF) → Tile Generation (pyvips) → IIIF Manifest (JSON) → IIIF Server (Cantaloupe) → Web Viewer (OpenSeadragon)
In Dagster:
artwork_images (asset) → generate_iiif_tiles (asset) → create_iiif_manifests (asset) → upload_iiif_to_s3 (asset)
Installation¶
# Install libvips (the underlying C library)
# macOS:
brew install vips
# Ubuntu/Debian:
sudo apt-get install libvips libvips-dev
# Install Python wrapper
pip install pyvips
# Optional: AWS for S3 upload
pip install boto3
Basic Tile Generation Asset¶
import dagster as dg
import polars as pl
from pathlib import Path
import pyvips
from cogapp_libs.dagster import add_dataframe_metadata, track_timing
@dg.asset(
    kinds={"image", "iiif"},
    group_name="image_processing",
)
def generate_iiif_tiles(
    context: dg.AssetExecutionContext,
    artwork_images: pl.DataFrame,  # DataFrame with image paths
) -> pl.DataFrame:
    """Generate IIIF tiles from high-resolution images using pyvips.

    Input DataFrame columns:
    - artwork_id: Unique identifier (used as a directory name, so a string)
    - image_path: Path to high-res image (TIFF, JPEG, PNG)

    Output DataFrame columns:
    - artwork_id: Same as input
    - tiles_path: Path to generated tiles directory
    - tile_count: Number of ``.jpg`` tiles generated
    - max_zoom_level: Maximum zoom level
    - width: Image width in pixels
    - height: Image height in pixels

    The output always carries these columns, even when ``artwork_images``
    is empty.
    """
    import shutil

    # libvips joins ``id`` with the image name to build the ``@id`` written
    # to info.json, so it must be the URL prefix the tiles are served from,
    # not the bare artwork id.  This matches the service id used in the
    # manifests ("https://example.com/iiif/{artwork_id}").
    iiif_base_url = "https://example.com/iiif"
    tile_size = 256

    # Explicit schema so an empty input still yields the documented columns
    # (a schema-less empty frame would break the metadata lookups below).
    output_schema = {
        "artwork_id": pl.Utf8,
        "tiles_path": pl.Utf8,
        "tile_count": pl.Int64,
        "max_zoom_level": pl.Int64,
        "width": pl.Int64,
        "height": pl.Int64,
    }

    results = []
    with track_timing(context, "tile_generation"):
        for row in artwork_images.iter_rows(named=True):
            artwork_id = row["artwork_id"]
            image_path = Path(row["image_path"])
            context.log.info(f"Processing {artwork_id}: {image_path}")

            # Output directory for this artwork's tiles
            tiles_dir = Path(f"data/output/iiif/tiles/{artwork_id}")
            tiles_dir.mkdir(parents=True, exist_ok=True)

            # Remove output of a previous materialization so stale tiles are
            # neither counted below nor left behind next to fresh ones.
            tiles_target = tiles_dir / artwork_id
            if tiles_target.exists():
                shutil.rmtree(tiles_target)

            # Load image with pyvips (sequential access streams the pixels)
            image = pyvips.Image.new_from_file(str(image_path), access="sequential")

            # Smallest zoom level at which tile_size * 2**level covers the
            # longest image side.
            max_dimension = max(image.width, image.height)
            max_zoom = 0
            while (tile_size * (2 ** max_zoom)) < max_dimension:
                max_zoom += 1
            context.log.info(
                f"{artwork_id}: {image.width}x{image.height}px, "
                f"max zoom: {max_zoom}"
            )

            # Write an IIIF tile pyramid.
            image.dzsave(
                str(tiles_target),
                suffix=".jpg",       # JPEG tiles
                tile_size=tile_size,
                overlap=0,           # IIIF tiles are plain regions: no overlap
                depth="onetile",     # stop once the whole image fits in one tile
                layout="iiif",       # IIIF directory layout + info.json
                id=iiif_base_url,    # URL prefix for the info.json @id
            )

            # Count generated tiles
            tile_count = sum(1 for _ in tiles_dir.rglob("*.jpg"))
            results.append({
                "artwork_id": artwork_id,
                "tiles_path": str(tiles_dir),
                "tile_count": tile_count,
                "max_zoom_level": max_zoom,
                "width": image.width,
                "height": image.height,
            })
            context.log.info(
                f"{artwork_id}: Generated {tile_count} tiles "
                f"at {max_zoom + 1} zoom levels"
            )

    result_df = pl.DataFrame(results, schema=output_schema)
    add_dataframe_metadata(
        context,
        result_df,
        total_tiles=result_df["tile_count"].sum(),
        avg_tiles_per_image=result_df["tile_count"].mean(),
    )
    return result_df
IIIF Manifest Generation¶
IIIF manifests are JSON files describing the image and how to display it.
import json
from datetime import datetime
@dg.asset(
    kinds={"iiif", "json"},
    group_name="image_processing",
)
def create_iiif_manifests(
    context: dg.AssetExecutionContext,
    generate_iiif_tiles: pl.DataFrame,
    artwork_metadata: pl.DataFrame,  # Artwork titles, artists, etc.
) -> pl.DataFrame:
    """Generate IIIF Presentation API 3.0 manifests for each artwork.

    One JSON manifest is written per row of ``generate_iiif_tiles`` to
    ``data/output/iiif/manifests/{artwork_id}.json``.  Descriptive fields
    (``title``, ``artist_name``, ``year``) come from ``artwork_metadata``;
    when a field is missing or null a placeholder is used instead.

    Output: DataFrame with columns ``artwork_id`` and ``manifest_path``.
    """
    base_url = "https://example.com/iiif"

    def field(row, key, default):
        # After the left join, artworks without metadata still have the key
        # present with a None value, so ``row.get(key, default)`` alone would
        # never fall back and the manifest would contain nulls.
        value = row.get(key)
        return default if value is None else value

    results = []

    # Join tile data with metadata; artworks without metadata are kept.
    combined = generate_iiif_tiles.join(
        artwork_metadata,
        on="artwork_id",
        how="left",
    )

    for row in combined.iter_rows(named=True):
        artwork_id = row["artwork_id"]
        width = row["width"]
        height = row["height"]
        manifest_path = Path(f"data/output/iiif/manifests/{artwork_id}.json")
        manifest_path.parent.mkdir(parents=True, exist_ok=True)

        artwork_url = f"{base_url}/{artwork_id}"
        canvas_id = f"{artwork_url}/canvas"

        # IIIF Presentation API 3.0 manifest
        manifest = {
            "@context": "http://iiif.io/api/presentation/3/context.json",
            "id": f"{artwork_url}/manifest.json",
            "type": "Manifest",
            "label": {
                "en": [field(row, "title", f"Artwork {artwork_id}")]
            },
            "metadata": [
                {
                    "label": {"en": ["Artist"]},
                    "value": {"en": [field(row, "artist_name", "Unknown")]},
                },
                {
                    "label": {"en": ["Date"]},
                    "value": {"en": [str(field(row, "year", "Unknown"))]},
                },
            ],
            "items": [
                {
                    "id": canvas_id,
                    "type": "Canvas",
                    "width": width,
                    "height": height,
                    "items": [
                        {
                            "id": f"{artwork_url}/page",
                            "type": "AnnotationPage",
                            "items": [
                                {
                                    "id": f"{artwork_url}/annotation",
                                    "type": "Annotation",
                                    "motivation": "painting",
                                    "body": {
                                        "id": f"{artwork_url}/full/max/0/default.jpg",
                                        "type": "Image",
                                        "format": "image/jpeg",
                                        "service": [
                                            {
                                                "id": artwork_url,
                                                "type": "ImageService3",
                                                "profile": "level2",
                                                "width": width,
                                                "height": height,
                                            }
                                        ],
                                        "width": width,
                                        "height": height,
                                    },
                                    "target": canvas_id,
                                }
                            ],
                        }
                    ],
                }
            ],
        }

        # Write manifest (explicit encoding so output does not depend on locale)
        with open(manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2)

        results.append({
            "artwork_id": artwork_id,
            "manifest_path": str(manifest_path),
        })
        context.log.info(f"Generated manifest for {artwork_id}")

    # Explicit schema keeps the columns present when there are no artworks.
    result_df = pl.DataFrame(
        results,
        schema={"artwork_id": pl.Utf8, "manifest_path": pl.Utf8},
    )
    add_dataframe_metadata(context, result_df)
    return result_df
Advanced: Parallel Processing¶
For large collections, process images in parallel:
from concurrent.futures import ThreadPoolExecutor, as_completed
@dg.asset
def generate_iiif_tiles_parallel(
    context: dg.AssetExecutionContext,
    artwork_images: pl.DataFrame,
) -> pl.DataFrame:
    """Generate tiles in parallel using a thread pool.

    Each image is tiled in its own worker (at most 4 at a time).  A failure
    on one image is logged and that image is left out of the result; the
    remaining images are still processed.

    Output DataFrame columns: ``artwork_id``, ``tiles_path``, ``tile_count``,
    ``width``, ``height``.  Rows appear in completion order.  The columns are
    present even when no image succeeds.
    """
    import shutil

    # libvips joins ``id`` with the image name to build the info.json ``@id``,
    # so it must be the serving URL prefix rather than the artwork id.
    iiif_base_url = "https://example.com/iiif"

    def process_single_image(row):
        """Tile one image; touches only that artwork's output directory."""
        artwork_id = row["artwork_id"]
        image_path = Path(row["image_path"])
        tiles_dir = Path(f"data/output/iiif/tiles/{artwork_id}")
        tiles_dir.mkdir(parents=True, exist_ok=True)

        # Drop output from a previous run so stale tiles are not counted.
        tiles_target = tiles_dir / artwork_id
        if tiles_target.exists():
            shutil.rmtree(tiles_target)

        image = pyvips.Image.new_from_file(str(image_path), access="sequential")
        image.dzsave(
            str(tiles_target),
            suffix=".jpg",
            tile_size=256,
            overlap=0,          # IIIF tiles are plain regions: no overlap
            layout="iiif",
            id=iiif_base_url,   # URL prefix for the info.json @id
        )
        tile_count = sum(1 for _ in tiles_dir.rglob("*.jpg"))
        return {
            "artwork_id": artwork_id,
            "tiles_path": str(tiles_dir),
            "tile_count": tile_count,
            "width": image.width,
            "height": image.height,
        }

    results = []
    # Process with thread pool (max 4 concurrent)
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {
            executor.submit(process_single_image, row): row
            for row in artwork_images.iter_rows(named=True)
        }
        for future in as_completed(futures):
            row = futures[future]
            try:
                result = future.result()
            except Exception as e:
                # Deliberate best-effort: one bad image must not abort the batch.
                context.log.error(f"Failed {row['artwork_id']}: {e}")
            else:
                results.append(result)
                context.log.info(f"Completed {result['artwork_id']}")

    # Explicit schema so the columns exist even if every image failed.
    return pl.DataFrame(
        results,
        schema={
            "artwork_id": pl.Utf8,
            "tiles_path": pl.Utf8,
            "tile_count": pl.Int64,
            "width": pl.Int64,
            "height": pl.Int64,
        },
    )
Upload to S3¶
@dg.asset(
    kinds={"s3", "iiif"},
    group_name="image_processing",
)
def upload_iiif_to_s3(
    context: dg.AssetExecutionContext,
    generate_iiif_tiles: pl.DataFrame,
    create_iiif_manifests: pl.DataFrame,
) -> pl.DataFrame:
    """Upload IIIF tiles and manifests to S3.

    For every artwork, all tile images and JSON files (such as the
    ``info.json`` written next to the tiles) under its ``tiles_path`` are
    uploaded below ``iiif/tiles/``, preserving the directory structure.
    Manifests are uploaded to ``iiif/manifests/{artwork_id}.json``.

    Output DataFrame columns: ``artwork_id``, ``s3_bucket``, ``s3_prefix``,
    ``files_uploaded`` (one row per artwork in ``generate_iiif_tiles``).
    """
    import boto3
    from pathlib import Path

    # Files worth publishing, with the Content-Type to serve them as.
    # Anything else found in the tile directory is skipped.
    content_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".webp": "image/webp",
        ".json": "application/json",
    }

    s3 = boto3.client("s3")
    bucket = "my-iiif-bucket"
    results = []

    # Upload tiles (and their info.json descriptors)
    for row in generate_iiif_tiles.iter_rows(named=True):
        artwork_id = row["artwork_id"]
        tiles_dir = Path(row["tiles_path"])
        uploaded = 0
        for tile_file in tiles_dir.rglob("*"):
            content_type = content_types.get(tile_file.suffix.lower())
            if content_type is None or not tile_file.is_file():
                continue
            # Preserve directory structure in S3.  as_posix() keeps the key
            # "/"-separated even when the local path uses backslashes.
            relative_key = tile_file.relative_to(tiles_dir.parent).as_posix()
            s3.upload_file(
                str(tile_file),
                bucket,
                f"iiif/tiles/{relative_key}",
                ExtraArgs={"ContentType": content_type},
            )
            uploaded += 1
        context.log.info(f"Uploaded {uploaded} files for {artwork_id}")
        results.append({
            "artwork_id": artwork_id,
            "s3_bucket": bucket,
            "s3_prefix": f"iiif/tiles/{artwork_id}",
            "files_uploaded": uploaded,
        })

    # Upload manifests
    for row in create_iiif_manifests.iter_rows(named=True):
        artwork_id = row["artwork_id"]
        manifest_path = Path(row["manifest_path"])
        s3.upload_file(
            str(manifest_path),
            bucket,
            f"iiif/manifests/{artwork_id}.json",
            ExtraArgs={"ContentType": "application/json"},
        )

    # Explicit schema keeps the columns present when nothing was uploaded.
    result_df = pl.DataFrame(
        results,
        schema={
            "artwork_id": pl.Utf8,
            "s3_bucket": pl.Utf8,
            "s3_prefix": pl.Utf8,
            "files_uploaded": pl.Int64,
        },
    )
    add_dataframe_metadata(context, result_df)
    return result_df
IIIF Server Setup¶
Option 1: Cantaloupe (Java)¶
# Download Cantaloupe
wget https://github.com/cantaloupe-project/cantaloupe/releases/download/v5.0.5/cantaloupe-5.0.5.zip
unzip cantaloupe-5.0.5.zip
# Configure (cantaloupe.properties)
FilesystemSource.BasicLookupStrategy.path_prefix = /path/to/tiles/
endpoint.iiif.2.enabled = true
endpoint.iiif.3.enabled = true
# Run
java -Dcantaloupe.config=./cantaloupe.properties -Xmx2g -jar cantaloupe-5.0.5.jar
Option 2: IIPImage (C++)¶
# Install
apt-get install iipimage-server
# Configure nginx
location /iiif/ {
fastcgi_pass localhost:9000;
fastcgi_param PATH_INFO $fastcgi_script_name;
fastcgi_param REQUEST_METHOD $request_method;
fastcgi_param QUERY_STRING $query_string;
fastcgi_param CONTENT_TYPE $content_type;
fastcgi_param CONTENT_LENGTH $content_length;
}
Option 3: Static Tiles (No Server)¶
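If you generate tiles with layout="iiif", you can serve them statically: upload each tile directory together with its info.json to any static web host or object store (for example an S3 bucket behind a CDN) and point the viewer at that directory's URL.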
Web Viewer Integration¶
OpenSeadragon¶
<!DOCTYPE html>
<html>
<head>
<script src="https://cdn.jsdelivr.net/npm/openseadragon@4.0/build/openseadragon/openseadragon.min.js"></script>
</head>
<body>
<div id="viewer" style="width: 100%; height: 800px;"></div>
<script>
var viewer = OpenSeadragon({
id: "viewer",
prefixUrl: "https://cdn.jsdelivr.net/npm/openseadragon@4.0/build/openseadragon/images/",
tileSources: {
"@context": "http://iiif.io/api/image/3/context.json",
"id": "https://example.com/iiif/artwork123",
"type": "ImageService3",
"profile": "level2",
"protocol": "http://iiif.io/api/image",
"width": 5000,
"height": 7000,
"tiles": [{
"width": 256,
"scaleFactors": [1, 2, 4, 8, 16, 32]
}]
}
});
</script>
</body>
</html>
Mirador (IIIF Viewer)¶
<div id="mirador"></div>
<script src="https://unpkg.com/mirador@latest/dist/mirador.min.js"></script>
<script>
Mirador.viewer({
id: 'mirador',
windows: [{
manifestId: 'https://example.com/iiif/artwork123/manifest.json',
}]
});
</script>
Complete Pipeline Example¶
from pathlib import Path
import dagster as dg
import polars as pl
from cogapp_libs.dagster import add_dataframe_metadata
# 1. List artwork images
@dg.asset
def artwork_images(context: dg.AssetExecutionContext) -> pl.DataFrame:
    """Scan ``data/input/images`` for high-res ``*.tif`` images.

    Returns a DataFrame with one row per file:
    - artwork_id: the file name without its extension
    - image_path: path to the file

    Rows are sorted by path so repeated runs produce the same order, and the
    two columns are present even when no image is found.
    """
    image_dir = Path("data/input/images")
    images = [
        {"artwork_id": img_path.stem, "image_path": str(img_path)}
        # glob() order is filesystem-dependent; sort for reproducible output.
        for img_path in sorted(image_dir.glob("*.tif"))
    ]
    return pl.DataFrame(
        images,
        schema={"artwork_id": pl.Utf8, "image_path": pl.Utf8},
    )
# 2. Generate tiles (defined above)
# @dg.asset
# def generate_iiif_tiles(...): ...
# 3. Create manifests (defined above)
# @dg.asset
# def create_iiif_manifests(...): ...
# 4. Upload to S3 (defined above)
# @dg.asset
# def upload_iiif_to_s3(...): ...
# 5. Update search index
@dg.asset(io_manager_key="elasticsearch_io_manager")
def artwork_search_index(
    create_iiif_manifests: pl.DataFrame,
    artwork_metadata: pl.DataFrame,
) -> pl.DataFrame:
    """Combine manifest locations with artwork metadata for the search index.

    The two frames are joined on ``artwork_id`` and a ``manifest_url`` column
    is added, derived from ``manifest_path`` by swapping the local manifests
    directory prefix for its public URL.
    """
    # Expression that rewrites the local path prefix into the public URL.
    manifest_url = (
        pl.col("manifest_path")
        .str.replace(
            "data/output/iiif/manifests",
            "https://example.com/iiif/manifests"
        )
        .alias("manifest_url")
    )
    with_metadata = create_iiif_manifests.join(artwork_metadata, on="artwork_id")
    return with_metadata.with_columns([manifest_url])
Performance Tips¶
1. Use TIFF Pyramids¶
For very large images, create pyramidal TIFFs first:
# Convert to pyramidal TIFF
image = pyvips.Image.new_from_file("input.jpg")
image.tiffsave(
"output.tif",
tile=True,
pyramid=True,
compression="jpeg",
tile_width=256,
tile_height=256,
)
2. Optimize JPEG Quality¶
image.dzsave(
str(tiles_dir / artwork_id),
suffix=".jpg[Q=85]", # 85% quality (balance size/quality)
tile_size=256,
)
3. Use WebP for Modern Browsers¶
image.dzsave(
str(tiles_dir / artwork_id),
suffix=".webp[Q=80]", # WebP tiles (smaller files)
tile_size=256,
)
4. Pre-generate Thumbnails¶
# Generate thumbnail for previews
thumbnail = image.thumbnail_image(512)
thumbnail.jpegsave(f"thumbnails/{artwork_id}.jpg", Q=90)
Resources¶
- IIIF Specs: https://iiif.io/api/
- pyvips Docs: https://libvips.github.io/pyvips/
- Cantaloupe: https://cantaloupe-project.github.io/
- OpenSeadragon: https://openseadragon.github.io/
- Mirador: https://projectmirador.org/
IIIF enables world-class image delivery for cultural heritage! 🖼️