python: pmtiles-convert from mbtiles writes v3 spec

This commit is contained in:
Brandon Liu
2022-10-12 00:02:59 +08:00
parent 43e46371c1
commit 0f03dc3312
8 changed files with 293 additions and 336 deletions

View File

@@ -1,153 +1,133 @@
import itertools
import json
import tempfile
import gzip
import shutil
from contextlib import contextmanager
from pmtiles import Entry
def entrysort(t):
    """Sort key for an entry: zoom level first, then x, then y."""
    zoom, column, row = t.z, t.x, t.y
    return (zoom, column, row)
def find_leaf_level(entries, max_dir_size):
    """Pick the base zoom that avoids extra indirection for as many tiles as possible.

    Precondition: ``entries`` is sorted, holds only tile entries, and
    ``len(entries) > max_dir_size``.

    Returns one less than the zoom of the entry at index ``max_dir_size``,
    i.e. the first entry that no longer fits in a single directory.
    """
    first_overflow = entries[max_dir_size]
    return first_overflow.z - 1
def _flush_leaf(root_entries, leaf_dirs, packed_roots, packed_entries, offset):
    """Emit one leaf directory plus the root entries that point at it.

    Appends one directory-type ``Entry`` per packed root to ``root_entries``,
    re-sorts ``packed_entries`` into ZXY order and appends it to ``leaf_dirs``.
    Returns the offset immediately after the emitted leaf. An empty batch emits
    nothing and returns ``offset`` unchanged.
    """
    if not packed_entries:
        return offset
    # Each entry is sized at 17 bytes, the factor this module uses for leaf lengths.
    leaf_len = 17 * len(packed_entries)
    root_entries.extend(
        Entry(z, x, y, offset, leaf_len, True) for z, x, y in packed_roots
    )
    packed_entries.sort(key=entrysort)
    leaf_dirs.append(packed_entries)
    return offset + leaf_len


def make_pyramid(tile_entries, start_leaf_offset, max_dir_size=21845):
    """Split tile entries into a root directory and a list of leaf directories.

    Args:
        tile_entries: iterable of tile entries exposing ``z``, ``x`` and ``y``.
        start_leaf_offset: byte offset at which the first leaf directory starts.
        max_dir_size: maximum number of entries allowed in one directory.

    Returns:
        ``(root_entries, leaf_dirs)``. When everything fits in one directory,
        ``root_entries`` is the sorted input and ``leaf_dirs`` is empty.
        Otherwise ``root_entries`` holds the tiles above the leaf level followed
        by directory entries pointing at the leaves, and ``leaf_dirs`` is a list
        of ZXY-sorted entry lists.

    A single parent group larger than ``max_dir_size`` is still emitted as one
    (oversized) leaf. No empty leaf directory is ever emitted.
    """
    sorted_entries = sorted(tile_entries, key=entrysort)
    if len(sorted_entries) <= max_dir_size:
        return (sorted_entries, [])

    # determine root leaf level
    leaf_level = find_leaf_level(sorted_entries, max_dir_size)

    def by_parent(e):
        # Ancestor tile of `e` at the leaf level, as a (z, x, y) tuple.
        level_diff = e.z - leaf_level
        return (leaf_level, e.x // (1 << level_diff), e.y // (1 << level_diff))

    root_entries = [e for e in sorted_entries if e.z < leaf_level]
    # Everything at or below the leaf level goes into leaves, grouped by parent.
    # The sort is stable, so entries keep their ZXY order within each parent group.
    entries_in_leaves = [e for e in sorted_entries if e.z >= leaf_level]
    entries_in_leaves.sort(key=by_parent)

    leaf_dirs = []
    current_offset = start_leaf_offset
    packed_entries = []
    packed_roots = []
    for root, members in itertools.groupby(entries_in_leaves, key=by_parent):
        subpyramid_entries = list(members)
        if len(packed_entries) + len(subpyramid_entries) <= max_dir_size:
            # Several small sub-pyramids may share one leaf directory.
            packed_entries.extend(subpyramid_entries)
            packed_roots.append(root)
        else:
            # This group does not fit: emit what has been packed so far (if
            # anything) and start a new batch with this group.
            current_offset = _flush_leaf(
                root_entries, leaf_dirs, packed_roots, packed_entries, current_offset
            )
            packed_entries = subpyramid_entries
            packed_roots = [root]

    # finalize the last set
    _flush_leaf(root_entries, leaf_dirs, packed_roots, packed_entries, current_offset)
    return (root_entries, leaf_dirs)
from .tile import Entry, serialize_directory, Compression, serialize_header
# Context manager: opens `fname` for binary writing, yields a Writer wrapping
# the file object, and closes the file once the `with` body exits.
@contextmanager
def write(fname):
f = open(fname, "wb")
# NOTE(review): two constructor calls appear because this listing interleaves
# both sides of a commit diff. `Writer(f, 21845)` looks like the previous
# revision's call and `Writer(f)` its replacement -- confirm against the repo.
w = Writer(f, 21845)
w = Writer(f)
try:
yield w
finally:
# Runs whether or not the `with` body raised.
f.close()
def build_roots_leaves(entries, leaf_size):
    """Chunk ``entries`` into leaf directories and build the root that indexes them.

    Args:
        entries: sequence of tile entries, each exposing ``tile_id``.
        leaf_size: number of entries per leaf directory; must be positive.

    Returns:
        ``(root_bytes, leaves_bytes, num_leaves)``: the serialized root
        directory, all serialized leaf directories concatenated in order, and
        the number of leaves.
    """
    root_entries = []
    leaf_chunks = []
    leaves_len = 0  # running byte offset of the next leaf within leaves_bytes
    for start in range(0, len(entries), leaf_size):
        serialized = serialize_directory(entries[start : start + leaf_size])
        # Key each root entry by the first tile id of ITS OWN leaf; keying every
        # entry by entries[0] would make the leaves indistinguishable on lookup.
        # The trailing 0 is the run-length field, which presumably marks the
        # entry as a pointer to a leaf directory -- confirm against the spec.
        root_entries.append(
            Entry(entries[start].tile_id, leaves_len, len(serialized), 0)
        )
        leaf_chunks.append(serialized)
        leaves_len += len(serialized)

    # Join once instead of growing a bytes object with += inside the loop.
    return serialize_directory(root_entries), b"".join(leaf_chunks), len(leaf_chunks)
def optimize_directories(entries, target_root_len):
    """Serialize ``entries`` so that the root directory is under ``target_root_len`` bytes.

    If the whole entry list serializes to fewer than ``target_root_len`` bytes
    it is returned as the root with no leaves. Otherwise the entries are split
    into leaf directories, starting at 4096 entries per leaf and doubling the
    leaf size until the serialized root is small enough.

    Returns:
        ``(root_bytes, leaves_bytes, num_leaves)``.
    """
    flat_root = serialize_directory(entries)
    if len(flat_root) < target_root_len:
        return flat_root, b"", 0

    leaf_size = 4096
    root_bytes, leaves_bytes, num_leaves = build_roots_leaves(entries, leaf_size)
    while len(root_bytes) >= target_root_len:
        # Bigger leaves mean fewer root entries, hence a smaller root.
        leaf_size *= 2
        root_bytes, leaves_bytes, num_leaves = build_roots_leaves(entries, leaf_size)
    return root_bytes, leaves_bytes, num_leaves
# Writer accumulates tile data and directory entries via write_tile() and emits
# the archive in finalize().
# NOTE(review): this listing interleaves the removed and added lines of a commit
# diff with indentation stripped, so __init__, write_tile and finalize each
# appear twice (old signature first, new signature second). The comments below
# flag which side a line appears to belong to; confirm against the repository.
class Writer:
# Apparently the previous revision's constructor: it took max_dir_size and
# started tile data at byte 512000, zero-filling the file up to that point.
def __init__(self, f, max_dir_size):
self.offset = 512000
# Constructor taking only the output file object `f`.
def __init__(self, f):
self.f = f
self.f.write(b"\0" * self.offset)
# Entry objects in the order write_tile() recorded them.
self.tile_entries = []
# Maps hash(data) to the offset at which that data was first written.
self.hash_to_offset = {}
self.max_dir_size = max_dir_size
# Tile data is buffered in a temporary file and copied out by finalize().
self.tile_f = tempfile.TemporaryFile()
# Running byte length of the tile data written so far.
self.offset = 0
# Counter reported by finalize() as the number of addressed tiles.
self.addressed_tiles = 0
# Apparently the previous revision's signature (z/x/y addressing).
def write_tile(self, z, x, y, data):
# TODO enforce ordered writes
# Record one tile: `tileid` is the tile's integer id, `data` its payload.
def write_tile(self, tileid, data):
# NOTE(review): tiles are deduplicated by Python's hash() of the payload alone;
# two different payloads with equal hashes would be stored as one -- consider
# a content digest.
hsh = hash(data)
if hsh in self.hash_to_offset:
self.tile_entries.append(
Entry(z, x, y, self.hash_to_offset[hsh], len(data), False)
)
# Duplicate payload: if this tile id directly follows the run covered by the
# last entry and that entry points at the same data, extend its run_length;
# otherwise add a new entry (run_length 1) pointing at the existing data.
last = self.tile_entries[-1]
found = self.hash_to_offset[hsh]
if tileid == last.tile_id + last.run_length and last.offset == found:
self.tile_entries[-1].run_length += 1
else:
self.tile_entries.append(Entry(tileid, found, len(data), 1))
else:
# New payload: write it, record an entry at the current offset, remember the
# offset for this hash, and advance the offset by the payload length.
self.f.write(data)
self.tile_entries.append(Entry(z, x, y, self.offset, len(data), False))
self.tile_f.write(data)
self.tile_entries.append(Entry(tileid, self.offset, len(data), 1))
self.hash_to_offset[hsh] = self.offset
self.offset = self.offset + len(data)
self.offset += len(data)
# Serializes one entry as 17 little-endian bytes: z (1 byte, high bit set when
# entry.is_dir), x (3), y (3), offset (6), length (4).
def _write_entry(self, entry):
if entry.is_dir:
z_bytes = 0b10000000 | entry.z
else:
z_bytes = entry.z
self.f.write(z_bytes.to_bytes(1, byteorder="little"))
self.f.write(entry.x.to_bytes(3, byteorder="little"))
self.f.write(entry.y.to_bytes(3, byteorder="little"))
self.f.write(entry.offset.to_bytes(6, byteorder="little"))
self.f.write(entry.length.to_bytes(4, byteorder="little"))
# NOTE(review): this increment presumably belongs to the end of the new
# write_tile (once per call), not to _write_entry -- confirm.
self.addressed_tiles += 1
# Writes, little-endian: the 2-byte value 0x4D50, the 2-byte version 2, the
# 4-byte metadata length, the 2-byte root entry count, then the JSON metadata
# encoded as UTF-8.
def _write_header(self, metadata, root_entries_len):
self.f.write((0x4D50).to_bytes(2, byteorder="little"))
self.f.write((2).to_bytes(2, byteorder="little"))
metadata_serialized = json.dumps(metadata)
# 512000 - (17 * 21845) - 2 (magic) - 2 (version) - 4 (jsonlen) - 2 (dictentries) = 140625
assert len(metadata_serialized) < 140625
self.f.write(len(metadata_serialized).to_bytes(4, byteorder="little"))
self.f.write(root_entries_len.to_bytes(2, byteorder="little"))
self.f.write(metadata_serialized.encode("utf-8"))
# Completes the archive: prints statistics, fills in the `header` dict, builds
# the directories, and writes header, root directory, gzip-compressed JSON
# metadata, leaf directories and tile data to self.f, in that order.
def finalize(self, header, metadata):
print("# of addressed tiles:", self.addressed_tiles)
print("# of tile entries (after RLE):", len(self.tile_entries))
print("# of tile contents:", len(self.hash_to_offset))
# Apparently the previous revision's signature and make_pyramid call.
def finalize(self, metadata={}):
root_dir, leaf_dirs = make_pyramid(
self.tile_entries, self.offset, self.max_dir_size
header["addressed_tiles_count"] = self.addressed_tiles
header["tile_entries_count"] = len(self.tile_entries)
header["tile_contents_count"] = len(self.hash_to_offset)
# The root must serialize to fewer than 16384 - 127 bytes; 127 is also the
# root_offset set below.
root_bytes, leaves_bytes, num_leaves = optimize_directories(
self.tile_entries, 16384 - 127
)
if len(leaf_dirs) > 0:
for leaf_dir in leaf_dirs:
for entry in leaf_dir:
self._write_entry(entry)
# NOTE(review): both branches below divide by self.addressed_tiles, which is 0
# when no tile was written -- ZeroDivisionError for an empty archive.
if num_leaves > 0:
print("Root dir bytes:", len(root_bytes))
print("Leaves dir bytes:", len(leaves_bytes))
print("Num leaf dirs:", num_leaves)
print("Total dir bytes:", len(root_bytes) + len(leaves_bytes))
print("Average leaf dir bytes:", len(leaves_bytes) / num_leaves)
print(
"Average bytes per entry:",
(len(root_bytes) + len(leaves_bytes)) / self.addressed_tiles,
)
else:
print("Total dir bytes:", len(root_bytes))
print(
"Average bytes per addressed tile:",
len(root_bytes) / self.addressed_tiles,
)
self.f.seek(0)
self._write_header(metadata, len(root_dir))
compressed_metadata = gzip.compress(json.dumps(metadata).encode())
header["clustered"] = True
header["internal_compression"] = Compression.GZIP
header[
"tile_compression"
] = Compression.GZIP # TODO: not necessarily true for non-vector
# Each section's offset is the previous section's offset plus its length,
# starting from the root directory at byte 127.
header["root_offset"] = 127
header["root_length"] = len(root_bytes)
header["metadata_offset"] = header["root_offset"] + header["root_length"]
header["metadata_length"] = len(compressed_metadata)
header["leaf_directory_offset"] = (
header["metadata_offset"] + header["metadata_length"]
)
header["leaf_directory_length"] = len(leaves_bytes)
header["tile_data_offset"] = (
header["leaf_directory_offset"] + header["leaf_directory_length"]
)
header["tile_data_length"] = self.offset
for entry in root_dir:
self._write_entry(entry)
header_bytes = serialize_header(header)
return {
"num_tiles": len(self.tile_entries),
"num_unique_tiles": len(self.hash_to_offset),
"num_leaves": len(leaf_dirs),
}
# Sections are written in the same order as the offsets computed above.
self.f.write(header_bytes)
self.f.write(root_bytes)
self.f.write(compressed_metadata)
self.f.write(leaves_bytes)
# Rewind the temporary tile file and append its whole contents to the output.
self.tile_f.seek(0)
shutil.copyfileobj(self.tile_f, self.f)