refactor of python writer; separate pyramid building from i/o

This commit is contained in:
Brandon Liu
2022-02-28 10:59:02 +08:00
parent 6beb114b97
commit 7fe6325c74
4 changed files with 144 additions and 84 deletions

View File

@@ -0,0 +1,3 @@
from collections import namedtuple
# One directory record: tile coordinates (z, x, y), the offset and length of
# the data it points at, and is_dir, which marks a pointer to a leaf directory
# rather than to tile data.
Entry = namedtuple("Entry", "z x y offset length is_dir")

View File

@@ -1,11 +1,74 @@
import gzip
import itertools import itertools
import json import json
from contextlib import contextmanager from contextlib import contextmanager
from collections import defaultdict from pmtiles import Entry
def tilesort(t): def entrysort(t):
return (t[0],t[1],t[2]) return (t.z,t.x,t.y)
def find_leaf_level(entries,max_dir_size):
    """Return the zoom level at which leaf directories should be rooted.

    Finds the best base zoom to avoid extra indirection for as many tiles as
    we can: the entry at index ``max_dir_size`` is the first one that no
    longer fits in a single directory, so the level just above its zoom is
    chosen.

    Preconditions: ``entries`` is sorted by (z, x, y), contains only tile
    entries, and ``len(entries) > max_dir_size``.

    Raises:
        IndexError: if ``entries`` has ``max_dir_size`` items or fewer.
    """
    first_overflow = entries[max_dir_size]
    return first_overflow.z - 1
def make_pyramid(tile_entries,start_leaf_offset,max_dir_size=21845):
    """Split tile entries into a root directory and zero or more leaf directories.

    Args:
        tile_entries: iterable of tile ``Entry`` records (``is_dir`` False).
        start_leaf_offset: offset assigned to the first leaf directory; each
            following leaf directory is placed directly after the previous one.
        max_dir_size: maximum number of entries packed into one directory.

    Returns:
        ``(root_entries, leaf_dirs)``. ``root_entries`` holds the tile entries
        below the leaf level followed by one directory entry (``is_dir`` True)
        per leaf-level ancestor tile. ``leaf_dirs`` is a list of entry lists,
        each sorted by (z, x, y), in the order their offsets were assigned.
        When everything fits in one directory, ``leaf_dirs`` is empty.
    """
    # serialized size of one entry: 1 (z) + 3 (x) + 3 (y) + 6 (offset) + 4 (length)
    entry_size = 17

    sorted_entries = sorted(tile_entries,key=entrysort)
    if len(sorted_entries) <= max_dir_size:
        # everything fits in the root directory; no indirection needed
        return (sorted_entries,[])

    # determine root leaf level
    leaf_level = find_leaf_level(sorted_entries,max_dir_size)

    def by_parent(e):
        # coordinates of the ancestor of e at leaf_level
        level_diff = e.z - leaf_level
        return (leaf_level,e.x//(1 << level_diff),e.y//(1 << level_diff))

    root_entries = [e for e in sorted_entries if e.z < leaf_level]
    # get all entries greater than or equal to the leaf level
    entries_in_leaves = [e for e in sorted_entries if e.z >= leaf_level]
    # group the entries by their parent (stable, so each group stays in ZXY order)
    entries_in_leaves.sort(key=by_parent)

    leaf_dirs = []

    def flush(dir_entries,dir_roots,offset):
        # Emit one leaf directory: every root packed into it gets a pointer in
        # the root directory. Returns the offset for the next leaf directory.
        dir_length = entry_size * len(dir_entries)
        for (z,x,y) in dir_roots:
            root_entries.append(Entry(z,x,y,offset,dir_length,True))
        # re-sort the packed entries by ZXY order
        dir_entries.sort(key=entrysort)
        leaf_dirs.append(dir_entries)
        return offset + dir_length

    current_offset = start_leaf_offset
    # pack whole subpyramids into directories
    packed_entries = []
    packed_roots = []
    for root, members in itertools.groupby(entries_in_leaves,key=by_parent):
        subpyramid_entries = list(members)
        # Flush before adding a group that would overflow the directory.
        # NOTE(review): a single subpyramid larger than max_dir_size is never
        # split, so it ends up alone in an oversized leaf directory.
        if packed_entries and len(packed_entries) + len(subpyramid_entries) > max_dir_size:
            current_offset = flush(packed_entries,packed_roots,current_offset)
            packed_entries = []
            packed_roots = []
        packed_entries.extend(subpyramid_entries)
        # The group key is the leaf-level ancestor itself, so the pointer is
        # correct even when that exact tile is absent from the input.
        packed_roots.append(root)

    # finalize the last set
    if packed_entries:
        flush(packed_entries,packed_roots,current_offset)

    # Directory pointers are appended in (leaf_level, x, y) order after the
    # lower-zoom tiles, so root_entries is already in ZXY order.
    return (root_entries,leaf_dirs)
@contextmanager @contextmanager
def write(fname): def write(fname):
@@ -20,40 +83,29 @@ class Writer:
self.f = open(fname,'wb') self.f = open(fname,'wb')
self.offset = 512000 self.offset = 512000
self.f.write(b'\0' * self.offset) self.f.write(b'\0' * self.offset)
self.tiles = [] self.tile_entries = []
self.hash_to_offset = {} self.hash_to_offset = {}
self.leaves = []
self.zoom_counts = defaultdict(int)
def write_tile(self,z,x,y,data): def write_tile(self,z,x,y,data):
hsh = hash(data) hsh = hash(data)
if hsh in self.hash_to_offset: if hsh in self.hash_to_offset:
self.tiles.append((z,x,y,self.hash_to_offset[hsh],len(data))) self.tile_entries.append(Entry(z,x,y,self.hash_to_offset[hsh],len(data),False))
else: else:
self.f.write(data) self.f.write(data)
# TODO optimize order self.tile_entries.append(Entry(z,x,y,self.offset,len(data),False))
self.tiles.append((z,x,y,self.offset,len(data)))
self.hash_to_offset[hsh] = self.offset self.hash_to_offset[hsh] = self.offset
self.offset = self.offset + len(data) self.offset = self.offset + len(data)
self.zoom_counts[z] += 1
def write_entry(self,entry): def write_entry(self,entry):
self.f.write(entry[0].to_bytes(1,byteorder='little')) if entry.is_dir:
self.f.write(entry[1].to_bytes(3,byteorder='little')) z_bytes = 0b10000000 | entry.z
self.f.write(entry[2].to_bytes(3,byteorder='little')) else:
self.f.write(entry[3].to_bytes(6,byteorder='little')) z_bytes = entry.z
self.f.write(entry[4].to_bytes(4,byteorder='little')) self.f.write(z_bytes.to_bytes(1,byteorder='little'))
self.f.write(entry.x.to_bytes(3,byteorder='little'))
def write_leafdir(self,tiles,total_len): self.f.write(entry.y.to_bytes(3,byteorder='little'))
entries_to_sort = [] self.f.write(entry.offset.to_bytes(6,byteorder='little'))
for t in tiles: self.f.write(entry.length.to_bytes(4,byteorder='little'))
self.leaves.append((t[0][0],t[0][1],t[0][2],self.offset,17*total_len))
entries = t[1]
for entry in entries:
entries_to_sort.append(entry)
entries_to_sort.sort(key=tilesort)
for entry in entries_to_sort:
self.write_entry(entry)
def write_header(self,metadata,root_entries_len): def write_header(self,metadata,root_entries_len):
self.f.write((0x4D50).to_bytes(2,byteorder='little')) self.f.write((0x4D50).to_bytes(2,byteorder='little'))
@@ -65,68 +117,21 @@ class Writer:
self.f.write(root_entries_len.to_bytes(2,byteorder='little')) self.f.write(root_entries_len.to_bytes(2,byteorder='little'))
self.f.write(metadata_serialized.encode('utf-8')) self.f.write(metadata_serialized.encode('utf-8'))
def finalize(self,metadata = {}): def finalize(self,metadata = {}):
if len(self.tiles) < 21845: root_dir, leaf_dirs = make_pyramid(self.tile_entries,self.offset)
self.f.seek(0)
self.write_header(metadata,len(self.tiles))
self.tiles.sort(key=tilesort)
for entry in self.tiles:
self.write_entry(entry)
else:
leafdir_tiles = []
leafdir_len = 0
# Find best base zoom to avoid extra indirection for as many tiles as we can if len(leaf_dirs) > 0:
base_zoom = 7 for leaf_dir in leaf_dirs:
n_so_far = sum(self.zoom_counts[z] for z in range(0,8)) for entry in leaf_dir:
while n_so_far + self.zoom_counts[base_zoom+1] < 21845:
n_so_far += self.zoom_counts[base_zoom+1]
base_zoom += 1
def by_parent(t):
if t[0] >= base_zoom:
level_diff = t[0] - base_zoom
return (base_zoom,t[1]//(1 << level_diff),t[2]//(1 << level_diff))
else:
return (0,t[1]//(1 << t[0]),t[2]//(1 << t[0]))
# TODO optimize order
self.tiles.sort(key=by_parent)
for group in itertools.groupby(self.tiles,key=by_parent):
if group[0][0] != base_zoom:
continue
entries = list(group[1])
if leafdir_len + len(entries) <= 21845:
leafdir_tiles.append((group[0],entries))
leafdir_len = leafdir_len + len(entries)
else:
self.write_leafdir(leafdir_tiles,leafdir_len)
self.offset += 17 * leafdir_len
leafdir_tiles = [(group[0],entries)]
leafdir_len = len(entries)
# finalize
if len(leafdir_tiles):
self.write_leafdir(leafdir_tiles,leafdir_len)
root_tiles = []
root = [(group[0],list(group[1])) for group in itertools.groupby(self.tiles,key=by_parent) if group[0][0] == 0]
if root:
root_tiles = root[0][1]
self.f.seek(0)
self.write_header(metadata,len(root_tiles) + len(self.leaves))
root_tiles.sort(key=tilesort)
for entry in root_tiles:
self.write_entry(entry) self.write_entry(entry)
# the leaf level > the root tile entries self.f.seek(0)
self.leaves.sort(key=tilesort) self.write_header(metadata,len(root_dir))
for entry in self.leaves:
z_dir = (0b10000000 | entry[0])
self.write_entry((z_dir,entry[1],entry[2],entry[3],entry[4]))
return {'num_tiles':len(self.tiles),'num_unique_tiles':len(self.hash_to_offset),'num_leaves':len(self.leaves)} for entry in root_dir:
self.write_entry(entry)
return {'num_tiles':len(self.tile_entries),'num_unique_tiles':len(self.hash_to_offset),'num_leaves':len(leaf_dirs)}
def close(self): def close(self):
self.f.close() self.f.close()

0
python/test/__init__.py Normal file
View File

View File

@@ -0,0 +1,52 @@
import unittest
from pmtiles import Entry
from pmtiles.writer import find_leaf_level, make_pyramid
class TestTilePyramid(unittest.TestCase):
    """Checks how make_pyramid splits entries into root and leaf directories."""

    def test_root_sorted(self):
        # Fewer entries than max_dir_size: everything stays in the root,
        # returned in (z, x, y) order regardless of input order.
        entries = [
            Entry(1,0,0,1,1,False),
            Entry(1,0,1,2,1,False),
            Entry(1,1,0,3,1,False),
            Entry(1,1,1,4,1,False),
            Entry(0,0,0,0,1,False)
        ]
        root_entries, leaf_dirs = make_pyramid(entries,0,6)
        self.assertEqual(len(root_entries),5)
        self.assertEqual(len(leaf_dirs),0)
        self.assertEqual(root_entries[0].z,0)
        self.assertEqual(root_entries[4].z,1)
        self.assertEqual(
            [(e.z,e.x,e.y) for e in root_entries],
            [(0,0,0),(1,0,0),(1,0,1),(1,1,0),(1,1,1)]
        )

    def test_leafdir(self):
        # 9 entries with max_dir_size 7: z0-z1 tiles stay in the root and the
        # z2/z3 tiles are packed into a single leaf directory rooted at z2.
        entries = [
            Entry(0,0,0,0,1,False),
            Entry(1,0,0,1,1,False),
            Entry(1,0,1,2,1,False),
            Entry(1,1,0,3,1,False),
            Entry(1,1,1,4,1,False),
            Entry(2,0,0,5,1,False),
            Entry(3,0,0,6,1,False),
            Entry(2,0,1,7,1,False),
            Entry(3,0,2,8,1,False)
        ]
        root_entries, leaf_dirs = make_pyramid(entries,0,7)
        self.assertEqual(len(root_entries),7)
        self.assertEqual(len(leaf_dirs),1)
        self.assertEqual(len(leaf_dirs[0]),4)
        self.assertEqual(leaf_dirs[0][0].z,2)
        self.assertEqual(leaf_dirs[0][1].z,2)
        self.assertEqual(leaf_dirs[0][2].z,3)
        self.assertEqual(leaf_dirs[0][3].z,3)
        # five tile entries followed by two pointers to the one leaf directory
        self.assertEqual([e.is_dir for e in root_entries],[False] * 5 + [True] * 2)
        for pointer in root_entries[5:]:
            self.assertEqual(pointer.offset,0)
            self.assertEqual(pointer.length,17 * 4)

    def test_full_z7_pyramid(self):
        entries = []
        # create a complete artificial pyramid covering zoom levels 0 through 8
        for z in range(0,9):
            for x in range(0,pow(2,z)):
                for y in range(0,pow(2,z)):
                    entries.append(Entry(z,x,y,0,0,False))
        self.assertEqual(find_leaf_level(entries,21845),7)
        root_entries, leaf_dirs = make_pyramid(entries,0)
        self.assertEqual(len(root_entries),21845)
        self.assertEqual(len(leaf_dirs),4)
        for leaf_dir in leaf_dirs:
            self.assertLessEqual(len(leaf_dir),21845)
        # every z7 and z8 tile lands in exactly one leaf directory
        self.assertEqual(sum(len(d) for d in leaf_dirs),pow(4,7) + pow(4,8))