From c7ee23e8aa6ab9855237d37066a85204faf510c0 Mon Sep 17 00:00:00 2001 From: Michael Barry Date: Fri, 25 Mar 2022 05:39:12 -0400 Subject: [PATCH] Improve name:latin logic (#147) * name:latin improvements * improve latin letter regex * allow region codes and x-extensions on localized names --- .../basemap/util/LanguageUtils.java | 33 ++++-- .../basemap/util/LanguageUtilsTest.java | 106 +++++++++++++++--- 2 files changed, 111 insertions(+), 28 deletions(-) diff --git a/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java b/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java index 2757107..b5f44f0 100644 --- a/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java +++ b/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java @@ -42,6 +42,7 @@ import com.onthegomap.planetiler.util.Translations; import java.util.HashMap; import java.util.Map; import java.util.Set; +import java.util.function.Predicate; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -53,10 +54,23 @@ import java.util.stream.Stream; * openmaptiles-tools. */ public class LanguageUtils { + // See https://github.com/onthegomap/planetiler/issues/86 + + // Name tags that should be eligible for finding a latin name. + // See https://wiki.openstreetmap.org/wiki/Multilingual_names + private static final Predicate VALID_NAME_TAGS = + Pattern + .compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_](x-)?[a-z]{2,})?(-([a-z]{2}|[0-9]{3}))?$", Pattern.CASE_INSENSITIVE) + .asMatchPredicate(); + + // Match strings that only contain latin characters. 
+ private static final Predicate ONLY_LATIN = Pattern + .compile("^[\\P{IsLetter}[\\p{IsLetter}&&\\p{IsLatin}]]+$") + .asMatchPredicate(); + + // Match only latin letters + private static final Pattern LATIN_LETTER = Pattern.compile("[\\p{IsLetter}&&\\p{IsLatin}]+"); - private static final Pattern NONLATIN = Pattern - .compile("[^\\x{0000}-\\x{024f}\\x{1E00}-\\x{1EFF}\\x{0300}-\\x{036f}\\x{0259}]"); - private static final Pattern LETTER = Pattern.compile("[A-Za-zÀ-ÖØ-öø-ÿĀ-ɏ]+"); private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])"); private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)"); private static final Pattern WHITESPACE = Pattern.compile("\\s+"); @@ -73,7 +87,7 @@ public class LanguageUtils { } static boolean containsOnlyLatinCharacters(String string) { - return string != null && !NONLATIN.matcher(string).find(); + return string != null && ONLY_LATIN.test(string); } private static String transliteratedName(Map tags) { @@ -84,7 +98,7 @@ public class LanguageUtils { if (name == null) { return null; } - var matcher = LETTER.matcher(name); + var matcher = LATIN_LETTER.matcher(name); if (matcher.find()) { String result = matcher.replaceAll(""); // if the name was " ( getAllNameTranslationsBesidesEnglishAndGerman(Map tags) { return tags.entrySet().stream() - .filter(e -> { - String key = e.getKey(); - return key.startsWith("name:") && !EN_DE_NAME_KEYS.contains(key); - }) + .filter(e -> !EN_DE_NAME_KEYS.contains(e.getKey()) && VALID_NAME_TAGS.test(e.getKey())) .map(Map.Entry::getValue) .map(LanguageUtils::string); } - } diff --git a/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java b/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java index 20caacd..7b8e7f4 100644 --- a/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java +++ b/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java 
@@ -13,6 +13,7 @@ import java.util.Map; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.ValueSource; public class LanguageUtilsTest { @@ -59,7 +60,7 @@ public class LanguageUtilsTest { "é, true", "éś, true", "ɏə, true", - "ɐ, false", + "ɐ, true", "ᵿἀ, false", "Ḁỿ, true", "\u02ff\u0370, false", @@ -95,24 +96,95 @@ public class LanguageUtilsTest { } @ParameterizedTest - @CsvSource({ - "name, a, true", - "name:en, a, true", - "int_name, a, true", - "name:fr, a, true", - "name:es, a, true", - "name:pt, a, true", - "name:de, a, true", - "name:ar, ِغَّ, false", - "name:it, a, true", - "name:jp, ア, false", - "name:jp-Latn, a, true", - "name:jp_rm, a, true", + @ValueSource(strings = { + // OSM tags that SHOULD be eligible for name:latin feature in the output + "name:en", + "name:en-US", + "name:en-010", + "int_name", + "name:fr", + "name:es", + "name:pt", + "name:de", + "name:ar", + "name:it", + "name:ko-Latn", + "name:be-tarask", + // https://wiki.openstreetmap.org/wiki/Multilingual_names#Japan + "name:ja", + "name:ja-Latn", + "name:ja_rm", + "name:ja_kana", + // https://wiki.openstreetmap.org/wiki/Multilingual_names#China + "name:zh-CN", + "name:zh-hant-CN", + "name:zh_pinyin", + "name:zh_zhuyin", + "name:zh-Latn-tongyong", + "name:zh-Latn-pinyin", + "name:zh-Latn-wadegiles", + "name:yue-Latn-jyutping", + // https://wiki.openstreetmap.org/wiki/Multilingual_names#France + "name:fr", + "name:fr-x-gallo", + "name:br", + "name:oc", + "name:vls", + "name:frp", + "name:gcf", + "name:gsw", }) - public void testLatinFallbacks(String key, String value, boolean use) { - assertEquals(use ? 
value : null, LanguageUtils.getNames(Map.of( - key, value + public void testLatinFallbacks(String key) { + assertEquals("a", LanguageUtils.getNames(Map.of( + key, "a" ), translations).get("name:latin")); + assertNull(LanguageUtils.getNames(Map.of( + key, "ア" + ), translations).get("name:latin")); + assertNull(LanguageUtils.getNames(Map.of( + key, "غ" + ), translations).get("name:latin")); + } + + @ParameterizedTest + @ValueSource(strings = { + // OSM tags that should NOT be eligible for name:latin feature in the output + "name:signed", + "name:prefix", + "name:abbreviation", + "name:source", + "name:full", + "name:adjective", + "name:proposed", + "name:pronunciation", + "name:etymology", + "name:etymology:wikidata", + "name:etymology:wikipedia", + "name:etymology:right", + "name:etymology:left", + "name:genitive", + }) + public void testNoLatinFallback(String key) { + assertSubmap(Map.of( + "name", "Branch Hill–Loveland Road", + "name_en", "Branch Hill–Loveland Road", + "name_de", "Branch Hill–Loveland Road", + "name:latin", "Branch Hill–Loveland Road", + "name_int", "Branch Hill–Loveland Road" + ), LanguageUtils.getNames(Map.of( + "name", "Branch Hill–Loveland Road", + key, "Q22133584;Q843993" + ), translations)); + assertSubmap(Map.of( + "name", "日", + "name_en", "日", + "name_de", "日", + "name:latin", "rì", + "name_int", "rì" + ), LanguageUtils.getNames(Map.of( + "name", "日", + key, "other" // don't use this latin string with invalid name keys + ), translations)); } @ParameterizedTest