Improve name:latin logic (#147)

* name:latin improvements

* improve latin letter regex

* allow region codes and x-extensions on localized names
This commit is contained in:
Michael Barry
2022-03-25 05:39:12 -04:00
committed by GitHub
parent 31ced408c4
commit c7ee23e8aa
2 changed files with 111 additions and 28 deletions

View File

@@ -42,6 +42,7 @@ import com.onthegomap.planetiler.util.Translations;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Stream; import java.util.stream.Stream;
@@ -53,10 +54,23 @@ import java.util.stream.Stream;
* <a href="https://github.com/openmaptiles/openmaptiles-tools/blob/master/sql/zzz_language.sql">openmaptiles-tools</a>. * <a href="https://github.com/openmaptiles/openmaptiles-tools/blob/master/sql/zzz_language.sql">openmaptiles-tools</a>.
*/ */
public class LanguageUtils { public class LanguageUtils {
// See https://github.com/onthegomap/planetiler/issues/86
// Name tags that should be eligible for finding a latin name.
// See https://wiki.openstreetmap.org/wiki/Multilingual_names
private static final Predicate<String> VALID_NAME_TAGS =
Pattern
.compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_](x-)?[a-z]{2,})?(-([a-z]{2}|[0-9]{3}))?$", Pattern.CASE_INSENSITIVE)
.asMatchPredicate();
// Match non-empty strings whose letters are all latin; non-letter characters (digits, punctuation, whitespace) are allowed.
private static final Predicate<String> ONLY_LATIN = Pattern
.compile("^[\\P{IsLetter}[\\p{IsLetter}&&\\p{IsLatin}]]+$")
.asMatchPredicate();
// Match only latin letters
private static final Pattern LATIN_LETTER = Pattern.compile("[\\p{IsLetter}&&\\p{IsLatin}]+");
private static final Pattern NONLATIN = Pattern
.compile("[^\\x{0000}-\\x{024f}\\x{1E00}-\\x{1EFF}\\x{0300}-\\x{036f}\\x{0259}]");
private static final Pattern LETTER = Pattern.compile("[A-Za-zÀ-ÖØ-öø-ÿĀ-ɏ]+");
private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])"); private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])");
private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)"); private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)");
private static final Pattern WHITESPACE = Pattern.compile("\\s+"); private static final Pattern WHITESPACE = Pattern.compile("\\s+");
@@ -73,7 +87,7 @@ public class LanguageUtils {
} }
static boolean containsOnlyLatinCharacters(String string) { static boolean containsOnlyLatinCharacters(String string) {
return string != null && !NONLATIN.matcher(string).find(); return string != null && ONLY_LATIN.test(string);
} }
private static String transliteratedName(Map<String, Object> tags) { private static String transliteratedName(Map<String, Object> tags) {
@@ -84,7 +98,7 @@ public class LanguageUtils {
if (name == null) { if (name == null) {
return null; return null;
} }
var matcher = LETTER.matcher(name); var matcher = LATIN_LETTER.matcher(name);
if (matcher.find()) { if (matcher.find()) {
String result = matcher.replaceAll(""); String result = matcher.replaceAll("");
// if the name was "<nonlatin text> (<latin description>)" // if the name was "<nonlatin text> (<latin description>)"
@@ -128,7 +142,8 @@ public class LanguageUtils {
boolean isLatin = containsOnlyLatinCharacters(name); boolean isLatin = containsOnlyLatinCharacters(name);
String latin = isLatin ? name : String latin = isLatin ? name :
Stream.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags)) Stream
.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags))
.filter(LanguageUtils::containsOnlyLatinCharacters) .filter(LanguageUtils::containsOnlyLatinCharacters)
.findFirst().orElse(null); .findFirst().orElse(null);
if (latin == null && translations != null && translations.getShouldTransliterate()) { if (latin == null && translations != null && translations.getShouldTransliterate()) {
@@ -160,12 +175,8 @@ public class LanguageUtils {
private static Stream<String> getAllNameTranslationsBesidesEnglishAndGerman(Map<String, Object> tags) { private static Stream<String> getAllNameTranslationsBesidesEnglishAndGerman(Map<String, Object> tags) {
return tags.entrySet().stream() return tags.entrySet().stream()
.filter(e -> { .filter(e -> !EN_DE_NAME_KEYS.contains(e.getKey()) && VALID_NAME_TAGS.test(e.getKey()))
String key = e.getKey();
return key.startsWith("name:") && !EN_DE_NAME_KEYS.contains(key);
})
.map(Map.Entry::getValue) .map(Map.Entry::getValue)
.map(LanguageUtils::string); .map(LanguageUtils::string);
} }
} }

View File

@@ -13,6 +13,7 @@ import java.util.Map;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource; import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.ValueSource;
public class LanguageUtilsTest { public class LanguageUtilsTest {
@@ -59,7 +60,7 @@ public class LanguageUtilsTest {
"é, true", "é, true",
"éś, true", "éś, true",
"ɏə, true", "ɏə, true",
"ɐ, false", "ɐ, true",
"ᵿἀ, false", "ᵿἀ, false",
"Ḁỿ, true", "Ḁỿ, true",
"\u02ff\u0370, false", "\u02ff\u0370, false",
@@ -95,24 +96,95 @@ public class LanguageUtilsTest {
} }
@ParameterizedTest @ParameterizedTest
@CsvSource({ @ValueSource(strings = {
"name, a, true", // OSM tags that SHOULD be eligible for name:latin feature in the output
"name:en, a, true", "name:en",
"int_name, a, true", "name:en-US",
"name:fr, a, true", "name:en-010",
"name:es, a, true", "int_name",
"name:pt, a, true", "name:fr",
"name:de, a, true", "name:es",
"name:ar, ِغَّ, false", "name:pt",
"name:it, a, true", "name:de",
"name:jp, ア, false", "name:ar",
"name:jp-Latn, a, true", "name:it",
"name:jp_rm, a, true", "name:ko-Latn",
"name:be-tarask",
// https://wiki.openstreetmap.org/wiki/Multilingual_names#Japan
"name:ja",
"name:ja-Latn",
"name:ja_rm",
"name:ja_kana",
// https://wiki.openstreetmap.org/wiki/Multilingual_names#China
"name:zh-CN",
"name:zh-hant-CN",
"name:zh_pinyin",
"name:zh_zhuyin",
"name:zh-Latn-tongyong",
"name:zh-Latn-pinyin",
"name:zh-Latn-wadegiles",
"name:yue-Latn-jyutping",
// https://wiki.openstreetmap.org/wiki/Multilingual_names#France
"name:fr",
"name:fr-x-gallo",
"name:br",
"name:oc",
"name:vls",
"name:frp",
"name:gcf",
"name:gsw",
}) })
public void testLatinFallbacks(String key, String value, boolean use) { public void testLatinFallbacks(String key) {
assertEquals(use ? value : null, LanguageUtils.getNames(Map.of( assertEquals("a", LanguageUtils.getNames(Map.of(
key, value key, "a"
), translations).get("name:latin")); ), translations).get("name:latin"));
assertNull(LanguageUtils.getNames(Map.of(
key, ""
), translations).get("name:latin"));
assertNull(LanguageUtils.getNames(Map.of(
key, "غ"
), translations).get("name:latin"));
}
@ParameterizedTest
@ValueSource(strings = {
// OSM tags that should NOT be eligible for name:latin feature in the output
"name:signed",
"name:prefix",
"name:abbreviation",
"name:source",
"name:full",
"name:adjective",
"name:proposed",
"name:pronunciation",
"name:etymology",
"name:etymology:wikidata",
"name:etymology:wikipedia",
"name:etymology:right",
"name:etymology:left",
"name:genitive",
})
public void testNoLatinFallback(String key) {
assertSubmap(Map.of(
"name", "Branch HillLoveland Road",
"name_en", "Branch HillLoveland Road",
"name_de", "Branch HillLoveland Road",
"name:latin", "Branch HillLoveland Road",
"name_int", "Branch HillLoveland Road"
), LanguageUtils.getNames(Map.of(
"name", "Branch HillLoveland Road",
key, "Q22133584;Q843993"
), translations));
assertSubmap(Map.of(
"name", "",
"name_en", "",
"name_de", "",
"name:latin", "",
"name_int", ""
), LanguageUtils.getNames(Map.of(
"name", "",
key, "other" // don't use this latin string with invalid name keys
), translations));
} }
@ParameterizedTest @ParameterizedTest