mirror of
https://github.com/cfpwastaken/planetiler-openmaptiles.git
synced 2026-02-04 12:31:10 +00:00
Improve name:latin logic (#147)
* name:latin improvements * improve latin letter regex * allow region codes and x-extensions on localized names
This commit is contained in:
@@ -42,6 +42,7 @@ import com.onthegomap.planetiler.util.Translations;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
@@ -53,10 +54,23 @@ import java.util.stream.Stream;
|
|||||||
* <a href="https://github.com/openmaptiles/openmaptiles-tools/blob/master/sql/zzz_language.sql">openmaptiles-tools</a>.
|
* <a href="https://github.com/openmaptiles/openmaptiles-tools/blob/master/sql/zzz_language.sql">openmaptiles-tools</a>.
|
||||||
*/
|
*/
|
||||||
public class LanguageUtils {
|
public class LanguageUtils {
|
||||||
|
// See https://github.com/onthegomap/planetiler/issues/86
|
||||||
|
|
||||||
|
// Name tags that should be eligible for finding a latin name.
|
||||||
|
// See https://wiki.openstreetmap.org/wiki/Multilingual_names
|
||||||
|
private static final Predicate<String> VALID_NAME_TAGS =
|
||||||
|
Pattern
|
||||||
|
.compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_](x-)?[a-z]{2,})?(-([a-z]{2}|[0-9]{3}))?$", Pattern.CASE_INSENSITIVE)
|
||||||
|
.asMatchPredicate();
|
||||||
|
|
||||||
|
// Match strings that only contain latin characters.
|
||||||
|
private static final Predicate<String> ONLY_LATIN = Pattern
|
||||||
|
.compile("^[\\P{IsLetter}[\\p{IsLetter}&&\\p{IsLatin}]]+$")
|
||||||
|
.asMatchPredicate();
|
||||||
|
|
||||||
|
// Match only latin letters
|
||||||
|
private static final Pattern LATIN_LETTER = Pattern.compile("[\\p{IsLetter}&&\\p{IsLatin}]+");
|
||||||
|
|
||||||
private static final Pattern NONLATIN = Pattern
|
|
||||||
.compile("[^\\x{0000}-\\x{024f}\\x{1E00}-\\x{1EFF}\\x{0300}-\\x{036f}\\x{0259}]");
|
|
||||||
private static final Pattern LETTER = Pattern.compile("[A-Za-zÀ-ÖØ-öø-ÿĀ-ɏ]+");
|
|
||||||
private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])");
|
private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])");
|
||||||
private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)");
|
private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)");
|
||||||
private static final Pattern WHITESPACE = Pattern.compile("\\s+");
|
private static final Pattern WHITESPACE = Pattern.compile("\\s+");
|
||||||
@@ -73,7 +87,7 @@ public class LanguageUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static boolean containsOnlyLatinCharacters(String string) {
|
static boolean containsOnlyLatinCharacters(String string) {
|
||||||
return string != null && !NONLATIN.matcher(string).find();
|
return string != null && ONLY_LATIN.test(string);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String transliteratedName(Map<String, Object> tags) {
|
private static String transliteratedName(Map<String, Object> tags) {
|
||||||
@@ -84,7 +98,7 @@ public class LanguageUtils {
|
|||||||
if (name == null) {
|
if (name == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
var matcher = LETTER.matcher(name);
|
var matcher = LATIN_LETTER.matcher(name);
|
||||||
if (matcher.find()) {
|
if (matcher.find()) {
|
||||||
String result = matcher.replaceAll("");
|
String result = matcher.replaceAll("");
|
||||||
// if the name was "<nonlatin text> (<latin description>)"
|
// if the name was "<nonlatin text> (<latin description>)"
|
||||||
@@ -128,7 +142,8 @@ public class LanguageUtils {
|
|||||||
|
|
||||||
boolean isLatin = containsOnlyLatinCharacters(name);
|
boolean isLatin = containsOnlyLatinCharacters(name);
|
||||||
String latin = isLatin ? name :
|
String latin = isLatin ? name :
|
||||||
Stream.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags))
|
Stream
|
||||||
|
.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags))
|
||||||
.filter(LanguageUtils::containsOnlyLatinCharacters)
|
.filter(LanguageUtils::containsOnlyLatinCharacters)
|
||||||
.findFirst().orElse(null);
|
.findFirst().orElse(null);
|
||||||
if (latin == null && translations != null && translations.getShouldTransliterate()) {
|
if (latin == null && translations != null && translations.getShouldTransliterate()) {
|
||||||
@@ -160,12 +175,8 @@ public class LanguageUtils {
|
|||||||
|
|
||||||
private static Stream<String> getAllNameTranslationsBesidesEnglishAndGerman(Map<String, Object> tags) {
|
private static Stream<String> getAllNameTranslationsBesidesEnglishAndGerman(Map<String, Object> tags) {
|
||||||
return tags.entrySet().stream()
|
return tags.entrySet().stream()
|
||||||
.filter(e -> {
|
.filter(e -> !EN_DE_NAME_KEYS.contains(e.getKey()) && VALID_NAME_TAGS.test(e.getKey()))
|
||||||
String key = e.getKey();
|
|
||||||
return key.startsWith("name:") && !EN_DE_NAME_KEYS.contains(key);
|
|
||||||
})
|
|
||||||
.map(Map.Entry::getValue)
|
.map(Map.Entry::getValue)
|
||||||
.map(LanguageUtils::string);
|
.map(LanguageUtils::string);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import java.util.Map;
|
|||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.params.ParameterizedTest;
|
import org.junit.jupiter.params.ParameterizedTest;
|
||||||
import org.junit.jupiter.params.provider.CsvSource;
|
import org.junit.jupiter.params.provider.CsvSource;
|
||||||
|
import org.junit.jupiter.params.provider.ValueSource;
|
||||||
|
|
||||||
public class LanguageUtilsTest {
|
public class LanguageUtilsTest {
|
||||||
|
|
||||||
@@ -59,7 +60,7 @@ public class LanguageUtilsTest {
|
|||||||
"é, true",
|
"é, true",
|
||||||
"éś, true",
|
"éś, true",
|
||||||
"ɏə, true",
|
"ɏə, true",
|
||||||
"ɐ, false",
|
"ɐ, true",
|
||||||
"ᵿἀ, false",
|
"ᵿἀ, false",
|
||||||
"Ḁỿ, true",
|
"Ḁỿ, true",
|
||||||
"\u02ff\u0370, false",
|
"\u02ff\u0370, false",
|
||||||
@@ -95,24 +96,95 @@ public class LanguageUtilsTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@ParameterizedTest
|
@ParameterizedTest
|
||||||
@CsvSource({
|
@ValueSource(strings = {
|
||||||
"name, a, true",
|
// OSM tags that SHOULD be eligible for name:latin feature in the output
|
||||||
"name:en, a, true",
|
"name:en",
|
||||||
"int_name, a, true",
|
"name:en-US",
|
||||||
"name:fr, a, true",
|
"name:en-010",
|
||||||
"name:es, a, true",
|
"int_name",
|
||||||
"name:pt, a, true",
|
"name:fr",
|
||||||
"name:de, a, true",
|
"name:es",
|
||||||
"name:ar, ِغَّ, false",
|
"name:pt",
|
||||||
"name:it, a, true",
|
"name:de",
|
||||||
"name:jp, ア, false",
|
"name:ar",
|
||||||
"name:jp-Latn, a, true",
|
"name:it",
|
||||||
"name:jp_rm, a, true",
|
"name:ko-Latn",
|
||||||
|
"name:be-tarask",
|
||||||
|
// https://wiki.openstreetmap.org/wiki/Multilingual_names#Japan
|
||||||
|
"name:ja",
|
||||||
|
"name:ja-Latn",
|
||||||
|
"name:ja_rm",
|
||||||
|
"name:ja_kana",
|
||||||
|
// https://wiki.openstreetmap.org/wiki/Multilingual_names#China
|
||||||
|
"name:zh-CN",
|
||||||
|
"name:zh-hant-CN",
|
||||||
|
"name:zh_pinyin",
|
||||||
|
"name:zh_zhuyin",
|
||||||
|
"name:zh-Latn-tongyong",
|
||||||
|
"name:zh-Latn-pinyin",
|
||||||
|
"name:zh-Latn-wadegiles",
|
||||||
|
"name:yue-Latn-jyutping",
|
||||||
|
// https://wiki.openstreetmap.org/wiki/Multilingual_names#France
|
||||||
|
"name:fr",
|
||||||
|
"name:fr-x-gallo",
|
||||||
|
"name:br",
|
||||||
|
"name:oc",
|
||||||
|
"name:vls",
|
||||||
|
"name:frp",
|
||||||
|
"name:gcf",
|
||||||
|
"name:gsw",
|
||||||
})
|
})
|
||||||
public void testLatinFallbacks(String key, String value, boolean use) {
|
public void testLatinFallbacks(String key) {
|
||||||
assertEquals(use ? value : null, LanguageUtils.getNames(Map.of(
|
assertEquals("a", LanguageUtils.getNames(Map.of(
|
||||||
key, value
|
key, "a"
|
||||||
), translations).get("name:latin"));
|
), translations).get("name:latin"));
|
||||||
|
assertNull(LanguageUtils.getNames(Map.of(
|
||||||
|
key, "ア"
|
||||||
|
), translations).get("name:latin"));
|
||||||
|
assertNull(LanguageUtils.getNames(Map.of(
|
||||||
|
key, "غ"
|
||||||
|
), translations).get("name:latin"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@ValueSource(strings = {
|
||||||
|
// OSM tags that should NOT be eligible for name:latin feature in the output
|
||||||
|
"name:signed",
|
||||||
|
"name:prefix",
|
||||||
|
"name:abbreviation",
|
||||||
|
"name:source",
|
||||||
|
"name:full",
|
||||||
|
"name:adjective",
|
||||||
|
"name:proposed",
|
||||||
|
"name:pronunciation",
|
||||||
|
"name:etymology",
|
||||||
|
"name:etymology:wikidata",
|
||||||
|
"name:etymology:wikipedia",
|
||||||
|
"name:etymology:right",
|
||||||
|
"name:etymology:left",
|
||||||
|
"name:genitive",
|
||||||
|
})
|
||||||
|
public void testNoLatinFallback(String key) {
|
||||||
|
assertSubmap(Map.of(
|
||||||
|
"name", "Branch Hill–Loveland Road",
|
||||||
|
"name_en", "Branch Hill–Loveland Road",
|
||||||
|
"name_de", "Branch Hill–Loveland Road",
|
||||||
|
"name:latin", "Branch Hill–Loveland Road",
|
||||||
|
"name_int", "Branch Hill–Loveland Road"
|
||||||
|
), LanguageUtils.getNames(Map.of(
|
||||||
|
"name", "Branch Hill–Loveland Road",
|
||||||
|
key, "Q22133584;Q843993"
|
||||||
|
), translations));
|
||||||
|
assertSubmap(Map.of(
|
||||||
|
"name", "日",
|
||||||
|
"name_en", "日",
|
||||||
|
"name_de", "日",
|
||||||
|
"name:latin", "rì",
|
||||||
|
"name_int", "rì"
|
||||||
|
), LanguageUtils.getNames(Map.of(
|
||||||
|
"name", "日",
|
||||||
|
key, "other" // don't use this latin string with invalid name keys
|
||||||
|
), translations));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ParameterizedTest
|
@ParameterizedTest
|
||||||
|
|||||||
Reference in New Issue
Block a user