diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestURLs.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestURLs.java
new file mode 100644
index 00000000000..d878a4af97b
--- /dev/null
+++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestURLs.java
@@ -0,0 +1,419 @@
+package org.unicode.cldr.unittest;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
+import com.google.common.collect.TreeMultimap;
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.util.ICUUncheckedIOException;
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+import org.unicode.cldr.util.CLDRPaths;
+
+public class TestURLs extends TestFmwk {
+
+ private static final boolean DISABLE_BROKEN = true; // test for broken URL not working yet
+
+ private static final Joiner JOIN_TAB = Joiner.on('\t');
+ private static final Joiner JOIN_SP = Joiner.on(' ');
+ private static final Joiner JOIN_LF = Joiner.on('\n');
+
+ enum SiteType {
+ SITE,
+ SPEC,
+ CHART,
+ GITHUB
+ }
+
+ public static void main(String[] args) {
+ new TestURLs().run(args);
+ }
+
+ public void testDetection() {
+ String[][] tests = {
+ {"## Parts", "#Parts"},
+ {"## Parts", "#Parts"},
+ {
+ "* Part 1: [Core](tr35.md#Contents) (languages, locales, basic structure)",
+ "tr35.md#Contents"
+ },
+ };
+ int lineCount = 0;
+ for (String[] test : tests) {
+ String line = test[0];
+ String expected = test[1];
+ Multimap lineToUrls = TreeMultimap.create();
+ getUrls(lineCount++ + line, 0, lineToUrls);
+ Collection result = lineToUrls.asMap().get(0);
+ assertEquals(line, expected, result == null ? null : Joiner.on('◎').join(result));
+ }
+ }
+
+ static final Path BASE = Path.of(CLDRPaths.BASE_DIRECTORY);
+
+ public void testSiteFiles() {
+ checkSiteFiles(SiteType.SITE, Path.of(CLDRPaths.BASE_DIRECTORY, "docs/site"));
+ }
+
+ public void testSpecFiles() {
+ checkSiteFiles(SiteType.SPEC, Path.of(CLDRPaths.BASE_DIRECTORY, "docs/ldml"));
+ }
+
+ private void checkSiteFiles(SiteType siteType, Path directoryToScan) {
+ Path directoryPath = directoryToScan;
+ Set results = new LinkedHashSet<>();
+ try (Stream filepath = Files.walk(directoryPath)) {
+ filepath.filter(Files::isRegularFile)
+ .filter(p -> p.getFileName().toString().endsWith(".md"))
+ .forEach(x -> checkFile(siteType, x, results));
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
+ System.out.println();
+ assertResults(results);
+ System.out.println("\nDomains\n" + JOIN_LF.join(domains));
+ }
+
+ public void testFile() {
+ Path p = Path.of(CLDRPaths.BASE_DIRECTORY, "docs/site/development/adding-locales.md");
+ Set results = new LinkedHashSet<>();
+ checkFile(SiteType.SITE, p, results);
+ assertResults(results);
+ }
+
+ public void assertResults(Set results) {
+ results.stream()
+ .forEach(
+ x -> {
+ for (Issue issue : Issue.values()) {
+ Collection errorLines = x.getProblems(issue);
+ if (errorLines.isEmpty()) {
+ continue;
+ }
+ if (Issue.ERRORS.contains(issue)) {
+ if (!assertEquals("File errors", 0, errorLines.size())) {
+ errorLines.stream().forEach(System.out::println);
+ }
+ } else {
+ errorLines.stream().forEach(System.out::println);
+ }
+ }
+ });
+ }
+
+ public void checkFile(SiteType siteType, Path p, Set results) {
+ // logln(p.toString());
+ LineChecker lineChecker = new LineChecker(siteType, p);
+ results.add(lineChecker);
+ try {
+ Files.lines(p)
+ .forEach(
+ line -> {
+ lineChecker.checkLine(line);
+ });
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
+ }
+
+ public void testLineChecker() {
+ String[][] tests = {
+ {"SITE", "../index/downloads.md#cldr-releasesdownloads", "BAD_LOCAL HAS_MD"},
+ {"SITE", "../index/downloads#cldr-releasesdownloads", "BAD_LOCAL"},
+ };
+ for (String[] test : tests) {
+ SiteType siteType = SiteType.valueOf(test[0]);
+ String line = test[1];
+ String expected = test[2];
+ LineChecker lineChecker = new LineChecker(siteType, Path.of("../foo"));
+ Set actual = lineChecker.getURLIssues(0, line);
+ assertEquals(JOIN_TAB.join(siteType, line), expected, JOIN_SP.join(actual));
+ }
+ }
+
+ public void testIssueExamples() {
+ for (Issue issue : Issue.values()) {
+ if (DISABLE_BROKEN & issue == Issue.BROKEN) {
+ continue;
+ }
+ final SiteType siteType = SiteType.SITE; // expand later for other types
+ LineChecker lineChecker = new LineChecker(siteType, Path.of("../foo"));
+ Set actual = lineChecker.getURLIssues(0, issue.example);
+ assertEquals(
+ JOIN_TAB.join(siteType, issue.example),
+ issue.toString(),
+ JOIN_SP.join(Sets.intersection(Set.of(issue), actual)).toString());
+ }
+ }
+
+ enum Issue {
+ ΩK("", "https://unicode-org.atlassian.net/browse/CLDR-14927"),
+ OLD_ANCHOR("Old anchor, needs replacement", "downloads/cldr-44#h.nvqx283jwsx"),
+ BAD_LOCAL("Local can't refer outside of docs/site", "../ldml/tr35.md"),
+ MD_FORBID(
+ "Links within Site must not end with .md; but those within Spec must",
+ "../index/downloads.md#cldr-releasesdownloads"),
+ MD_REQUIRE(
+ "Links within Site must not end with .md; but those within Spec must",
+ "../index/downloads.md#cldr-releasesdownloads"),
+ BAD_GITHUB(
+ "Don't link to github for site or spec",
+ "https://unicode-org.github.io/cldr-staging/charts/42/delta/bcp47.html"),
+ BAD_GITHUB_DOCS(
+ "Don't link into github for docs",
+ "https://github.com/unicode-org/cldr/blob/main/docs/requesting_changes.md"),
+ MAKE_RELATIVE("Change to relative", "https://cldr.unicode.org/index/cldr-spec"),
+ OLD_SITE(
+ "Don't link to old site",
+ "https://sites.google.com/unicode.org/cldr/index/downloads/cldr-42#h.xtb1v8tpviuc"),
+ OLDER_SITE(
+ "Don't link to ancient site",
+ "http://www.unicode.org/repos/cldr/trunk/common/bcp47/number.xml"),
+ SEARCH(
+ "Don't vector through google search",
+ "http://www.google.com/search?q=Congo+site%3Alemonde.fr"),
+ SMOKE(
+ "Don't link to smoketest",
+ "https://cldr-smoke.unicode.org/smoketest/v#/fr/Symbols2/47925556fd2904b5"),
+ MALFORMED_URL("MalformedURLException", "http://example.com:-80/"),
+ BROKEN("Can't access URL", "https://qreioqhfiorufpehbquhe.com");
+
+ static final Set ERRORS =
+ Sets.difference(
+ ImmutableSet.copyOf(Issue.values()), Set.of(Issue.ΩK, Issue.OLD_ANCHOR));
+ final String message;
+ final String example;
+
+ // private Issue(String message) {
+ // this(message, "…");
+ // }
+ private Issue(String message, String example) {
+ this.message = message;
+ this.example = example;
+ }
+ }
+
+ static final Set domains = new TreeSet<>();
+
+ static final class LineChecker {
+ int count = 0;
+ private final Multimap lineToUrls = TreeMultimap.create();
+ private final SiteType siteType;
+ private final String disallowedLocal;
+ private final Path path;
+
+ public LineChecker(SiteType siteType, Path path) {
+ this.siteType = siteType;
+ disallowedLocal = siteType == SiteType.SPEC ? "/cldr/docs/ldml/" : "/cldr/docs/site/";
+ this.path = path;
+ }
+
+ private void checkLine(String line) {
+ getUrls(line, ++count, lineToUrls);
+ }
+
+ public List getProblems(Issue issue) {
+ List errorLines = new ArrayList<>();
+ lineToUrls.forEach(
+ (x, y) -> {
+ Set issues = getURLIssues(x, y);
+ issues.stream()
+ .filter(q -> q == issue)
+ .forEach(
+ r -> {
+ Path pathRelative = BASE.relativize(path);
+ errorLines.add(
+ JOIN_TAB.join(issue, pathRelative, x, y));
+ });
+ });
+ return errorLines;
+ }
+
+ public Set getURLIssues(Integer line, String urlString) {
+ Set issues = new LinkedHashSet<>();
+ try {
+ URL url;
+ boolean isLocal;
+ if (PROTOCOL_PAT.matcher(urlString).lookingAt()) {
+ url = new URL(urlString);
+ isLocal = false;
+ } else {
+ url = new URL("file:" + Path.of(path.toString(), urlString).normalize());
+ isLocal = true;
+ }
+ String file = url.getFile();
+ String domain = url.getAuthority();
+ if (domain != null) {
+ domains.add(domain);
+ }
+
+ checkSite(urlString, url, file, domain, isLocal, issues);
+
+ if (issues.isEmpty() && !isLocal & !doesURLExist(url)) { // not working yet
+ issues.add(Issue.BROKEN);
+ }
+ } catch (MalformedURLException e) {
+ issues.add(Issue.MALFORMED_URL);
+ }
+ if (issues.isEmpty()) {
+ issues.add(Issue.ΩK);
+ }
+ return issues;
+ }
+
+ public void checkSite(
+ String urlString,
+ URL url,
+ String file,
+ String domain,
+ boolean isLocal,
+ Set issues) {
+ if (isLocal) {
+ if (!url.toString().contains(disallowedLocal)) {
+ issues.add(Issue.BAD_LOCAL);
+ }
+ final boolean endsWithMd = file.endsWith(".md") && urlString.contains(".md");
+ // second check is because relativizing 'fills in' a file with .md
+
+ if (siteType == SiteType.SPEC
+ && !endsWithMd
+ && !urlString.startsWith("#")
+ && !urlString.endsWith(".png")
+ && !urlString.endsWith(".abnf")) {
+ issues.add(Issue.MD_REQUIRE);
+ }
+ if (siteType == SiteType.SITE && endsWithMd) {
+ issues.add(Issue.MD_FORBID);
+ // relative links cannot end with .md
+ }
+ }
+ if (url.getRef() != null //
+ && url.getRef().startsWith("h.")) {
+ issues.add(Issue.OLD_ANCHOR);
+ }
+ if (file.contains("github.com/") && file.contains("/docs/")) {
+ issues.add(Issue.BAD_GITHUB);
+ }
+ checkDomain(issues, file, domain);
+ }
+
+ public void checkDomain(Set error, String file, String domain) {
+ if (domain != null) {
+ switch (domain) {
+ case "cldr-smoke.unicode.org":
+ error.add(Issue.SMOKE);
+ break;
+ case "sites.google.com":
+ error.add(Issue.OLD_SITE);
+ break;
+ case "cldr.unicode.org":
+ if (siteType == SiteType.SITE) {
+ error.add(Issue.MAKE_RELATIVE);
+ }
+ case "unicode.org":
+ if (file.startsWith("/cldr/data/")
+ || file.startsWith("unicode.org/cldr/repository_access")
+ || file.startsWith("/cldr/trac/")) {
+ error.add(Issue.OLDER_SITE);
+ }
+ break;
+ case "www.unicode.org":
+ if (file.startsWith("/reports/tr35/")) {
+ if (siteType == SiteType.SPEC) {
+ error.add(Issue.MAKE_RELATIVE);
+ }
+ } else if (file.startsWith("/repos")) {
+ error.add(Issue.OLDER_SITE);
+ }
+ break;
+ case "github.com":
+ if (file.contains("/docs/")) {
+ error.add(Issue.BAD_GITHUB_DOCS);
+ }
+ break;
+ case "unicode-org.github.io":
+ if (file.startsWith("/cldr-staging/")) {
+ error.add(Issue.BAD_GITHUB);
+ }
+ break;
+ case "www.google.com":
+ if (file.startsWith("/url?") || file.startsWith("/search?")) {
+ error.add(Issue.SEARCH);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ public static Pattern PROTOCOL_PAT = Pattern.compile("[a-z0-9+.\\-]+:"); //
+
+ public static Pattern OLD_ANCHOR = Pattern.compile(".*#h\\..*"); //
+
+ public static Pattern URL_PAT =
+ Pattern.compile(
+ "(?:\\bhref=(?:" //
+ + "[\"]([^\"]*)" //
+ + "|[']([^']*)))"
+ + "|\\]\\(([^)]*)"); //
+
+ public static void getUrls(String line, int lineNumber, Multimap lineToUrls) {
+ Matcher m = URL_PAT.matcher(line);
+ int count = 0;
+ while (m.find()) {
+ for (int i = 1; i <= m.groupCount(); ++i) {
+ String group = m.group(i);
+ if (group != null) {
+ lineToUrls.put(lineNumber, group);
+ ++count;
+ break;
+ }
+ }
+ }
+ // For debugging
+ // if (count == 0) {
+ // int start = line.indexOf("href");
+ // System.out.println(RegexUtilities.showMismatch(url, line.subSequence(start,
+ // line.length())));
+ // int d=0;
+ // }
+ }
+
+ public static boolean doesURLExist(URL url) {
+ if (DISABLE_BROKEN) return true; // the following doesn't work.
+ try {
+ // We want to check the current URL
+ HttpURLConnection.setFollowRedirects(false);
+
+ HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
+
+ // We don't need to get data
+ httpURLConnection.setRequestMethod("HEAD");
+
+ // Some websites don't like programmatic access so pretend to be a browser
+ httpURLConnection.setRequestProperty(
+ "User-Agent",
+ "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)");
+ int responseCode = httpURLConnection.getResponseCode();
+
+ // We only accept response code 200
+ return responseCode == HttpURLConnection.HTTP_OK;
+ } catch (Exception e) {
+ return false;
+ }
+ }
+}