Merge pull request #841 from wwarriner/fix-url
Fix missing urls and update checker
Premas authored Nov 19, 2024
2 parents 249bf1b + 3ad634f commit 77efcc3
Showing 5 changed files with 109 additions and 14 deletions.
4 changes: 2 additions & 2 deletions .linkcheckerrc
@@ -85,7 +85,7 @@ fileoutput=csv

# CSV logger
[csv]
-filename=out/linkchecker-out.csv
+filename=out/linkchecker-raw.csv
separator=,
#quotechar="
#dialect=excel
@@ -190,7 +190,7 @@ recursionlevel=1

##################### filtering options ##########################
[filtering]
-#ignore=
+ignore=rc.uab.edu
# ignore everything with 'lconline' in the URL name
# lconline
# and ignore everything with 'bookmark' in the URL name
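The `ignore=` entries in `.linkcheckerrc` are treated by linkchecker as regular expressions matched against candidate URLs. A minimal sketch of that matching in Python (the URL list is hypothetical; the unescaped dots in `rc.uab.edu` match any character, which is harmless here):

```python
import re

# linkchecker matches each ignore= entry against URLs as a regular
# expression; this mimics that check on made-up example URLs.
ignore_pattern = re.compile(r"rc.uab.edu")

urls = [
    "https://rc.uab.edu/pun/sys/dashboard",
    "https://docs.rc.uab.edu/data_management/",
    "https://github.com/uabrc/uabrc.github.io",
]

# keep only URLs the ignore pattern does not match anywhere
checked = [u for u in urls if not ignore_pattern.search(u)]
print(checked)
```

With this pattern, anything under the `rc.uab.edu` domain (including subdomains such as `docs.rc.uab.edu`) is skipped, which is why the site's own login-gated pages stop producing false positives.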
2 changes: 1 addition & 1 deletion docs/cheaha/open_ondemand/hpc_desktop.md
@@ -14,7 +14,7 @@ Once you click the tab, you'll see the control panel appear. The second option f

![!VNC clipboard](images/ood_desktop_copy_paste.png)

-To copy from the VNC to your personal machine, highlight the text you want to copy in the VNC session, and that text will appear in the clipboard. Select the text in the clipboard, copy it, and then paste it on your local machine. Images cannot be copy-pasted through this clipboard. Instead, images should be saved as a file and then transferred through tools such as [Globus](../../data_management/transfer/globus.md), [rclone](../../data_management/transfer/rclone.md), or an [scp utility](https://kb.iu.edu/d/agye).
+To copy from the VNC to your personal machine, highlight the text you want to copy in the VNC session, and that text will appear in the clipboard. Select the text in the clipboard, copy it, and then paste it on your local machine. Images cannot be copy-pasted through this clipboard. Instead, images should be saved as a file and then transferred through tools such as [Globus](../../data_management/transfer/globus.md), [rclone](../../data_management/transfer/rclone.md), or an [scp utility](https://servicenow.iu.edu/kb?id=kb_article_view&sysparm_article=KB0024361).

## Visual Studio Code Remote Tunnel

2 changes: 1 addition & 1 deletion docs/workflow_solutions/git_collaboration.md
@@ -117,7 +117,7 @@ Effective use of issue tracking can greatly reduce cognitive load and simplify c

The typical issue lifecycle, at a high level, is something like below.

-1. Create an issue. [GitHub](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue) [GitLab](https://docs.gitlab.com/ee/user/project/issues/create_issues.html)
+1. Create an issue. [GitHub](https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/creating-an-issue) [GitLab](https://docs.gitlab.com/ee/user/project/issues/create_issues.html)
1. Ask for clarifications and discuss as needed.
1. Use the [Fork-Pull/Merge Request Workflow](#the-fork-pullmerge-request-workflow) to resolve the issue. In the Pull Request description, put the text `Fixes #...` where `...` should be replaced by the issue's number. When the request is merged, the issue will automatically be linked to the request and closed.

2 changes: 1 addition & 1 deletion docs/workflow_solutions/using_anaconda.md
@@ -24,7 +24,7 @@ Anaconda is a package manager, meaning it handles all of the difficult mathemati

Anaconda is structured around environments. Environments are self-contained collections of researcher-selected packages. Environments can be changed out using a simple package without requiring tedious installing and uninstalling of packages or software, and avoiding dependency conflicts with each other. Environments allow researchers to work and collaborate on multiple projects, each with different requirements, all on the same computer. Environments can be installed from the command line, from pre-designed or shared YAML files, and can be modified or updated as needed.

-The following subsections detail some of the more common commands and use cases for Anaconda usage. More complete information on this process can be found at the [Anaconda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#). Need some hands-on experience, you can find instructions on how to install PyTorch and TensorFlow using Anaconda in this [tutorial](../cheaha/tutorial/pytorch_tensorflow.md).
+The following subsections detail some of the more common commands and use cases for Anaconda usage. More complete information on this process can be found at the [Anaconda documentation](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). Need some hands-on experience, you can find instructions on how to install PyTorch and TensorFlow using Anaconda in this [tutorial](../cheaha/tutorial/pytorch_tensorflow.md).

<!-- markdownlint-disable MD046 -->
!!! important
113 changes: 104 additions & 9 deletions scripts/linkchecker.py
@@ -1,5 +1,6 @@
import subprocess
from pathlib import Path, PurePath
+from typing import Optional

import pandas as pd

@@ -8,21 +9,115 @@
OUTPUT = PurePath("out")
Path(OUTPUT).mkdir(exist_ok=True)

+# FILE PATHS
LINKCHECKER_LOG = OUTPUT / "linkchecker.log"
-LINKCHECKER_CSV = OUTPUT / "linkchecker-out.csv"
+LINKCHECKER_RAW_CSV = OUTPUT / "linkchecker-raw.csv"
+LINKCHECKER_OUT_CSV = OUTPUT / "linkchecker-out.csv"

+# COLUMNS
+## ORIGINAL
+RESULT = "result"
+URLNAME = "urlname"
+URL = "url"
+PARENTNAME = "parentname"
+LINE = "line"
+COLUMN = "column"
+## RENAMED
+URL_IN_MARKDOWN = "document-url"
+URL_AFTER_REDIRECTION = "url-after-redirection"
+MARKDOWN_FILE = "document"

-if __name__ == "__main__":
+def run_linkchecker() -> None:
    with open(LINKCHECKER_LOG, "wb", buffering=0) as f:
        subprocess.run("linkchecker --config=.linkcheckerrc docs", stdout=f)
-    df = pd.read_csv(LINKCHECKER_CSV)
-    df = df[["result", "urlname", "parentname", "line", "column", "url"]]

-    # drop good urls
-    same_url = df["urlname"] == df["url"]
-    result_ok = df["result"].str.startswith("200")

+def load_output() -> pd.DataFrame:
+    df = pd.read_csv(LINKCHECKER_RAW_CSV)
+    df = df[[RESULT, URLNAME, URL, PARENTNAME, LINE, COLUMN]]
+    df = df.rename(
+        columns={
+            URLNAME: URL_IN_MARKDOWN,
+            URL: URL_AFTER_REDIRECTION,
+            PARENTNAME: MARKDOWN_FILE,
+        },
+    )
+    return df


+def replace_lines_containing(
+    _s: pd.Series, _containing: str, _with: str, /, find_in: Optional[pd.Series] = None
+) -> pd.Series:
+    if find_in is None:
+        find_in = _s
+
+    contains = find_rows_containing(find_in, _containing)
+    out = _s.copy()
+    out[contains] = _with
+    return out


+def find_rows_containing(_s: pd.Series, _containing: str) -> pd.Series:
+    return _s.str.contains(_containing)


+def ignore_ok_with_no_redirects(_df: pd.DataFrame) -> pd.DataFrame:
+    same_url = _df[URL_IN_MARKDOWN] == _df[URL_AFTER_REDIRECTION]
+    result_ok = _df[RESULT].str.startswith("200")
    drop = same_url & result_ok
-    df = df[~drop]
+    out = _df[~drop]
+    return out


+def ignore_rows_containing(
+    _df: pd.DataFrame,
+    _in: str,
+    _containing: str,
+    /,
+    if_result_code: Optional[str] = None,
+) -> pd.DataFrame:
+    find_in = _df[_in]
+    contains = find_rows_containing(find_in, _containing)
+
+    if if_result_code is not None:
+        contains &= _df[RESULT].str.contains(if_result_code)
+
+    out = _df[~contains]
+    return out


+if __name__ == "__main__":
+    run_linkchecker()
+    df = load_output()

+    # drop good urls
+    df = ignore_ok_with_no_redirects(df)

+    # replace long error messages with short codes
+    df[RESULT] = replace_lines_containing(df[RESULT], "ConnectTimeout", "408 Timeout")

+    # special code for SSO urls
+    df[RESULT] = replace_lines_containing(
+        df[RESULT],
+        "https://padlock.idm.uab.edu",
+        "423 Locked",
+        find_in=df[URL_AFTER_REDIRECTION],
+    )

+    # special ignore rules
+    df = ignore_rows_containing(
+        df, URL_IN_MARKDOWN, "https://doi.org", if_result_code="200"
+    )
+    df = ignore_rows_containing(
+        df, URL_IN_MARKDOWN, "https://anaconda.org", if_result_code="403"
+    )
+    df = ignore_rows_containing(
+        df, URL_AFTER_REDIRECTION, "https://padlock.idm.uab.edu", if_result_code="423"
+    )

+    # organize
+    df = df.sort_values(by=[RESULT, URL_IN_MARKDOWN, MARKDOWN_FILE, LINE, COLUMN])

+    # output
-    df.to_csv(LINKCHECKER_CSV, index=False)
+    df.to_csv(LINKCHECKER_OUT_CSV, index=False)
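The filtering steps in the rewritten script can be sketched on a toy DataFrame. This is a simplified, self-contained re-statement of the logic (column names copied from the script; the rows and URLs are hypothetical), not the script itself:

```python
import pandas as pd

RESULT = "result"
URL_IN_MARKDOWN = "document-url"
URL_AFTER_REDIRECTION = "url-after-redirection"

# hypothetical linkchecker rows: one clean hit, one redirect, one timeout
df = pd.DataFrame(
    {
        RESULT: ["200 OK", "301 Moved Permanently", "ConnectTimeout: pool timed out"],
        URL_IN_MARKDOWN: ["https://a.example", "https://b.example", "https://c.example"],
        URL_AFTER_REDIRECTION: [
            "https://a.example",      # OK and not redirected: drop
            "https://b.example/new",  # redirected: keep for review
            "https://c.example",      # timed out: keep, shorten message
        ],
    }
)

# drop rows that returned 200 and were not redirected
same_url = df[URL_IN_MARKDOWN] == df[URL_AFTER_REDIRECTION]
result_ok = df[RESULT].str.startswith("200")
df = df[~(same_url & result_ok)]

# replace long error messages with a short code, as the script does
df.loc[df[RESULT].str.contains("ConnectTimeout"), RESULT] = "408 Timeout"
print(df[RESULT].tolist())
```

Only rows needing human attention survive: successful-but-redirected links (so the source URL can be updated) and genuine failures, with verbose exception text collapsed to a short status code for easier sorting.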
