Merge branch 'ad-freiburg:main' into test_start_with_mocking
SimonL22 authored Dec 24, 2024
2 parents 25b67b6 + ab46d69 commit 7742fe2
Showing 33 changed files with 1,983 additions and 477 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/pytest.yml
@@ -0,0 +1,29 @@
name: Unit Tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
unit_tests:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{matrix.python-version}}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install .
pip install pytest pytest-cov
- name: Test with pytest
run: |
pytest -v
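
The new workflow installs the package together with pytest and pytest-cov and then runs `pytest -v`; with `pythonpath = ["src"]` configured in pyproject.toml (see the pyproject.toml change below), pytest can import the package straight from the source tree. A minimal, purely illustrative test module that such a run would collect (the file name and assertion are not part of this commit):

    # tests/test_example.py -- illustrative only, not part of this commit.
    # pytest collects any file named test_*.py; the assertion below is a
    # placeholder for real unit tests of the qlever CLI.
    import re

    def test_version_string_format():
        version = "0.5.14"  # version string declared in pyproject.toml below
        assert re.fullmatch(r"\d+\.\d+\.\d+", version)
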
1 change: 1 addition & 0 deletions .github/workflows/qleverfiles-check.yml
@@ -33,6 +33,7 @@ jobs:
- name: Check that all the files in `src/qlever/Qleverfiles` parse.
working-directory: ${{github.workspace}}/qlever-control
run: |
export QLEVER_ARGCOMPLETE_ENABLED=1
for QLEVERFILE in src/qlever/Qleverfiles/Qleverfile.*; do
echo
echo -e "\x1b[1;34mChecking ${QLEVERFILE}\x1b[0m"
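
The only change here exports QLEVER_ARGCOMPLETE_ENABLED=1 before the Qleverfiles are parsed, presumably so that the argcomplete-related code path is exercised during the check. A hedged sketch of how such an opt-in environment flag is commonly read (not the actual qlever implementation):

    # Hypothetical helper; the real qlever code may read the variable differently.
    import os

    def argcomplete_enabled() -> bool:
        # Treat "1", "true" and "yes" (case-insensitive) as enabled.
        value = os.environ.get("QLEVER_ARGCOMPLETE_ENABLED", "")
        return value.lower() in ("1", "true", "yes")
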
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "qlever"
description = "Script for using the QLever SPARQL engine."
version = "0.5.8"
version = "0.5.14"
authors = [
{ name = "Hannah Bast", email = "[email protected]" }
]
@@ -35,3 +35,8 @@ package-data = { "qlever" = ["Qleverfiles/*"] }

[tool.pytest.ini_options]
pythonpath = ["src"]

[tool.ruff]
line-length = 79
[tool.ruff.lint]
extend-select = ["I"]
2 changes: 1 addition & 1 deletion src/qlever/Qleverfiles/Qleverfile.dblp
@@ -17,7 +17,7 @@ FORMAT = ttl

[index]
INPUT_FILES = *.gz
MULTI_INPUT_JSON = $$(ls *.gz | awk 'BEGIN { printf "[ " } NR > 1 { printf ", " } { printf "{\"cmd\": \"zcat " $$0 "\"}" } END { printf "]" }')
MULTI_INPUT_JSON = { "cmd": "zcat {}", "for-each": "*.gz" }
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 5000000, "prefixes-external": [""] }

[server]
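
Instead of assembling the multi-input JSON with an awk one-liner over `ls *.gz`, the Qleverfile now uses a single entry with a "for-each" glob that is expanded into one input command per matching file. A rough Python sketch of such an expansion (for illustration only; not the actual qlever implementation):

    # Hypothetical expansion of a {"cmd": ..., "for-each": ...} spec into the
    # explicit per-file list that the old awk pipeline used to build.
    import glob

    def expand_for_each(spec: dict) -> list[dict]:
        if "for-each" not in spec:
            return [spec]
        template = {key: value for key, value in spec.items() if key != "for-each"}
        return [{**template, "cmd": template["cmd"].replace("{}", path)}
                for path in sorted(glob.glob(spec["for-each"]))]

    # expand_for_each({"cmd": "zcat {}", "for-each": "*.gz"})
    # -> [{"cmd": "zcat a.gz"}, {"cmd": "zcat b.gz"}, ...]
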
30 changes: 30 additions & 0 deletions src/qlever/Qleverfiles/Qleverfile.orkg
@@ -0,0 +1,30 @@
# Qleverfile for ORKG, use with the QLever CLI (`pip install qlever`)
#
# qlever get-data # Get the dataset
# qlever index # Build index data structures
# qlever start # Start the server

[data]
NAME = orkg
GET_DATA_URL = https://orkg.org/api/rdf/dump
GET_DATA_CMD = curl -LR -o ${NAME}.ttl ${GET_DATA_URL} 2>&1 | tee ${NAME}.download-log.txt
VERSION = $$(date -r ${NAME}.ttl +%d.%m.%Y || echo "NO_DATE")
DESCRIPTION = The Open Research Knowledge Graph (ORKG) (data from ${GET_DATA_URL}, version ${VERSION})

[index]
INPUT_FILES = ${data:NAME}.ttl
CAT_INPUT_FILES = cat ${INPUT_FILES}
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }

[server]
PORT = 7053
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 10G
CACHE_MAX_SIZE = 5G

[runtime]
SYSTEM = docker
IMAGE = docker.io/adfreiburg/qlever:latest

[ui]
UI_CONFIG = orkg
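
The VERSION entry formats the modification date of the downloaded file and falls back to "NO_DATE" when the file does not exist yet. For reference, an equivalent of that fallback in Python (illustrative only):

    # Python equivalent of: $(date -r orkg.ttl +%d.%m.%Y || echo "NO_DATE")
    from datetime import datetime
    from pathlib import Path

    def dataset_version(path: str) -> str:
        try:
            mtime = Path(path).stat().st_mtime
        except FileNotFoundError:
            return "NO_DATE"
        return datetime.fromtimestamp(mtime).strftime("%d.%m.%Y")
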
128 changes: 102 additions & 26 deletions src/qlever/Qleverfiles/Qleverfile.pubchem

Large diffs are not rendered by default.

64 changes: 48 additions & 16 deletions src/qlever/Qleverfiles/Qleverfile.uniprot
@@ -1,30 +1,62 @@
# Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
#
# qlever get-data # takes ~ 30 hours and ~ 2 TB of disk (for the NT files)
# qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 5900X)
# qlever get-data # takes ~ 30 hours and ~ 1.6 TB of disk (for the TTL files)
# qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 9950X)
# qlever start # starts the server (takes a few seconds)
#
# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
# Install packages: sudo apt install -y libxml2-utils parallel xz-utils wget
# Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
#
# Set DATE to the date of the latest release. Build on SSD (requires ~ 7 TB
# during build, ~ 3 TB after build). The uniprot.index.???.meta files can be on
# HDD without significant performance loss (when running the server).
# during build, ~ 3 TB after build).

[data]
NAME = uniprot
DATE = 2024-05-29
DOWNLOAD_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
GET_RDFXML_CMD = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done
RDFXML2NT_CMD = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | gzip -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
GET_DATA_CMD = rdfxml --help && date > ${NAME}.get-data.begin-date && ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} && date > ${NAME}.get-data.end-date
DESCRIPTION = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE}
NAME = uniprot
DATE = 2024-11-27
RDFXML_DIR = rdf.${DATE}
TTL_DIR = ttl.${DATE}
UNIPROT_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
RHEA_URL = https://ftp.expasy.org/databases/rhea/rdf
EXAMPLES_URL = https://github.com/sib-swiss/sparql-examples
GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && gzip examples_uniprot.ttl && mv -f examples_uniprot.ttl.gz ../${TTL_DIR} && cd .. && rm -rf sparql-examples)
GET_RDFXML_CMD = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done
RDFXML2TTL_CMD = mkdir -p ${TTL_DIR} && for RDFXML in ${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
GET_DATA_CMD = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date
DESCRIPTION = Complete UniProt data from ${UNIPROT_URL}, with additional data from ${RHEA_URL} and ${EXAMPLES_URL}

[index]
INPUT_FILES = nt.${data:DATE}/*.nt.gz
CAT_INPUT_FILES = parallel --tmpdir . -j 4 'zcat -f {}' ::: ${INPUT_FILES} | pv -q -B 5G
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
STXXL_MEMORY = 60G
INPUT_FILES = ${data:TTL_DIR}/*.ttl.gz
MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_reviewed_*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_unreviewed_*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniparc", "for-each": "${data:TTL_DIR}/uniparc_*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniref", "for-each": "${data:TTL_DIR}/uniref*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/obsolete", "for-each": "${data:TTL_DIR}/uniprotkb_obsolete_*.ttl.gz" },
{ "cmd": "zcat ${data:TTL_DIR}/chebi.ttl.gz", "graph": "http://sparql.uniprot.org/chebi" },
{ "cmd": "zcat ${data:TTL_DIR}/citation_mapping.ttl.gz", "graph": "http://sparql.uniprot.org/citationmapping" },
{ "cmd": "zcat ${data:TTL_DIR}/citations.ttl.gz", "graph": "http://sparql.uniprot.org/citations" },
{ "cmd": "zcat ${data:TTL_DIR}/databases.ttl.gz", "graph": "http://sparql.uniprot.org/databases" },
{ "cmd": "zcat ${data:TTL_DIR}/diseases.ttl.gz", "graph": "http://sparql.uniprot.org/diseases" },
{ "cmd": "zcat ${data:TTL_DIR}/enzyme-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
{ "cmd": "zcat ${data:TTL_DIR}/enzyme.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
{ "cmd": "zcat ${data:TTL_DIR}/go-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
{ "cmd": "zcat ${data:TTL_DIR}/go.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
{ "cmd": "zcat ${data:TTL_DIR}/journals.ttl.gz", "graph": "http://sparql.uniprot.org/journal" },
{ "cmd": "zcat ${data:TTL_DIR}/keywords-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
{ "cmd": "zcat ${data:TTL_DIR}/keywords.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
{ "cmd": "zcat ${data:TTL_DIR}/locations-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
{ "cmd": "zcat ${data:TTL_DIR}/locations.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
{ "cmd": "zcat ${data:TTL_DIR}/pathways-hierarchy*.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
{ "cmd": "zcat ${data:TTL_DIR}/pathways.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
{ "cmd": "zcat ${data:TTL_DIR}/proteomes.ttl.gz", "graph": "http://sparql.uniprot.org/proteomes" },
{ "cmd": "zcat ${data:TTL_DIR}/taxonomy-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
{ "cmd": "zcat ${data:TTL_DIR}/taxonomy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
{ "cmd": "zcat ${data:TTL_DIR}/tissues.ttl.gz", "graph": "http://sparql.uniprot.org/tissues" },
{ "cmd": "zcat ${data:TTL_DIR}/rhea.ttl.gz", "graph": "https://sparql.rhea-db.org/rhea" },
{ "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" },
{ "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" },
{ "cmd": "zcat ${data:TTL_DIR}/void.ttl.gz", "graph": "http://rdfs.org/ns/void" }]
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
STXXL_MEMORY = 60G

[server]
PORT = 7018
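
The rewritten index section loads each UniProt input into its own named graph. Since the list is long and maintained by hand, a useful sanity check is that every file matched by INPUT_FILES is covered by at least one MULTI_INPUT_JSON entry; a hedged sketch of such a check (not part of qlever):

    # Hypothetical coverage check for the MULTI_INPUT_JSON list above; "specs"
    # would be the parsed JSON array, "input_glob" the INPUT_FILES pattern.
    import glob
    import re

    def covered_files(specs: list[dict]) -> set[str]:
        covered = set()
        for spec in specs:
            if "for-each" in spec:
                covered.update(glob.glob(spec["for-each"]))
            else:
                # Explicit commands name their file directly, e.g. "zcat ttl.2024-11-27/chebi.ttl.gz".
                covered.update(re.findall(r"\S+\.ttl\.gz", spec["cmd"]))
        return covered

    def uncovered(input_glob: str, specs: list[dict]) -> set[str]:
        return set(glob.glob(input_glob)) - covered_files(specs)
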
17 changes: 7 additions & 10 deletions src/qlever/Qleverfiles/Qleverfile.wikidata
@@ -13,29 +13,26 @@ NAME = wikidata

[data]
GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
GET_DATA_CMD = curl -LROC - ${GET_DATA_URL}/latest-all.ttl.bz2 ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
GET_DATA_CMD = curl -LRC - -O ${GET_DATA_URL}/latest-all.ttl.bz2 -O ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
DATE_WIKIDATA = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE")
DATE_WIKIPEDIA = $$(date -r wikipedia-abstracts.nt +%d.%m.%Y || echo "NO_DATE")
DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA}) + English Wikipedia abstracts (version ${DATE_WIKIPEDIA}, available via schema:description)
TEXT_DESCRIPTION = All English and German literals + all sentences from the English Wikipedia (version ${DATE_WIKIPEDIA}), use with FILTER KEYWORDS(...)
DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA})

[index]
INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 wikipedia-abstracts.nt dcatap.nt
INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 dcatap.nt
MULTI_INPUT_JSON = [{ "cmd": "lbzcat -n 4 latest-all.ttl.bz2", "format": "ttl", "parallel": "true" },
{ "cmd": "lbzcat -n 1 latest-lexemes.ttl.bz2", "format": "ttl", "parallel": "false" },
{ "cmd": "cat wikipedia-abstracts.nt", "format": "nt", "parallel": "false" },
{ "cmd": "cat dcatap.nt", "format": "nt", "parallel": "false" }]
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 5000000 }
STXXL_MEMORY = 10G
TEXT_INDEX = from_text_records

[server]
PORT = 7001
ACCESS_TOKEN = ${data:NAME}_3fz47hfzrbf64b
MEMORY_FOR_QUERIES = 40G
CACHE_MAX_SIZE = 30G
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 20G
CACHE_MAX_SIZE = 15G
CACHE_MAX_SIZE_SINGLE_ENTRY = 5G
TIMEOUT = 300s
TIMEOUT = 600s

[runtime]
SYSTEM = docker
3 changes: 2 additions & 1 deletion src/qlever/commands/add_text_index.py
@@ -64,7 +64,7 @@ def execute(self, args) -> bool:
# Show the command line.
self.show(add_text_index_cmd, only_show=args.show)
if args.show:
return False
return True

# When running natively, check if the binary exists and works.
if args.system == "native":
@@ -74,6 +74,7 @@ def execute(self, args) -> bool:
log.error(f"Running \"{args.index_binary}\" failed ({e}), "
f"set `--index-binary` to a different binary or "
f"use `--container_system`")
return False

# Check if text index files already exist.
existing_text_index_files = get_existing_index_files(
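
This change (and the analogous ones in cache_stats.py and clear_cache.py below) flips the meaning of the --show path: printing the command now counts as success, and only an actual failure (for example, a broken index binary) returns False. A minimal, self-contained sketch of that convention; it is a stand-in, not the real qlever command class:

    # Stand-in command illustrating the return-value convention adopted here.
    from types import SimpleNamespace

    class ShowOnlyCommand:
        def show(self, cmd: str, only_show: bool) -> None:
            print(cmd)

        def execute(self, args) -> bool:
            cmd = "IndexBuilderMain --add-text-index ..."  # placeholder command
            self.show(cmd, only_show=args.show)
            if args.show:
                return True   # previously False: "--show" now counts as success
            # ... real work (and failure handling) would happen here ...
            return True

    assert ShowOnlyCommand().execute(SimpleNamespace(show=True)) is True
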
2 changes: 1 addition & 1 deletion src/qlever/commands/cache_stats.py
@@ -47,7 +47,7 @@ def execute(self, args) -> bool:
self.show("\n".join([cache_stats_cmd, cache_settings_cmd]),
only_show=args.show)
if args.show:
return False
return True

# Execute them.
try:
6 changes: 4 additions & 2 deletions src/qlever/commands/clear_cache.py
@@ -48,7 +48,7 @@ def execute(self, args) -> bool:
f"\"{args.access_token}\"")
self.show(clear_cache_cmd, only_show=args.show)
if args.show:
return False
return True

# Execute the command.
try:
@@ -76,5 +76,7 @@ def execute(self, args) -> bool:
# Show cache stats.
log.info("")
args.detailed = False
CacheStatsCommand().execute(args)
if not CacheStatsCommand().execute(args):
log.error("Clearing the cache was successful, but showing the "
"cache stats failed {e}")
return True
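
clear_cache now checks the return value of the follow-up CacheStatsCommand and logs an error instead of silently ignoring a failure, while still reporting overall success because the cache itself was cleared. A small sketch of that best-effort follow-up pattern (illustrative only, not the real qlever code):

    # Best-effort follow-up: the main action succeeded, so a failing follow-up
    # (showing cache stats) is logged but does not change the overall result.
    import logging

    log = logging.getLogger("sketch")

    def clear_cache_then_show_stats(show_stats) -> bool:
        if not show_stats():
            log.error("Clearing the cache was successful, "
                      "but showing the cache stats failed")
        return True

    assert clear_cache_then_show_stats(lambda: False) is True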