Merge branch 'ad-freiburg:main' into test_start_with_mocking
SimonL22 authored Dec 24, 2024
2 parents 25b67b6 + ab46d69 commit 7742fe2
Showing 33 changed files with 1,983 additions and 477 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/pytest.yml
@@ -0,0 +1,29 @@
name: Unit Tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
unit_tests:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{matrix.python-version}}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install .
pip install pytest pytest-cov
- name: Test with pytest
run: |
pytest -v
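
The new workflow installs the package together with pytest and pytest-cov and then runs `pytest -v`; with `pythonpath = ["src"]` configured in pyproject.toml (see the pyproject.toml change below), pytest can import the package straight from the source tree. A minimal, purely illustrative test module that such a run would collect (the file name and assertion are not part of this commit):

    # tests/test_example.py -- illustrative only, not part of this commit.
    # pytest collects any file named test_*.py; the assertion below is a
    # placeholder for real unit tests of the qlever CLI.
    import re

    def test_version_string_format():
        version = "0.5.14"  # version string declared in pyproject.toml below
        assert re.fullmatch(r"\d+\.\d+\.\d+", version)
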
1 change: 1 addition & 0 deletions .github/workflows/qleverfiles-check.yml
@@ -33,6 +33,7 @@ jobs:
- name: Check that all the files in `src/qlever/Qleverfiles` parse.
working-directory: ${{github.workspace}}/qlever-control
run: |
export QLEVER_ARGCOMPLETE_ENABLED=1
for QLEVERFILE in src/qlever/Qleverfiles/Qleverfile.*; do
echo
echo -e "\x1b[1;34mChecking ${QLEVERFILE}\x1b[0m"
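
The only change here exports QLEVER_ARGCOMPLETE_ENABLED=1 before the Qleverfiles are parsed, presumably so that the argcomplete-related code path is exercised during the check. A hedged sketch of how such an opt-in environment flag is commonly read (not the actual qlever implementation):

    # Hypothetical helper; the real qlever code may read the variable differently.
    import os

    def argcomplete_enabled() -> bool:
        # Treat "1", "true" and "yes" (case-insensitive) as enabled.
        value = os.environ.get("QLEVER_ARGCOMPLETE_ENABLED", "")
        return value.lower() in ("1", "true", "yes")
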
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "qlever"
description = "Script for using the QLever SPARQL engine."
version = "0.5.8"
version = "0.5.14"
authors = [
{ name = "Hannah Bast", email = "[email protected]" }
]
@@ -35,3 +35,8 @@ package-data = { "qlever" = ["Qleverfiles/*"] }

[tool.pytest.ini_options]
pythonpath = ["src"]

[tool.ruff]
line-length = 79
[tool.ruff.lint]
extend-select = ["I"]
2 changes: 1 addition & 1 deletion src/qlever/Qleverfiles/Qleverfile.dblp
@@ -17,7 +17,7 @@ FORMAT = ttl

[index]
INPUT_FILES = *.gz
MULTI_INPUT_JSON = $$(ls *.gz | awk 'BEGIN { printf "[ " } NR > 1 { printf ", " } { printf "{\"cmd\": \"zcat " $$0 "\"}" } END { printf "]" }')
MULTI_INPUT_JSON = { "cmd": "zcat {}", "for-each": "*.gz" }
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 5000000, "prefixes-external": [""] }

[server]
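
Instead of assembling the multi-input JSON with an awk one-liner over `ls *.gz`, the Qleverfile now uses a single entry with a "for-each" glob that is expanded into one input command per matching file. A rough Python sketch of such an expansion (for illustration only; not the actual qlever implementation):

    # Hypothetical expansion of a {"cmd": ..., "for-each": ...} spec into the
    # explicit per-file list that the old awk pipeline used to build.
    import glob

    def expand_for_each(spec: dict) -> list[dict]:
        if "for-each" not in spec:
            return [spec]
        template = {key: value for key, value in spec.items() if key != "for-each"}
        return [{**template, "cmd": template["cmd"].replace("{}", path)}
                for path in sorted(glob.glob(spec["for-each"]))]

    # expand_for_each({"cmd": "zcat {}", "for-each": "*.gz"})
    # -> [{"cmd": "zcat a.gz"}, {"cmd": "zcat b.gz"}, ...]
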
30 changes: 30 additions & 0 deletions src/qlever/Qleverfiles/Qleverfile.orkg
@@ -0,0 +1,30 @@
# Qleverfile for ORKG, use with the QLever CLI (`pip install qlever`)
#
# qlever get-data # Get the dataset
# qlever index # Build index data structures
# qlever start # Start the server

[data]
NAME = orkg
GET_DATA_URL = https://orkg.org/api/rdf/dump
GET_DATA_CMD = curl -LR -o ${NAME}.ttl ${GET_DATA_URL} 2>&1 | tee ${NAME}.download-log.txt
VERSION = $$(date -r ${NAME}.ttl +%d.%m.%Y || echo "NO_DATE")
DESCRIPTION = The Open Research Knowledge Graph (ORKG) (data from ${GET_DATA_URL}, version ${VERSION})

[index]
INPUT_FILES = ${data:NAME}.ttl
CAT_INPUT_FILES = cat ${INPUT_FILES}
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }

[server]
PORT = 7053
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 10G
CACHE_MAX_SIZE = 5G

[runtime]
SYSTEM = docker
IMAGE = docker.io/adfreiburg/qlever:latest

[ui]
UI_CONFIG = orkg
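
The VERSION entry formats the modification date of the downloaded file and falls back to "NO_DATE" when the file does not exist yet. For reference, an equivalent of that fallback in Python (illustrative only):

    # Python equivalent of: $(date -r orkg.ttl +%d.%m.%Y || echo "NO_DATE")
    from datetime import datetime
    from pathlib import Path

    def dataset_version(path: str) -> str:
        try:
            mtime = Path(path).stat().st_mtime
        except FileNotFoundError:
            return "NO_DATE"
        return datetime.fromtimestamp(mtime).strftime("%d.%m.%Y")
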
128 changes: 102 additions & 26 deletions src/qlever/Qleverfiles/Qleverfile.pubchem

Large diffs are not rendered by default.

64 changes: 48 additions & 16 deletions src/qlever/Qleverfiles/Qleverfile.uniprot
@@ -1,30 +1,62 @@
# Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
#
# qlever get-data # takes ~ 30 hours and ~ 2 TB of disk (for the NT files)
# qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 5900X)
# qlever get-data # takes ~ 30 hours and ~ 1.6 TB of disk (for the TTL files)
# qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 9950X)
# qlever start # starts the server (takes a few seconds)
#
# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
# Install packages: sudo apt install -y libxml2-utils parallel xz-utils wget
# Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
#
# Set DATE to the date of the latest release. Build on SSD (requires ~ 7 TB
# during build, ~ 3 TB after build). The uniprot.index.???.meta files can be on
# HDD without significant performance loss (when running the server).
# during build, ~ 3 TB after build).

[data]
NAME = uniprot
DATE = 2024-05-29
DOWNLOAD_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
GET_RDFXML_CMD = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done
RDFXML2NT_CMD = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | gzip -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
GET_DATA_CMD = rdfxml --help && date > ${NAME}.get-data.begin-date && ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} && date > ${NAME}.get-data.end-date
DESCRIPTION = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE}
NAME = uniprot
DATE = 2024-11-27
RDFXML_DIR = rdf.${DATE}
TTL_DIR = ttl.${DATE}
UNIPROT_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
RHEA_URL = https://ftp.expasy.org/databases/rhea/rdf
EXAMPLES_URL = https://github.com/sib-swiss/sparql-examples
GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && gzip examples_uniprot.ttl && mv -f examples_uniprot.ttl.gz ../${TTL_DIR} && cd .. && rm -rf sparql-examples)
GET_RDFXML_CMD = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done
RDFXML2TTL_CMD = mkdir -p ${TTL_DIR} && for RDFXML in ${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
GET_DATA_CMD = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date
DESCRIPTION = Complete UniProt data from ${UNIPROT_URL}, with additional data from ${RHEA_URL} and ${EXAMPLES_URL}

[index]
INPUT_FILES = nt.${data:DATE}/*.nt.gz
CAT_INPUT_FILES = parallel --tmpdir . -j 4 'zcat -f {}' ::: ${INPUT_FILES} | pv -q -B 5G
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
STXXL_MEMORY = 60G
INPUT_FILES = ${data:TTL_DIR}/*.ttl.gz
MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_reviewed_*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_unreviewed_*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniparc", "for-each": "${data:TTL_DIR}/uniparc_*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniref", "for-each": "${data:TTL_DIR}/uniref*.ttl.gz" },
{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/obsolete", "for-each": "${data:TTL_DIR}/uniprotkb_obsolete_*.ttl.gz" },
{ "cmd": "zcat ${data:TTL_DIR}/chebi.ttl.gz", "graph": "http://sparql.uniprot.org/chebi" },
{ "cmd": "zcat ${data:TTL_DIR}/citation_mapping.ttl.gz", "graph": "http://sparql.uniprot.org/citationmapping" },
{ "cmd": "zcat ${data:TTL_DIR}/citations.ttl.gz", "graph": "http://sparql.uniprot.org/citations" },
{ "cmd": "zcat ${data:TTL_DIR}/databases.ttl.gz", "graph": "http://sparql.uniprot.org/databases" },
{ "cmd": "zcat ${data:TTL_DIR}/diseases.ttl.gz", "graph": "http://sparql.uniprot.org/diseases" },
{ "cmd": "zcat ${data:TTL_DIR}/enzyme-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
{ "cmd": "zcat ${data:TTL_DIR}/enzyme.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
{ "cmd": "zcat ${data:TTL_DIR}/go-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
{ "cmd": "zcat ${data:TTL_DIR}/go.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
{ "cmd": "zcat ${data:TTL_DIR}/journals.ttl.gz", "graph": "http://sparql.uniprot.org/journal" },
{ "cmd": "zcat ${data:TTL_DIR}/keywords-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
{ "cmd": "zcat ${data:TTL_DIR}/keywords.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
{ "cmd": "zcat ${data:TTL_DIR}/locations-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
{ "cmd": "zcat ${data:TTL_DIR}/locations.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
{ "cmd": "zcat ${data:TTL_DIR}/pathways-hierarchy*.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
{ "cmd": "zcat ${data:TTL_DIR}/pathways.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
{ "cmd": "zcat ${data:TTL_DIR}/proteomes.ttl.gz", "graph": "http://sparql.uniprot.org/proteomes" },
{ "cmd": "zcat ${data:TTL_DIR}/taxonomy-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
{ "cmd": "zcat ${data:TTL_DIR}/taxonomy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
{ "cmd": "zcat ${data:TTL_DIR}/tissues.ttl.gz", "graph": "http://sparql.uniprot.org/tissues" },
{ "cmd": "zcat ${data:TTL_DIR}/rhea.ttl.gz", "graph": "https://sparql.rhea-db.org/rhea" },
{ "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" },
{ "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" },
{ "cmd": "zcat ${data:TTL_DIR}/void.ttl.gz", "graph": "http://rdfs.org/ns/void" }]
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
STXXL_MEMORY = 60G

[server]
PORT = 7018
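
The rewritten index section loads each UniProt input into its own named graph. Since the list is long and maintained by hand, a useful sanity check is that every file matched by INPUT_FILES is covered by at least one MULTI_INPUT_JSON entry; a hedged sketch of such a check (not part of qlever):

    # Hypothetical coverage check for the MULTI_INPUT_JSON list above; "specs"
    # would be the parsed JSON array, "input_glob" the INPUT_FILES pattern.
    import glob
    import re

    def covered_files(specs: list[dict]) -> set[str]:
        covered = set()
        for spec in specs:
            if "for-each" in spec:
                covered.update(glob.glob(spec["for-each"]))
            else:
                # Explicit commands name their file directly, e.g. "zcat ttl.2024-11-27/chebi.ttl.gz".
                covered.update(re.findall(r"\S+\.ttl\.gz", spec["cmd"]))
        return covered

    def uncovered(input_glob: str, specs: list[dict]) -> set[str]:
        return set(glob.glob(input_glob)) - covered_files(specs)
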
17 changes: 7 additions & 10 deletions src/qlever/Qleverfiles/Qleverfile.wikidata
@@ -13,29 +13,26 @@ NAME = wikidata

[data]
GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
GET_DATA_CMD = curl -LROC - ${GET_DATA_URL}/latest-all.ttl.bz2 ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
GET_DATA_CMD = curl -LRC - -O ${GET_DATA_URL}/latest-all.ttl.bz2 -O ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
DATE_WIKIDATA = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE")
DATE_WIKIPEDIA = $$(date -r wikipedia-abstracts.nt +%d.%m.%Y || echo "NO_DATE")
DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA}) + English Wikipedia abstracts (version ${DATE_WIKIPEDIA}, available via schema:description)
TEXT_DESCRIPTION = All English and German literals + all sentences from the English Wikipedia (version ${DATE_WIKIPEDIA}), use with FILTER KEYWORDS(...)
DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA})

[index]
INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 wikipedia-abstracts.nt dcatap.nt
INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 dcatap.nt
MULTI_INPUT_JSON = [{ "cmd": "lbzcat -n 4 latest-all.ttl.bz2", "format": "ttl", "parallel": "true" },
{ "cmd": "lbzcat -n 1 latest-lexemes.ttl.bz2", "format": "ttl", "parallel": "false" },
{ "cmd": "cat wikipedia-abstracts.nt", "format": "nt", "parallel": "false" },
{ "cmd": "cat dcatap.nt", "format": "nt", "parallel": "false" }]
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 5000000 }
STXXL_MEMORY = 10G
TEXT_INDEX = from_text_records

[server]
PORT = 7001
ACCESS_TOKEN = ${data:NAME}_3fz47hfzrbf64b
MEMORY_FOR_QUERIES = 40G
CACHE_MAX_SIZE = 30G
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 20G
CACHE_MAX_SIZE = 15G
CACHE_MAX_SIZE_SINGLE_ENTRY = 5G
TIMEOUT = 300s
TIMEOUT = 600s

[runtime]
SYSTEM = docker
3 changes: 2 additions & 1 deletion src/qlever/commands/add_text_index.py
@@ -64,7 +64,7 @@ def execute(self, args) -> bool:
# Show the command line.
self.show(add_text_index_cmd, only_show=args.show)
if args.show:
return False
return True

# When running natively, check if the binary exists and works.
if args.system == "native":
@@ -74,6 +74,7 @@ def execute(self, args) -> bool:
log.error(f"Running \"{args.index_binary}\" failed ({e}), "
f"set `--index-binary` to a different binary or "
f"use `--container_system`")
return False

# Check if text index files already exist.
existing_text_index_files = get_existing_index_files(
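
This change (and the analogous ones in cache_stats.py and clear_cache.py below) flips the meaning of the --show path: printing the command now counts as success, and only an actual failure (for example, a broken index binary) returns False. A minimal, self-contained sketch of that convention; it is a stand-in, not the real qlever command class:

    # Stand-in command illustrating the return-value convention adopted here.
    from types import SimpleNamespace

    class ShowOnlyCommand:
        def show(self, cmd: str, only_show: bool) -> None:
            print(cmd)

        def execute(self, args) -> bool:
            cmd = "IndexBuilderMain --add-text-index ..."  # placeholder command
            self.show(cmd, only_show=args.show)
            if args.show:
                return True   # previously False: "--show" now counts as success
            # ... real work (and failure handling) would happen here ...
            return True

    assert ShowOnlyCommand().execute(SimpleNamespace(show=True)) is True
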
2 changes: 1 addition & 1 deletion src/qlever/commands/cache_stats.py
@@ -47,7 +47,7 @@ def execute(self, args) -> bool:
self.show("\n".join([cache_stats_cmd, cache_settings_cmd]),
only_show=args.show)
if args.show:
return False
return True

# Execute them.
try:
6 changes: 4 additions & 2 deletions src/qlever/commands/clear_cache.py
@@ -48,7 +48,7 @@ def execute(self, args) -> bool:
f"\"{args.access_token}\"")
self.show(clear_cache_cmd, only_show=args.show)
if args.show:
return False
return True

# Execute the command.
try:
@@ -76,5 +76,7 @@ def execute(self, args) -> bool:
# Show cache stats.
log.info("")
args.detailed = False
CacheStatsCommand().execute(args)
if not CacheStatsCommand().execute(args):
log.error("Clearing the cache was successful, but showing the "
"cache stats failed {e}")
return True
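
clear_cache now checks the return value of the follow-up CacheStatsCommand and logs an error instead of silently ignoring a failure, while still reporting overall success because the cache itself was cleared. A small sketch of that best-effort follow-up pattern (illustrative only, not the real qlever code):

    # Best-effort follow-up: the main action succeeded, so a failing follow-up
    # (showing cache stats) is logged but does not change the overall result.
    import logging

    log = logging.getLogger("sketch")

    def clear_cache_then_show_stats(show_stats) -> bool:
        if not show_stats():
            log.error("Clearing the cache was successful, "
                      "but showing the cache stats failed")
        return True

    assert clear_cache_then_show_stats(lambda: False) is True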