diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml deleted file mode 100644 index 6e02b72..0000000 --- a/.github/FUNDING.yml +++ /dev/null @@ -1,13 +0,0 @@ -# These are supported funding model platforms - -github: maciejwalkowiak -patreon: # Replace with a single Patreon username -open_collective: # Replace with a single Open Collective username -ko_fi: # Replace with a single Ko-fi username -tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel -community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry -liberapay: # Replace with a single Liberapay username -issuehunt: # Replace with a single IssueHunt username -otechie: # Replace with a single Otechie username -lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry -custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] diff --git a/.gitignore b/.gitignore index 3e403e3..d4ba9a6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +output/ target/ !.mvn/wrapper/maven-wrapper.jar !**/src/main/**/target/ diff --git a/LICENSE b/LICENSE index 71997b4..b889d64 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License -Copyright (c) 2022 Maciej Walkowiak +Copyright (c) 2024 Alexander Schonfeld +Copyright (c) 2023 Maciej Walkowiak Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index b3bf5ec..2dbf107 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,18 @@ -# Java CLI project template +# CureHunter Data Transformer Example Java CLI -Probably the fastest way to bootstrap a **Java** Command Line Application (**CLI**) project! +Example of CLI XSL transformer for licensed [CureHunter](https://curehunter.com) data. +Multithreaded conversion of CureHunter sentence relationship and MeSH keyword tagged NLM PubMed Data ".xml.gz" files to TSV etc. -Click the big green *Use This Template* button and start building your own CLI. +"medlineCitationTSV.xsl" extracts all medline citation abstracts from the sample "./data/pubmed-sample*.xml.gz" pubmed xml files which contain Drug-Disease relationships. -## 🤩 Highlights +## Usage -- uses [Picocli](https://picocli.info/) for simple and elegant commands implementation -- builds to an **executable jar** and **GraalVM Native Image** -- preconfigured `help` and `version` commands -- generates Bash/ZSH autocompletion scripts -- uses **Maven** as a build tool -- **no frameworks** like Spring, Micronaut or Quarkus - -## 🛠 Requirements - -- GraalVM distribution of Java (easy to install with https://sdkman.io/) +Run application through Maven -## 🤔 How to use +``` +$ mkdir output +$ ./mvnw -Dexec.args=transform +``` Run tests & build an executable JAR: @@ -31,10 +26,4 @@ Run tests as native image & build a native executable: $ ./mvnw package -Pnative ``` -Run application through Maven - -``` -$ ./mvnw -Dexec.args=--help -``` - -Sounds good? Consider [❤️ Sponsoring](https://github.com/sponsors/maciejwalkowiak) the project! Thank you! +Original Template [java-cli-project-template](https://github.com/maciejwalkowiak/java-cli-project-template)! Thank you! diff --git a/data/pubmed-sample1.xml.gz b/data/pubmed-sample1.xml.gz new file mode 100644 index 0000000..7080007 Binary files /dev/null and b/data/pubmed-sample1.xml.gz differ diff --git a/data/pubmed-sample2.xml.gz b/data/pubmed-sample2.xml.gz new file mode 100644 index 0000000..56199f3 Binary files /dev/null and b/data/pubmed-sample2.xml.gz differ diff --git a/pom.xml b/pom.xml index f10a1e4..50ead9a 100644 --- a/pom.xml +++ b/pom.xml @@ -4,8 +4,8 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - org.example - picocli-project-template + com.curehunter + curehunter-transform-project 1.0-SNAPSHOT @@ -106,7 +106,7 @@ java - org.example.Main + com.curehunter.Main @@ -126,7 +126,7 @@ --force --completionScript ${project.build.directory}/app_completion.sh - org.example.Main + com.curehunter.Main @@ -176,7 +176,7 @@ true - org.example.Main + com.curehunter.Main @@ -209,7 +209,7 @@ ${native-maven-plugin.version} true - org.example.Main + com.curehunter.Main app diff --git a/src/main/java/org/example/ListFilesCommand.java b/src/main/java/com/curehunter/ListFilesCommand.java similarity index 91% rename from src/main/java/org/example/ListFilesCommand.java rename to src/main/java/com/curehunter/ListFilesCommand.java index 888f2a5..b783623 100644 --- a/src/main/java/org/example/ListFilesCommand.java +++ b/src/main/java/com/curehunter/ListFilesCommand.java @@ -1,4 +1,4 @@ -package org.example; +package com.curehunter; import java.io.IOException; import java.nio.file.Files; @@ -17,7 +17,7 @@ public class ListFilesCommand implements Callable { private Path path; @Override public Integer call() { - WildcardFileFilter filter = new WildcardFileFilter("*.java"); + WildcardFileFilter filter = new WildcardFileFilter("*"); try (Stream stream = Files.walk(path)) { stream.filter(path -> filter.accept(path.toFile())) diff --git a/src/main/java/org/example/Main.java b/src/main/java/com/curehunter/Main.java similarity index 64% rename from src/main/java/org/example/Main.java rename to src/main/java/com/curehunter/Main.java index ccb5c7e..adaa231 100644 --- a/src/main/java/org/example/Main.java +++ b/src/main/java/com/curehunter/Main.java @@ -1,10 +1,12 @@ -package org.example; +package com.curehunter; + +import com.curehunter.picocli.PropertiesFileVersionProvider; +import com.curehunter.transform.TransformAll; -import org.example.picocli.PropertiesFileVersionProvider; import picocli.CommandLine; @CommandLine.Command(name = "app", mixinStandardHelpOptions = true, versionProvider = PropertiesFileVersionProvider.class, subcommands = { - HelloCommand.class, ListFilesCommand.class }) + ListFilesCommand.class, TransformAll.class }) public class Main { public static void main(String[] args) { diff --git a/src/main/java/org/example/picocli/PropertiesFileVersionProvider.java b/src/main/java/com/curehunter/picocli/PropertiesFileVersionProvider.java similarity index 96% rename from src/main/java/org/example/picocli/PropertiesFileVersionProvider.java rename to src/main/java/com/curehunter/picocli/PropertiesFileVersionProvider.java index 739e854..6d45c0f 100644 --- a/src/main/java/org/example/picocli/PropertiesFileVersionProvider.java +++ b/src/main/java/com/curehunter/picocli/PropertiesFileVersionProvider.java @@ -1,4 +1,4 @@ -package org.example.picocli; +package com.curehunter.picocli; import java.io.InputStream; import java.net.URL; diff --git a/src/main/java/com/curehunter/transform/TransformAll.java b/src/main/java/com/curehunter/transform/TransformAll.java new file mode 100644 index 0000000..71e996a --- /dev/null +++ b/src/main/java/com/curehunter/transform/TransformAll.java @@ -0,0 +1,122 @@ +package com.curehunter.transform; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.zip.GZIPInputStream; + +import javax.xml.parsers.SAXParserFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; + +import com.curehunter.utils.FileIterator; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.concurrent.Callable; + +import org.apache.commons.io.filefilter.WildcardFileFilter; +import picocli.CommandLine.Command; +import picocli.CommandLine.Option; + +@Command(name = "transform") +public class TransformAll implements Callable { + + @Option(names = { "-h", "--help" }, usageHelp = true, description = "display this help and exit") + boolean help; + + @Option(names = { "-i", "--in" }, description = "directory to search for files, default: ./data") + private Path inputDir = Paths.get(System.getProperty("user.dir")+"/data"); + + @Option(names = { "-f", "--filter" }, description = "filter files with wildcard pattern, default: *.xml.gz", defaultValue = "*.xml.gz") + private String filterPattern; + + @Option(names = { "-o", "--out" }, description = "output files to directory, default: ./output") + private Path outputDir = Paths.get(System.getProperty("user.dir")+"/output"); + + @Option(names = { "-x", "--xsl" }, description = "XSL file to use for transformation, default: ./xsl/medlineCitationTSV.xsl") + private Path xslFile = Paths.get(System.getProperty("user.dir")+"/xsl/medlineCitationTSV.xsl"); + + @Option(names = { "-e", "--outExt" }, description = "extension to append to output files, default: .out", defaultValue = ".out") + private String outputExtension; + + @Option(names = { "-w", "--workers" }, description = "number of worker threads, files to process simultaneously, default: 4", defaultValue = "4") + private int workerThreads; + + @Override + public Integer call() { + long startTime = System.currentTimeMillis(); + FileIterator workerData = new FileIterator(inputDir.toFile()); + WildcardFileFilter filter = new WildcardFileFilter(filterPattern); + try { + System.out.println("in=" + inputDir.toFile().getCanonicalPath() + " out=" + + outputDir.toFile().getCanonicalPath() + + " xsl=" + xslFile.toFile().getCanonicalPath()); + ExecutorService exec = Executors.newFixedThreadPool(workerThreads); + for (int n = 0; n < workerThreads; n++) { + exec.execute(new Worker(workerData, xslFile.toFile(), outputDir.toFile(), + outputExtension, filter)); + } + exec.shutdown(); + exec.awaitTermination(1, TimeUnit.DAYS); + System.out.println("Processing complete in " + (System.currentTimeMillis() - startTime) + "ms"); + return 0; + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + System.out.println("Processing complete in " + (System.currentTimeMillis() - startTime) + "ms"); + return 1; + } + + static class Worker implements Runnable { + FileIterator workerData; + File xslt; + File outputDir; + String outputExtension; + WildcardFileFilter filter; + + public Worker(FileIterator workerData, File xslt, File outputDir, String outputExtension, + WildcardFileFilter filter) { + this.workerData = workerData; + this.xslt = xslt; + this.outputDir = outputDir; + this.outputExtension = outputExtension; + this.filter = filter; + } + + public void run() { + SAXParserFactory parserFactory = SAXParserFactory.newInstance(); + TransformerFactory transformerFactory = TransformerFactory.newInstance(); + System.out.println("transformerFactory=" + transformerFactory + " parserFactory=" + parserFactory); + try { + Transformer transformer = transformerFactory.newTransformer(new StreamSource(xslt)); + File f = null; + while ((f = workerData.next()) != null) { + if (filter.accept(f)) { + System.out.println("start transforming file=" + f.getCanonicalPath()); + try { + transformer.transform(new StreamSource( + f.getCanonicalPath().toLowerCase().endsWith(".gz") + ? new GZIPInputStream(new FileInputStream(f)) + : new FileInputStream(f)), + new StreamResult(new File(outputDir, f.getName() + outputExtension))); + System.out.println("done transforming file=" + f.getCanonicalPath()); + } catch (Throwable e) { + System.err.printf("error transforming file=" + f.getCanonicalPath(), e); + e.printStackTrace(); + } + } + } + } catch (Exception e) { + e.printStackTrace(); + } + } + } +} diff --git a/src/main/java/com/curehunter/utils/FileIterator.java b/src/main/java/com/curehunter/utils/FileIterator.java new file mode 100644 index 0000000..92fb10b --- /dev/null +++ b/src/main/java/com/curehunter/utils/FileIterator.java @@ -0,0 +1,50 @@ +package com.curehunter.utils; + +import java.io.File; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; + +/** + * Iterate files in a directory, returning each file only once. + */ +public class FileIterator implements Iterator { + + private File parentDir; + private Collection seenFiles = new HashSet(); + + public FileIterator(File parentDir) { + this.parentDir = parentDir; + } + + public boolean hasNext() { + synchronized (this.seenFiles) { + return (nextInternal() != null); + } + } + + public File next() { + synchronized (this.seenFiles) { + File next = nextInternal(); + this.seenFiles.add(next); + return next; + } + } + + private File nextInternal() { + File childFiles[] = this.parentDir.listFiles(); + if (childFiles != null) { + Arrays.sort(childFiles); + for (int n = 0; n < childFiles.length; n++) { + if (!this.seenFiles.contains(childFiles[n]) && childFiles[n].isFile()) { + return childFiles[n]; + } + } + } + return null; + } + + public void remove() { + } +} diff --git a/src/main/java/org/example/HelloCommand.java b/src/main/java/org/example/HelloCommand.java deleted file mode 100644 index 50ae37d..0000000 --- a/src/main/java/org/example/HelloCommand.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.example; - -import java.util.concurrent.Callable; - -import picocli.CommandLine; - -@CommandLine.Command(name = "hello") -public class HelloCommand implements Callable { - - @CommandLine.Option(names = "--name", defaultValue = "World") - private String name; - - @Override public Integer call() { - System.out.printf("Hello %s!%n", name); - return 0; - } -} diff --git a/src/test/java/org/example/MainTests.java b/src/test/java/com/curehunter/MainTests.java similarity index 96% rename from src/test/java/org/example/MainTests.java rename to src/test/java/com/curehunter/MainTests.java index 42f6137..7064710 100644 --- a/src/test/java/org/example/MainTests.java +++ b/src/test/java/com/curehunter/MainTests.java @@ -1,10 +1,11 @@ -package org.example; +package com.curehunter; import java.io.PrintWriter; import java.io.StringWriter; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; + import picocli.CommandLine; import static org.assertj.core.api.Assertions.assertThat; diff --git a/xsl/medlineCitationTSV.xsl b/xsl/medlineCitationTSV.xsl new file mode 100644 index 0000000..e87f8ea --- /dev/null +++ b/xsl/medlineCitationTSV.xsl @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +