diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
deleted file mode 100644
index 6e02b72..0000000
--- a/.github/FUNDING.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-# These are supported funding model platforms
-
-github: maciejwalkowiak
-patreon: # Replace with a single Patreon username
-open_collective: # Replace with a single Open Collective username
-ko_fi: # Replace with a single Ko-fi username
-tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
-community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
-liberapay: # Replace with a single Liberapay username
-issuehunt: # Replace with a single IssueHunt username
-otechie: # Replace with a single Otechie username
-lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
-custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/.gitignore b/.gitignore
index 3e403e3..d4ba9a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+output/
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
diff --git a/LICENSE b/LICENSE
index 71997b4..b889d64 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
MIT License
-Copyright (c) 2022 Maciej Walkowiak
+Copyright (c) 2024 Alexander Schonfeld
+Copyright (c) 2023 Maciej Walkowiak
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index b3bf5ec..2dbf107 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,18 @@
-# Java CLI project template
+# CureHunter Data Transformer Example Java CLI
-Probably the fastest way to bootstrap a **Java** Command Line Application (**CLI**) project!
+Example CLI that applies XSL transformations to licensed [CureHunter](https://curehunter.com) data.
+It performs multithreaded conversion of NLM PubMed data files (".xml.gz"), tagged with CureHunter sentence relationships and MeSH keywords, to TSV and other formats.
-Click the big green *Use This Template* button and start building your own CLI.
+The "medlineCitationTSV.xsl" stylesheet extracts all MEDLINE citation abstracts from the sample PubMed XML files ("./data/pubmed-sample*.xml.gz"), which contain Drug-Disease relationships.
-## 🤩 Highlights
+## Usage
-- uses [Picocli](https://picocli.info/) for simple and elegant commands implementation
-- builds to an **executable jar** and **GraalVM Native Image**
-- preconfigured `help` and `version` commands
-- generates Bash/ZSH autocompletion scripts
-- uses **Maven** as a build tool
-- **no frameworks** like Spring, Micronaut or Quarkus
-
-## 🛠 Requirements
-
-- GraalVM distribution of Java (easy to install with https://sdkman.io/)
+Run the application through Maven:
-## 🤔 How to use
+```
+$ mkdir output
+$ ./mvnw -Dexec.args=transform
+```
Run tests & build an executable JAR:
@@ -31,10 +26,4 @@ Run tests as native image & build a native executable:
$ ./mvnw package -Pnative
```
-Run application through Maven
-
-```
-$ ./mvnw -Dexec.args=--help
-```
-
-Sounds good? Consider [❤️ Sponsoring](https://github.com/sponsors/maciejwalkowiak) the project! Thank you!
+Based on the original template [java-cli-project-template](https://github.com/maciejwalkowiak/java-cli-project-template). Thank you!
diff --git a/data/pubmed-sample1.xml.gz b/data/pubmed-sample1.xml.gz
new file mode 100644
index 0000000..7080007
Binary files /dev/null and b/data/pubmed-sample1.xml.gz differ
diff --git a/data/pubmed-sample2.xml.gz b/data/pubmed-sample2.xml.gz
new file mode 100644
index 0000000..56199f3
Binary files /dev/null and b/data/pubmed-sample2.xml.gz differ
diff --git a/pom.xml b/pom.xml
index f10a1e4..50ead9a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,8 +4,8 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4.0.0
- org.example
- picocli-project-template
+ com.curehunter
+ curehunter-transform-project
1.0-SNAPSHOT
@@ -106,7 +106,7 @@
java
- org.example.Main
+ com.curehunter.Main
@@ -126,7 +126,7 @@
--force
--completionScript
${project.build.directory}/app_completion.sh
- org.example.Main
+ com.curehunter.Main
@@ -176,7 +176,7 @@
true
- org.example.Main
+ com.curehunter.Main
@@ -209,7 +209,7 @@
${native-maven-plugin.version}
true
- org.example.Main
+ com.curehunter.Main
app
diff --git a/src/main/java/org/example/ListFilesCommand.java b/src/main/java/com/curehunter/ListFilesCommand.java
similarity index 91%
rename from src/main/java/org/example/ListFilesCommand.java
rename to src/main/java/com/curehunter/ListFilesCommand.java
index 888f2a5..b783623 100644
--- a/src/main/java/org/example/ListFilesCommand.java
+++ b/src/main/java/com/curehunter/ListFilesCommand.java
@@ -1,4 +1,4 @@
-package org.example;
+package com.curehunter;
import java.io.IOException;
import java.nio.file.Files;
@@ -17,7 +17,7 @@ public class ListFilesCommand implements Callable {
private Path path;
@Override public Integer call() {
- WildcardFileFilter filter = new WildcardFileFilter("*.java");
+ WildcardFileFilter filter = new WildcardFileFilter("*");
try (Stream stream = Files.walk(path)) {
stream.filter(path -> filter.accept(path.toFile()))
diff --git a/src/main/java/org/example/Main.java b/src/main/java/com/curehunter/Main.java
similarity index 64%
rename from src/main/java/org/example/Main.java
rename to src/main/java/com/curehunter/Main.java
index ccb5c7e..adaa231 100644
--- a/src/main/java/org/example/Main.java
+++ b/src/main/java/com/curehunter/Main.java
@@ -1,10 +1,12 @@
-package org.example;
+package com.curehunter;
+
+import com.curehunter.picocli.PropertiesFileVersionProvider;
+import com.curehunter.transform.TransformAll;
-import org.example.picocli.PropertiesFileVersionProvider;
import picocli.CommandLine;
@CommandLine.Command(name = "app", mixinStandardHelpOptions = true, versionProvider = PropertiesFileVersionProvider.class, subcommands = {
- HelloCommand.class, ListFilesCommand.class })
+ ListFilesCommand.class, TransformAll.class })
public class Main {
public static void main(String[] args) {
diff --git a/src/main/java/org/example/picocli/PropertiesFileVersionProvider.java b/src/main/java/com/curehunter/picocli/PropertiesFileVersionProvider.java
similarity index 96%
rename from src/main/java/org/example/picocli/PropertiesFileVersionProvider.java
rename to src/main/java/com/curehunter/picocli/PropertiesFileVersionProvider.java
index 739e854..6d45c0f 100644
--- a/src/main/java/org/example/picocli/PropertiesFileVersionProvider.java
+++ b/src/main/java/com/curehunter/picocli/PropertiesFileVersionProvider.java
@@ -1,4 +1,4 @@
-package org.example.picocli;
+package com.curehunter.picocli;
import java.io.InputStream;
import java.net.URL;
diff --git a/src/main/java/com/curehunter/transform/TransformAll.java b/src/main/java/com/curehunter/transform/TransformAll.java
new file mode 100644
index 0000000..71e996a
--- /dev/null
+++ b/src/main/java/com/curehunter/transform/TransformAll.java
@@ -0,0 +1,122 @@
+package com.curehunter.transform;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.zip.GZIPInputStream;
+
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.transform.stream.StreamSource;
+
+import com.curehunter.utils.FileIterator;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.Callable;
+
+import org.apache.commons.io.filefilter.WildcardFileFilter;
+import picocli.CommandLine.Command;
+import picocli.CommandLine.Option;
+
+/**
+ * Picocli {@code transform} subcommand: runs one XSL stylesheet over the files of a directory.
+ *
+ * <p>A fixed pool of {@code workerThreads} workers pulls files from one shared
+ * {@link FileIterator}. Each file accepted by the wildcard filter is transformed and written to
+ * the output directory under the input file's name with {@code outputExtension} appended.
+ */
+@Command(name = "transform")
+public class TransformAll implements Callable<Integer> {
+
+    @Option(names = { "-h", "--help" }, usageHelp = true, description = "display this help and exit")
+    boolean help;
+
+    @Option(names = { "-i", "--in" }, description = "directory to search for files, default: ./data")
+    private Path inputDir = Paths.get(System.getProperty("user.dir")+"/data");
+
+    @Option(names = { "-f", "--filter" }, description = "filter files with wildcard pattern, default: *.xml.gz", defaultValue = "*.xml.gz")
+    private String filterPattern;
+
+    @Option(names = { "-o", "--out" }, description = "output files to directory, default: ./output")
+    private Path outputDir = Paths.get(System.getProperty("user.dir")+"/output");
+
+    @Option(names = { "-x", "--xsl" }, description = "XSL file to use for transformation, default: ./xsl/medlineCitationTSV.xsl")
+    private Path xslFile = Paths.get(System.getProperty("user.dir")+"/xsl/medlineCitationTSV.xsl");
+
+    @Option(names = { "-e", "--outExt" }, description = "extension to append to output files, default: .out", defaultValue = ".out")
+    private String outputExtension;
+
+    @Option(names = { "-w", "--workers" }, description = "number of worker threads, files to process simultaneously, default: 4", defaultValue = "4")
+    private int workerThreads;
+
+    /**
+     * Starts the workers, waits up to one day for them to finish and prints the elapsed time.
+     *
+     * <p>Failures of individual files are reported by the workers and do not affect the
+     * returned code.
+     *
+     * @return 0 when the pool was started and awaited without an exception; 1 when resolving a
+     *         canonical path failed or the wait was interrupted
+     */
+    @Override
+    public Integer call() {
+        long startTime = System.currentTimeMillis();
+        FileIterator workerData = new FileIterator(inputDir.toFile());
+        WildcardFileFilter filter = new WildcardFileFilter(filterPattern);
+        ExecutorService exec = null;
+        int exitCode = 1;
+        try {
+            System.out.println("in=" + inputDir.toFile().getCanonicalPath() + " out="
+                    + outputDir.toFile().getCanonicalPath()
+                    + " xsl=" + xslFile.toFile().getCanonicalPath());
+            exec = Executors.newFixedThreadPool(workerThreads);
+            // Every worker shares the same iterator, so each file is handed to exactly one worker.
+            for (int n = 0; n < workerThreads; n++) {
+                exec.execute(new Worker(workerData, xslFile.toFile(), outputDir.toFile(),
+                        outputExtension, filter));
+            }
+            exec.shutdown();
+            exec.awaitTermination(1, TimeUnit.DAYS);
+            exitCode = 0;
+        } catch (InterruptedException e) {
+            // Ask the workers to stop and keep the interrupt visible to our caller.
+            if (exec != null) {
+                exec.shutdownNow();
+            }
+            Thread.currentThread().interrupt();
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        System.out.println("Processing complete in " + (System.currentTimeMillis() - startTime) + "ms");
+        return exitCode;
+    }
+
+    /**
+     * Pool task that keeps taking files from the shared iterator until it returns {@code null},
+     * transforming each file the filter accepts. Every worker compiles its own
+     * {@link Transformer} from the stylesheet.
+     */
+    static class Worker implements Runnable {
+        final FileIterator workerData;
+        final File xslt;
+        final File outputDir;
+        final String outputExtension;
+        final WildcardFileFilter filter;
+
+        /**
+         * @param workerData      shared source of input files
+         * @param xslt            stylesheet to compile
+         * @param outputDir       directory the result files are written to
+         * @param outputExtension suffix appended to the input file name
+         * @param filter          decides which files are transformed
+         */
+        public Worker(FileIterator workerData, File xslt, File outputDir, String outputExtension,
+                WildcardFileFilter filter) {
+            this.workerData = workerData;
+            this.xslt = xslt;
+            this.outputDir = outputDir;
+            this.outputExtension = outputExtension;
+            this.filter = filter;
+        }
+
+        /** Compiles the stylesheet once, then transforms files until the iterator is exhausted. */
+        @Override
+        public void run() {
+            SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+            TransformerFactory transformerFactory = TransformerFactory.newInstance();
+            System.out.println("transformerFactory=" + transformerFactory + " parserFactory=" + parserFactory);
+            try {
+                Transformer transformer = transformerFactory.newTransformer(new StreamSource(xslt));
+                File f;
+                while ((f = workerData.next()) != null) {
+                    if (filter.accept(f)) {
+                        transformFile(transformer, f);
+                    }
+                }
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+        }
+
+        /**
+         * Transforms a single file; a failure is logged and does not stop the worker.
+         *
+         * @throws IOException if the canonical path of {@code f} cannot be resolved
+         */
+        private void transformFile(Transformer transformer, File f) throws IOException {
+            // Resolve the path once so the error branch cannot itself fail on getCanonicalPath().
+            String path = f.getCanonicalPath();
+            System.out.println("start transforming file=" + path);
+            boolean gzipped = path.toLowerCase().endsWith(".gz");
+            // try-with-resources closes the file even when the gzip header or the transform fails.
+            try (FileInputStream raw = new FileInputStream(f);
+                    java.io.InputStream in = gzipped ? new GZIPInputStream(raw) : raw) {
+                transformer.transform(new StreamSource(in),
+                        new StreamResult(new File(outputDir, f.getName() + outputExtension)));
+                System.out.println("done transforming file=" + path);
+            } catch (Throwable e) {
+                // Throwable on purpose: one bad file must not end this worker.
+                // println, not printf: the path is data and may contain '%'.
+                System.err.println("error transforming file=" + path);
+                e.printStackTrace();
+            }
+        }
+    }
+}
diff --git a/src/main/java/com/curehunter/utils/FileIterator.java b/src/main/java/com/curehunter/utils/FileIterator.java
new file mode 100644
index 0000000..92fb10b
--- /dev/null
+++ b/src/main/java/com/curehunter/utils/FileIterator.java
@@ -0,0 +1,50 @@
+package com.curehunter.utils;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+
+/**
+ * Iterate files in a directory, returning each file only once.
+ */
+public class FileIterator implements Iterator {
+
+ private File parentDir;
+ private Collection seenFiles = new HashSet();
+
+ public FileIterator(File parentDir) {
+ this.parentDir = parentDir;
+ }
+
+ public boolean hasNext() {
+ synchronized (this.seenFiles) {
+ return (nextInternal() != null);
+ }
+ }
+
+ public File next() {
+ synchronized (this.seenFiles) {
+ File next = nextInternal();
+ this.seenFiles.add(next);
+ return next;
+ }
+ }
+
+ private File nextInternal() {
+ File childFiles[] = this.parentDir.listFiles();
+ if (childFiles != null) {
+ Arrays.sort(childFiles);
+ for (int n = 0; n < childFiles.length; n++) {
+ if (!this.seenFiles.contains(childFiles[n]) && childFiles[n].isFile()) {
+ return childFiles[n];
+ }
+ }
+ }
+ return null;
+ }
+
+ public void remove() {
+ }
+}
diff --git a/src/main/java/org/example/HelloCommand.java b/src/main/java/org/example/HelloCommand.java
deleted file mode 100644
index 50ae37d..0000000
--- a/src/main/java/org/example/HelloCommand.java
+++ /dev/null
@@ -1,17 +0,0 @@
-package org.example;
-
-import java.util.concurrent.Callable;
-
-import picocli.CommandLine;
-
-@CommandLine.Command(name = "hello")
-public class HelloCommand implements Callable {
-
- @CommandLine.Option(names = "--name", defaultValue = "World")
- private String name;
-
- @Override public Integer call() {
- System.out.printf("Hello %s!%n", name);
- return 0;
- }
-}
diff --git a/src/test/java/org/example/MainTests.java b/src/test/java/com/curehunter/MainTests.java
similarity index 96%
rename from src/test/java/org/example/MainTests.java
rename to src/test/java/com/curehunter/MainTests.java
index 42f6137..7064710 100644
--- a/src/test/java/org/example/MainTests.java
+++ b/src/test/java/com/curehunter/MainTests.java
@@ -1,10 +1,11 @@
-package org.example;
+package com.curehunter;
import java.io.PrintWriter;
import java.io.StringWriter;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+
import picocli.CommandLine;
import static org.assertj.core.api.Assertions.assertThat;
diff --git a/xsl/medlineCitationTSV.xsl b/xsl/medlineCitationTSV.xsl
new file mode 100644
index 0000000..e87f8ea
--- /dev/null
+++ b/xsl/medlineCitationTSV.xsl
@@ -0,0 +1,58 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+