Skip to content

Commit

Permalink
add TransformAll CLI command
Browse files Browse the repository at this point in the history
  • Loading branch information
areyasouka committed Jan 23, 2024
1 parent 7008e7e commit d82e3a8
Show file tree
Hide file tree
Showing 15 changed files with 260 additions and 66 deletions.
13 changes: 0 additions & 13 deletions .github/FUNDING.yml

This file was deleted.

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
output/
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
Expand Down
3 changes: 2 additions & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
MIT License

Copyright (c) 2022 Maciej Walkowiak
Copyright (c) 2024 Alexander Schonfeld
Copyright (c) 2023 Maciej Walkowiak

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
33 changes: 11 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
# Java CLI project template
# CureHunter Data Transformer Example Java CLI

Probably the fastest way to bootstrap a **Java** Command Line Application (**CLI**) project!
Example CLI XSL transformer for licensed [CureHunter](https://curehunter.com) data.
Performs multithreaded conversion of NLM PubMed data files (".xml.gz"), tagged with CureHunter sentence relationships and MeSH keywords, to TSV and other text formats.

Click the big green *Use This Template* button and start building your own CLI.
"medlineCitationTSV.xsl" extracts all MEDLINE citation abstracts from the sample PubMed XML files ("./data/pubmed-sample*.xml.gz"), which contain Drug-Disease relationships.

## 🤩 Highlights
## Usage

- uses [Picocli](https://picocli.info/) for simple and elegant commands implementation
- builds to an **executable jar** and **GraalVM Native Image**
- preconfigured `help` and `version` commands
- generates Bash/ZSH autocompletion scripts
- uses **Maven** as a build tool
- **no frameworks** like Spring, Micronaut or Quarkus

## 🛠 Requirements

- GraalVM distribution of Java (easy to install with https://sdkman.io/)
Run application through Maven

## 🤔 How to use
```
$ mkdir output
$ ./mvnw -Dexec.args=transform
```

Run tests & build an executable JAR:

Expand All @@ -31,10 +26,4 @@ Run tests as native image & build a native executable:
$ ./mvnw package -Pnative
```

Run application through Maven

```
$ ./mvnw -Dexec.args=--help
```

Sounds good? Consider [❤️ Sponsoring](https://github.com/sponsors/maciejwalkowiak) the project! Thank you!
Based on the original template [java-cli-project-template](https://github.com/maciejwalkowiak/java-cli-project-template). Thank you!
Binary file added data/pubmed-sample1.xml.gz
Binary file not shown.
Binary file added data/pubmed-sample2.xml.gz
Binary file not shown.
12 changes: 6 additions & 6 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.example</groupId>
<artifactId>picocli-project-template</artifactId>
<groupId>com.curehunter</groupId>
<artifactId>curehunter-transform-project</artifactId>
<version>1.0-SNAPSHOT</version>

<properties>
Expand Down Expand Up @@ -106,7 +106,7 @@
<goal>java</goal>
</goals>
<configuration>
<mainClass>org.example.Main</mainClass>
<mainClass>com.curehunter.Main</mainClass>
</configuration>
</execution>
<!-- generates bash/zsh autocompletion script -->
Expand All @@ -126,7 +126,7 @@
<argument>--force</argument>
<argument>--completionScript</argument>
<argument>${project.build.directory}/app_completion.sh</argument>
<argument>org.example.Main</argument>
<argument>com.curehunter.Main</argument>
</arguments>
</configuration>
</execution>
Expand Down Expand Up @@ -176,7 +176,7 @@
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>org.example.Main</mainClass>
<mainClass>com.curehunter.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
Expand Down Expand Up @@ -209,7 +209,7 @@
<version>${native-maven-plugin.version}</version>
<extensions>true</extensions>
<configuration>
<mainClass>org.example.Main</mainClass>
<mainClass>com.curehunter.Main</mainClass>
<!-- executable file name -->
<imageName>app</imageName>
<!-- enable reachability metadata https://graalvm.github.io/native-build-tools/latest/maven-plugin.html#metadata-support -->
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.example;
package com.curehunter;

import java.io.IOException;
import java.nio.file.Files;
Expand All @@ -17,7 +17,7 @@ public class ListFilesCommand implements Callable<Integer> {
private Path path;

@Override public Integer call() {
WildcardFileFilter filter = new WildcardFileFilter("*.java");
WildcardFileFilter filter = new WildcardFileFilter("*");

try (Stream<Path> stream = Files.walk(path)) {
stream.filter(path -> filter.accept(path.toFile()))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package org.example;
package com.curehunter;

import com.curehunter.picocli.PropertiesFileVersionProvider;
import com.curehunter.transform.TransformAll;

import org.example.picocli.PropertiesFileVersionProvider;
import picocli.CommandLine;

@CommandLine.Command(name = "app", mixinStandardHelpOptions = true, versionProvider = PropertiesFileVersionProvider.class, subcommands = {
HelloCommand.class, ListFilesCommand.class })
ListFilesCommand.class, TransformAll.class })
public class Main {

public static void main(String[] args) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.example.picocli;
package com.curehunter.picocli;

import java.io.InputStream;
import java.net.URL;
Expand Down
122 changes: 122 additions & 0 deletions src/main/java/com/curehunter/transform/TransformAll.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package com.curehunter.transform;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;

import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import com.curehunter.utils.FileIterator;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.Callable;

import org.apache.commons.io.filefilter.WildcardFileFilter;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;

/**
 * Picocli {@code transform} subcommand: applies one XSL stylesheet to every
 * file in an input directory whose name matches a wildcard pattern, writing
 * one output file per input file.
 *
 * <p>Work is spread over a fixed pool of {@link Worker}s that all pull from a
 * single shared {@link FileIterator}, so each input file is handed to exactly
 * one worker. Inputs whose canonical path ends in {@code .gz} are
 * decompressed on the fly.
 *
 * <p>Exit code: {@code 0} when every matching file was transformed,
 * {@code 1} when the options are unusable, the run was interrupted or timed
 * out, or any worker or file failed.
 */
@Command(name = "transform")
public class TransformAll implements Callable<Integer> {

    @Option(names = { "-h", "--help" }, usageHelp = true, description = "display this help and exit")
    boolean help;

    @Option(names = { "-i", "--in" }, description = "directory to search for files, default: ./data")
    private Path inputDir = Paths.get(System.getProperty("user.dir")+"/data");

    @Option(names = { "-f", "--filter" }, description = "filter files with wildcard pattern, default: *.xml.gz", defaultValue = "*.xml.gz")
    private String filterPattern;

    @Option(names = { "-o", "--out" }, description = "output files to directory, default: ./output")
    private Path outputDir = Paths.get(System.getProperty("user.dir")+"/output");

    @Option(names = { "-x", "--xsl" }, description = "XSL file to use for transformation, default: ./xsl/medlineCitationTSV.xsl")
    private Path xslFile = Paths.get(System.getProperty("user.dir")+"/xsl/medlineCitationTSV.xsl");

    @Option(names = { "-e", "--outExt" }, description = "extension to append to output files, default: .out", defaultValue = ".out")
    private String outputExtension;

    @Option(names = { "-w", "--workers" }, description = "number of worker threads, files to process simultaneously, default: 4", defaultValue = "4")
    private int workerThreads;

    /**
     * Runs the transformation and reports the elapsed time.
     *
     * @return {@code 0} on full success, {@code 1} otherwise
     */
    @Override
    public Integer call() {
        long startTime = System.currentTimeMillis();
        int exitCode = 1;
        try {
            exitCode = runWorkers();
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can see the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("Processing complete in " + (System.currentTimeMillis() - startTime) + "ms");
        return exitCode;
    }

    /**
     * Validates the options, starts the worker pool and waits for it to drain.
     *
     * @return {@code 0} if no worker recorded a failure, {@code 1} otherwise
     * @throws IOException if a configured path cannot be canonicalised
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    private int runWorkers() throws IOException, InterruptedException {
        File outDir = outputDir.toFile();
        System.out.println("in=" + inputDir.toFile().getCanonicalPath() + " out="
                + outDir.getCanonicalPath()
                + " xsl=" + xslFile.toFile().getCanonicalPath());

        // newFixedThreadPool rejects sizes below 1 with an opaque exception; say why instead.
        if (workerThreads < 1) {
            System.err.println("number of worker threads must be at least 1, got " + workerThreads);
            return 1;
        }
        // Without the output directory every single transform would fail.
        if (!outDir.isDirectory() && !outDir.mkdirs()) {
            System.err.println("cannot create output directory=" + outDir.getCanonicalPath());
            return 1;
        }

        FileIterator workerData = new FileIterator(inputDir.toFile());
        WildcardFileFilter filter = new WildcardFileFilter(filterPattern);
        Worker[] workers = new Worker[workerThreads];
        ExecutorService exec = Executors.newFixedThreadPool(workerThreads);
        try {
            for (int n = 0; n < workerThreads; n++) {
                workers[n] = new Worker(workerData, xslFile.toFile(), outDir, outputExtension, filter);
                exec.execute(workers[n]);
            }
            exec.shutdown();
            if (!exec.awaitTermination(1, TimeUnit.DAYS)) {
                System.err.println("timed out waiting for workers to finish");
                return 1;
            }
        } finally {
            // No-op after a clean finish; on timeout or interruption it stops the remaining workers.
            exec.shutdownNow();
        }

        int failures = 0;
        for (Worker worker : workers) {
            failures += worker.failures;
        }
        return failures == 0 ? 0 : 1;
    }

    /**
     * Pulls files from the shared iterator until it is exhausted and
     * transforms those accepted by the filter. Each worker compiles its own
     * {@link Transformer}, because transformer instances must not be shared
     * between threads.
     */
    static class Worker implements Runnable {
        FileIterator workerData;
        File xslt;
        File outputDir;
        String outputExtension;
        WildcardFileFilter filter;
        /** Number of failures seen by this worker; written only by the worker thread. */
        volatile int failures;

        /**
         * @param workerData shared source of input files; {@code next()} returns {@code null} when exhausted
         * @param xslt stylesheet to compile and apply
         * @param outputDir directory that receives the output files
         * @param outputExtension suffix appended to the input file name to form the output name
         * @param filter decides which input files are transformed
         */
        public Worker(FileIterator workerData, File xslt, File outputDir, String outputExtension,
                WildcardFileFilter filter) {
            this.workerData = workerData;
            this.xslt = xslt;
            this.outputDir = outputDir;
            this.outputExtension = outputExtension;
            this.filter = filter;
        }

        /**
         * Compiles the stylesheet once, then processes files until none are
         * left. A failure on one file is recorded and does not stop the
         * worker; a failure to compile the stylesheet ends it.
         */
        @Override
        public void run() {
            SAXParserFactory parserFactory = SAXParserFactory.newInstance();
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            System.out.println("transformerFactory=" + transformerFactory + " parserFactory=" + parserFactory);
            try {
                Transformer transformer = transformerFactory.newTransformer(new StreamSource(xslt));
                File f;
                while ((f = workerData.next()) != null) {
                    if (filter.accept(f)) {
                        transformFile(transformer, f);
                    }
                }
            } catch (Exception e) {
                failures++;
                e.printStackTrace();
            }
        }

        /**
         * Transforms a single file into {@code outputDir/<name><outputExtension>}.
         *
         * @param transformer this worker's compiled stylesheet
         * @param f input file, gunzipped when its canonical path ends in {@code .gz}
         * @throws IOException if the canonical path of {@code f} cannot be resolved
         */
        private void transformFile(Transformer transformer, File f) throws IOException {
            String path = f.getCanonicalPath();
            System.out.println("start transforming file=" + path);
            boolean gzipped = path.toLowerCase().endsWith(".gz");
            // try-with-resources closes the file even when the GZIP header is bad or the
            // transform throws; for plain files both names refer to the same stream.
            try (java.io.InputStream raw = new FileInputStream(f);
                    java.io.InputStream in = gzipped ? new GZIPInputStream(raw) : raw) {
                transformer.transform(new StreamSource(in),
                        new StreamResult(new File(outputDir, f.getName() + outputExtension)));
                System.out.println("done transforming file=" + path);
            } catch (Throwable e) {
                // Deliberately broad: one bad file must not stop the remaining files.
                failures++;
                // println, not printf: a '%' in the path must not be read as a format specifier.
                System.err.println("error transforming file=" + path);
                e.printStackTrace();
            }
        }
    }
}
50 changes: 50 additions & 0 deletions src/main/java/com/curehunter/utils/FileIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package com.curehunter.utils;

import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;

/**
 * Iterate files in a directory, returning each file only once.
 *
 * <p>The directory is listed again on every call, so regular files that
 * appear while iteration is in progress are still returned. Candidates are
 * visited in the natural ordering of {@link File}; sub-directories are
 * skipped.
 *
 * <p>{@link #hasNext()} and {@link #next()} synchronise on one internal
 * monitor, so a single instance can be shared by several consumer threads
 * and each file is handed to only one of them.
 *
 * <p>Unlike the general {@link Iterator} contract, {@link #next()} returns
 * {@code null} when nothing is left instead of throwing.
 */
public class FileIterator implements Iterator<File> {

    private final File parentDir;
    /** Files already handed out; also serves as the lock for all public methods. */
    private final Collection<File> seenFiles = new HashSet<File>();

    /**
     * @param parentDir directory whose regular files are iterated
     */
    public FileIterator(File parentDir) {
        this.parentDir = parentDir;
    }

    /**
     * @return {@code true} if the directory currently holds a regular file
     *         that has not been returned yet
     */
    @Override
    public boolean hasNext() {
        synchronized (this.seenFiles) {
            return nextInternal() != null;
        }
    }

    /**
     * Claims and returns the next unseen file.
     *
     * @return the next file, or {@code null} when every file has been returned
     */
    @Override
    public File next() {
        synchronized (this.seenFiles) {
            File next = nextInternal();
            // Only real files are recorded; an exhausted iterator must not store null.
            if (next != null) {
                this.seenFiles.add(next);
            }
            return next;
        }
    }

    /**
     * Finds the first regular file, in sorted order, that has not been
     * returned yet. Callers must hold the {@code seenFiles} monitor.
     *
     * @return the candidate file, or {@code null} if there is none or the
     *         directory cannot be listed
     */
    private File nextInternal() {
        File[] childFiles = this.parentDir.listFiles();
        if (childFiles == null) {
            return null;
        }
        Arrays.sort(childFiles);
        for (File child : childFiles) {
            if (!this.seenFiles.contains(child) && child.isFile()) {
                return child;
            }
        }
        return null;
    }

    /** Does nothing: files are never removed from disk or from the iteration. */
    @Override
    public void remove() {
    }
}
17 changes: 0 additions & 17 deletions src/main/java/org/example/HelloCommand.java

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package org.example;
package com.curehunter;

import java.io.PrintWriter;
import java.io.StringWriter;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import picocli.CommandLine;

import static org.assertj.core.api.Assertions.assertThat;
Expand Down
Loading

0 comments on commit d82e3a8

Please sign in to comment.