diff --git a/oap-formats/oap-tsv/README.md b/oap-formats/oap-tsv/README.md new file mode 100644 index 0000000000..81b572d199 --- /dev/null +++ b/oap-formats/oap-tsv/README.md @@ -0,0 +1,12 @@ +# oap-tsv + +TSV is a Tab Separated Value format. Unlike Comma Separated Value (CSV), it uses ONLY the tab character to distinguish columns. +CSV may be delimited by comma, tab, semi-colon, pipe, etc. +CSV also has some rules for separating columns whose data contains special characters (a.k.a. escaping). +Data is wrapped in quotes if the separator is a comma. +For example, [1..3] becomes '"1","2","3"' (with comma) +and [1..3] becomes '1 2 3' (with tabs) + +Strict rules also give TSV the ability to be a little bit faster than CSV. + +See https://github.com/eBay/tsv-utils/blob/master/docs/comparing-tsv-and-csv.md diff --git a/oap-formats/oap-tsv/oap-tsv-test/pom.xml b/oap-formats/oap-tsv/oap-tsv-test/pom.xml new file mode 100644 index 0000000000..d7836d3ead --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/pom.xml @@ -0,0 +1,59 @@ + + + + + 4.0.0 + oap-tsv-test + oap-tsv-test + + + oap + oap-tsv-parent + ${oap.project.version} + + + + + oap + oap-tsv + ${project.version} + + + + oap + oap-stdlib-test + ${oap.project.version} + + + + org.projectlombok + lombok + ${oap.deps.lombok.version} + provided + + + diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/main/java/oap/tsv/test/TsvAssertion.java b/oap-formats/oap-tsv/oap-tsv-test/src/main/java/oap/tsv/test/TsvAssertion.java new file mode 100644 index 0000000000..8d325e1334 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/main/java/oap/tsv/test/TsvAssertion.java @@ -0,0 +1,232 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv.test; + +import lombok.EqualsAndHashCode; +import lombok.ToString; +import oap.io.Files; +import oap.io.content.ContentReader; +import oap.tsv.Tsv; +import oap.tsv.TsvStream.Header; +import oap.util.Lists; +import org.assertj.core.api.AbstractAssert; + +import java.io.File; +import java.io.InputStream; +import java.nio.file.Path; +import java.util.List; + +import static oap.io.content.ContentReader.ofString; +import static org.assertj.core.api.Assertions.assertThat; + +/** Fluent AssertJ-style assertions over a {@link Tsv}: header checks and row comparisons, created through the static {@code assertTsv} factories. */ public class TsvAssertion extends AbstractAssert { + protected TsvAssertion( String value ) { + this( value, true ); + } + + /** Reads {@code value} through {@code Tsv.tsv.ofSeparatedValues()}; {@code withHeaders()} is applied to the result only when {@code withHeaders} is true. */ protected TsvAssertion( String value, boolean withHeaders ) { + this( withHeaders ?
ContentReader.read( value, Tsv.tsv.ofSeparatedValues() ).withHeaders().toTsv() + : ContentReader.read( value, Tsv.tsv.ofSeparatedValues() ).toTsv() ); + } + + protected TsvAssertion( Tsv value ) { + super( value, TsvAssertion.class ); + } + + /** Entry point for string content; headers are enabled (delegates with {@code true}). */ public static TsvAssertion assertTsv( String tsv ) { + return assertTsv( tsv, true ); + } + + public static TsvAssertion assertTsv( String tsv, boolean withHeaders ) { + return new TsvAssertion( tsv, withHeaders ); + } + + public static TsvAssertion assertTsv( Tsv tsv ) { + return new TsvAssertion( tsv ); + } + + public static TsvAssertion assertTsv( Path path ) { + return assertTsv( path, true ); + } + + public static TsvAssertion assertTsv( Path path, boolean withHeaders ) { + return assertTsv( Files.read( path, ofString() ), withHeaders ); + } + + public static TsvAssertion assertTsv( File file ) { + return assertTsv( file, true ); + } + + public static TsvAssertion assertTsv( File file, boolean withHeaders ) { + return assertTsv( Files.read( file.toPath(), ofString() ), withHeaders ); + } + + public static TsvAssertion assertTsv( InputStream is ) { + return assertTsv( is, true ); + } + + public static TsvAssertion assertTsv( InputStream is, boolean withHeaders ) { + return assertTsv( ContentReader.read( is, ofString() ), withHeaders ); + } + + public static Row row( String... cols ) { + return new Row( cols ); + } + + public static Header header( String... cols ) { + return new Header( cols ); + } + + /** Passes when the actual headers contain every given header; additional actual headers are allowed. */ public TsvAssertion hasHeaders( String... headers ) { + assertThat( actual.headers ).contains( headers ); + return this; + } + + public TsvAssertion hasHeaders( Iterable headers ) { + assertThat( actual.headers ).containsAll( headers ); + return this; + } + + public TsvAssertion hasHeaders( Header header ) { + assertThat( actual.headers ).containsAll( header.cols ); + return this; + } + + public TsvAssertion containOnlyHeaders( String...
headers ) { + assertThat( actual.headers ).containsOnly( headers ); + return this; + } + + @SafeVarargs + public final TsvAssertion containsExactlyInAnyOrderEntriesOf( List... entries ) { + assertThat( actual.data ) + .containsExactlyInAnyOrderElementsOf( List.of( entries ) ); + return this; + } + + /** Requires the header's columns to be present and every row to have as many columns as the header, then compares the data of the selected columns with {@code rows}, ignoring row order. */ public TsvAssertion containsExactlyInAnyOrderEntriesOf( Header header, Row... rows ) { + hasHeaders( header ); + for( var row : rows ) { + assertThat( row.cols ) + .withFailMessage( "entries length doesnt match headers" ) + .hasSize( header.size() ); + } + assertThat( actual.stream() + .select( header ) + .stripHeaders() + .toTsv() + .data ) + .containsExactlyInAnyOrderElementsOf( Lists.map( rows, r -> r.cols ) ); + + return this; + } + + /** Same preconditions as the exact variant; passes when at least one of {@code rows} occurs in the selected columns. */ public TsvAssertion containsAnyEntriesOf( Header header, Row... rows ) { + hasHeaders( header ); /* Header overload, consistent with the sibling methods */ + for( var row : rows ) { + assertThat( row.cols ) + .withFailMessage( "entries length doesnt match headers" ) + .hasSize( header.size() ); + } + + assertThat( actual.stream() + .select( header ) + .stripHeaders() + .toTsv() + .data ) + .containsAnyElementsOf( Lists.map( rows, r -> r.cols ) ); + return this; + } + + public TsvAssertion containsOnlyOnceEntriesOf( Header header, Row... rows ) { + hasHeaders( header ); + for( var row : rows ) { + assertThat( row.cols ) + .withFailMessage( "entries length doesnt match headers" ) + .hasSize( header.size() ); + } + assertThat( actual.stream() + .select( header ) + .stripHeaders() + .toTsv() + .data ).containsOnlyOnceElementsOf( Lists.map( rows, r -> r.cols ) ); + return this; + } + + public TsvAssertion doesNotContainAnyEntriesOf( Header header, Row...
rows ) { + hasHeaders( header ); + for( var row : rows ) { + assertThat( row.cols ) + .withFailMessage( "entries length doesnt match headers" ) + .hasSize( header.size() ); + } + + assertThat( actual.stream() + .select( header ) + .stripHeaders() + .toTsv() + .data ).doesNotContainAnyElementsOf( Lists.map( rows, r -> r.cols ) ); + return this; + } + + /** Whole-row variant: requires the actual TSV to have headers and each row to have one column per header, then checks that none of {@code rows} occurs in the data. */ public TsvAssertion doesNotContainAnyEntriesOf( Row... rows ) { + assertThat( actual.headers ) + .withFailMessage( "tsv must contain headers" ) + .isNotEmpty(); + for( var row : rows ) { + assertThat( row.cols ) + .withFailMessage( "entries length doesnt match headers" ) + .hasSize( actual.headers.size() ); + } + assertThat( actual.data ).doesNotContainAnyElementsOf( Lists.map( rows, r -> r.cols ) ); + return this; + } + + public TsvAssertion isNotEmpty() { + assertThat( actual.data ).isNotEmpty(); + return this; + } + + /** Parses {@code tsv} with headers, then checks that the actual headers contain the expected ones and that the data rows match regardless of row order. */ public TsvAssertion isEqualToTsv( String tsv ) { + Tsv expected = ContentReader.read( tsv, Tsv.tsv.ofSeparatedValues() ).withHeaders().toTsv(); + hasHeaders( expected.headers ); + assertThat( this.actual.data ).containsExactlyInAnyOrderElementsOf( expected.data ); + return this; + } + + public TsvAssertion isEqualToTsv( Path tsv ) { + return isEqualToTsv( Files.read( tsv, ofString() ) ); + } + + /** Expected data row; holds an immutable copy of the given column values. */ @ToString + @EqualsAndHashCode + public static class Row { + private final List cols; + + public Row( String...
cols ) { + this.cols = List.of( cols ); + } + } +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/PrinterTest.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/PrinterTest.java new file mode 100644 index 0000000000..92adfc82f5 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/PrinterTest.java @@ -0,0 +1,43 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +package oap.tsv; + +import org.testng.annotations.Test; + +import java.util.List; + +import static oap.testng.Asserts.assertString; + +/** Checks {@code Printer.print} output: unquoted tab-delimited values, quoted comma-delimited values, and doubling of embedded quotes and backslashes inside quoted values. Every printed line ends with a newline. */ public class PrinterTest { + @Test + public void print() { + assertString( Printer.print( List.of( 1, 2, 3 ), Tsv.DELIMITER_TAB ) ) + .isEqualTo( "1\t2\t3\n" ); + assertString( Printer.print( List.of( 1, 2, 3 ), Tsv.DELIMITER_COMMA, true ) ) + .isEqualTo( "\"1\",\"2\",\"3\"\n" ); + assertString( Printer.print( List.of( 1, "\"2\\\"", 3 ), Tsv.DELIMITER_COMMA, true ) ) + .isEqualTo( "\"1\",\"\"\"2\\\\\"\"\",\"3\"\n" ); + } +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TokenizerPerformance.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TokenizerPerformance.java new file mode 100644 index 0000000000..d5479ea544 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TokenizerPerformance.java @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv; + +import org.apache.commons.lang3.StringUtils; +import org.testng.annotations.Test; + +import static oap.benchmark.Benchmark.benchmark; +import static oap.tsv.Tokenizer.parse; +import static oap.tsv.Tsv.DELIMITER_TAB; +import static org.assertj.core.api.Assertions.assertThat; + +/** Runs two benchmarks over the same 18-column tab-separated line: commons-lang {@code splitByWholeSeparatorPreserveAllTokens} and {@code Tokenizer.parse}. The only assertion is the column-count sanity check; 1000000 is presumably the iteration count (confirm against Benchmark). */ public class TokenizerPerformance { + + @Test + public void perf() { + String tsv = "aaaa\tbbbb\txxxx\tddd\t19/11/2011\t33.3\taaaa\t11\txxx\tvvvv\tS\tS\t444\txxx\t4444\t1234\tN\tN"; + assertThat( parse( tsv, DELIMITER_TAB ) ).hasSize( 18 ); + benchmark( "split", 1000000, () -> StringUtils.splitByWholeSeparatorPreserveAllTokens( tsv, "\t" ) ) + .run(); + benchmark( "tokenizer", 1000000, () -> parse( tsv, DELIMITER_TAB ) ) + .run(); + } + +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TokenizerTest.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TokenizerTest.java new file mode 100644 index 0000000000..c715a54364 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TokenizerTest.java @@ -0,0 +1,63 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or
substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv; + +import org.testng.annotations.Test; + +import static oap.tsv.Tokenizer.parse; +import static oap.tsv.Tsv.DELIMITER_COMMA; +import static oap.tsv.Tsv.DELIMITER_TAB; +import static org.assertj.core.api.Assertions.assertThat; + +/** Covers {@code Tokenizer.parse}: plain splitting (a trailing delimiter yields a trailing empty token), a call with the extra arguments {@code 3, false} that returns only the first three tokens, and quoted parsing where quotes are removed, embedded delimiters are kept and a doubled quote becomes one quote. */ public class TokenizerTest { + + @Test + public void parseSimple() { + assertThat( parse( "1,22,33,44", DELIMITER_COMMA ) ) + .containsExactly( "1", "22", "33", "44" ); + assertThat( parse( "1,22,33,", DELIMITER_COMMA ) ) + .containsExactly( "1", "22", "33", "" ); + assertThat( parse( "1\t22\t33\t", DELIMITER_TAB ) ) + .containsExactly( "1", "22", "33", "" ); + } + + @Test + public void parseLimited() { + assertThat( parse( "1,22,33,44", DELIMITER_COMMA, 3, false ) ) + .containsExactly( "1", "22", "33" ); + } + + @Test + public void parseQuoted() { + assertThat( parse( "1,\"22\",33,\"44\"", DELIMITER_COMMA, true ) ) + .containsExactly( "1", "22", "33", "44" ); + assertThat( parse( "1,\"22\",33,44", DELIMITER_COMMA, true ) ) + .containsExactly( "1", "22", "33", "44" ); + assertThat( parse( "1,\"2,2\",33,44", DELIMITER_COMMA, true ) ) + .containsExactly( "1", "2,2", "33", "44" ); + assertThat( parse( "1,\"2\"\"2\",33,44", DELIMITER_COMMA, true ) ) + .containsExactly( "1", "2\"2", "33", "44" ); + } +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvArrayTest.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvArrayTest.java new file mode 100644 index
0000000000..890fd7297b --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvArrayTest.java @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +package oap.tsv; + +import junit.framework.TestCase; +import org.testng.annotations.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Checks {@code TsvArray}: print renders a bracketed, comma-separated list with strings single-quoted and an embedded single quote backslash-escaped; parse returns the elements as strings with quotes and escapes removed. NOTE(review): this class extends JUnit's TestCase although its methods use TestNG's annotation - looks accidental, confirm before removing. */ public class TsvArrayTest extends TestCase { + @Test + public void testPrint() { + assertThat( TsvArray.print( List.of( 1, 2, 3 ), null ) ) + .isEqualTo( "[1,2,3]" ); + assertThat( TsvArray.print( List.of( "1", "2", "'3\t" ), null ) ) + .isEqualTo( "['1','2','\\'3\t']" ); + } + + @Test + public void testParse() { + assertThat( TsvArray.parse( "[1,2,3]" ) ).isEqualTo( List.of( "1", "2", "3" ) ); + assertThat( TsvArray.parse( "['1','2','3']" ) ).isEqualTo( List.of( "1", "2", "3" ) ); + assertThat( TsvArray.parse( "['1','\\'2','3']" ) ).isEqualTo( List.of( "1", "'2", "3" ) ); + } +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvInputStreamTest.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvInputStreamTest.java new file mode 100644 index 0000000000..d6da5023c3 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvInputStreamTest.java @@ -0,0 +1,71 @@ +package oap.tsv; + +import org.testng.annotations.Test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.ArrayList; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +/** Exercises {@code TsvInputStream}: {@code readCells} followed by {@code line.indexOf} lookups, and the static {@code split} on tabs, including empty input, empty cells and literal backslash sequences, which are neither split on nor unescaped. */ public class TsvInputStreamTest { + + @Test + public void readCellsIndexOf() throws IOException { + var data = "a\tb\tc\t".getBytes( UTF_8 ); + var is = new TsvInputStream( new ByteArrayInputStream( data ), new byte[1024] ); + assertTrue( is.readCells() ); + + assertThat( is.line.indexOf( "a" ) ).isEqualTo( 0 ); + assertThat( is.line.indexOf( "c" ) ).isEqualTo( 2 ); + assertThat( is.line.indexOf( "d" ) ).isEqualTo( -1 ); + + assertFalse( is.readCells() ); + } + + @Test + public void testSplit() { + ArrayList split
= new ArrayList<>(); + TsvInputStream.split( "1", split ); + assertThat( split ).containsExactly( "1" ); + } + + @Test + public void testEmptyLine() { + ArrayList split = new ArrayList<>(); + TsvInputStream.split( "", split ); + assertThat( split ).containsExactly( "" ); + } + + @Test + public void testSplitTab() { + ArrayList split = new ArrayList<>(); + TsvInputStream.split( "1\t5\tttt", split ); + assertThat( split ).containsExactly( "1", "5", "ttt" ); + } + + @Test + public void testSplitTabEscape() { + ArrayList split = new ArrayList<>(); + TsvInputStream.split( "1\\t5\t\\r\\nttt", split ); + assertThat( split ).containsExactly( "1\\t5", "\\r\\nttt" ); + } + + @Test + public void testEmptyCell() { + ArrayList split = new ArrayList<>(); + TsvInputStream.split( "start\t\tend", split ); + assertThat( split ).containsExactly( "start", "", "end" ); + } + + @Test + public void testEmptyCellEnd() { + ArrayList split = new ArrayList<>(); + TsvInputStream.split( "start\t\t", split ); + assertThat( split ).containsExactly( "start", "", "" ); + } + +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvStreamTest.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvStreamTest.java new file mode 100644 index 0000000000..aa1898da90 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/TsvStreamTest.java @@ -0,0 +1,185 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice
shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv; + +import oap.io.content.ContentReader; +import org.testng.annotations.Test; + +import java.io.ByteArrayOutputStream; +import java.util.List; + +import static oap.testng.Asserts.assertString; +import static oap.tsv.test.TsvAssertion.assertTsv; +import static oap.tsv.test.TsvAssertion.header; +import static oap.tsv.test.TsvAssertion.row; +import static org.assertj.core.api.Assertions.assertThat; + +/** Covers the stream returned by {@code ContentReader.read} with {@code Tsv.tsv} / {@code Tsv.csv} {@code ofSeparatedValues()}: CSV and TSV string output (a trailing newline is added), header handling, column selection by index and by name, filtering, collecting into an output stream and mapping rows to objects. */ public class TsvStreamTest { + @Test + public void csv() { + String csv = """ + "1","2","3" + "1","2","3" + """; + assertString( ContentReader.read( csv, Tsv.csv.ofSeparatedValues() ) + .toCsvString() ) + .isEqualTo( csv ); + } + + @Test + public void csvUnquoted() { + String csv = """ + 1,2,3 + 1,2,3 + """; + assertString( ContentReader.read( csv, Tsv.csv.ofSeparatedValues() ) + .toCsvString( false ) ) + .isEqualTo( csv ); + } + + @Test + public void toList() { + assertThat( ContentReader.read( "1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .toList() ) + .containsExactly( + List.of( "1", "2", "3" ), + List.of( "1", "2", "3" ) ); + } + + @Test + public void toStrng() { + assertString( ContentReader.read( "1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .toTsvString() ) + .isEqualTo( "1\t2\t3\n1\t2\t3\n" ); + } + + /** Repeated {@code withHeaders()} calls must still report the first line as the headers. */ @Test + public void withHeaders() { + assertThat( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .withHeaders() + .withHeaders() +
.withHeaders() + .withHeaders() + .headers() ) + .containsExactly( "a", "b", "c" ); + assertTsv( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .withHeaders() + .toTsv() ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "b", "c" ), + row( "1", "2", "3" ), + row( "1", "2", "3" ) ); + } + + @Test + public void withHeadersEmpty() { + assertTsv( ContentReader.read( "", Tsv.tsv.ofSeparatedValues() ).withHeaders().toTsv() ) + .isEqualToTsv( "" ); + } + + @Test + public void select() { + assertTsv( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .withHeaders() + .select( 0, 2 ) + .toTsv() ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "c" ), + row( "1", "3" ), + row( "1", "3" ) ); + } + + @Test + public void toStream() { + assertThat( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .withHeaders() + .select( 0, 2 ) + .toStream() + .skip( 1 ) + .toList() ) + .containsExactly( + List.of( "1", "3" ), + List.of( "1", "3" ) ); + } + + /** Selecting by column name gives the same result with and without a prior {@code withHeaders()} call. */ @Test + public void selectByHeaders() { + assertTsv( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .select( "a", "c" ) + .toTsv() ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "c" ), + row( "1", "3" ), + row( "1", "3" ) ); + assertTsv( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .withHeaders() + .select( "a", "c" ) + .toTsv() ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "c" ), + row( "1", "3" ), + row( "1", "3" ) ); + } + + @Test + public void stripHeaders() { + assertTsv( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .stripHeaders() + .toTsv() ) + .containsExactlyInAnyOrderEntriesOf( + List.of( "1", "2", "3" ), + List.of( "1", "2", "3" ) ); + } + + /** Blank lines are removed by the size filter before the header line is taken. */ @Test + public void filter() { + assertTsv( ContentReader.read( "\n\na\tb\tc\n\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .filter( line -> line.size() ==
3 ) + .withHeaders() + .select( 0, 2 ) + .toTsv() ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "c" ), + row( "1", "3" ), + row( "1", "3" ) ); + } + + @Test + public void toTsvOutputStream() { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .collect( TsvStream.Collectors.toTsvOutputStream( bytes ) ); + assertString( bytes.toString() ).isEqualTo( "a\tb\tc\n1\t2\t3\n1\t2\t3\n" ); + } + + @Test + public void mapToObj() { + assertThat( ContentReader.read( "a\tb\tc\n1\t2\t3\n1\t2\t3", Tsv.tsv.ofSeparatedValues() ) + .withHeaders() + .mapToObj( l -> Integer.parseInt( l.get( 0 ) ) ) ) + .containsExactly( 1, 1 ); + } + +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/mapper/MapperTest.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/mapper/MapperTest.java new file mode 100644 index 0000000000..ce4950ec1a --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/mapper/MapperTest.java @@ -0,0 +1,64 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv.mapper; + +import oap.io.content.ContentReader; +import oap.tsv.Tsv; +import org.testng.annotations.Test; + +import static oap.io.content.ContentReader.ofJson; +import static oap.testng.Asserts.contentOfTestResource; +import static org.assertj.core.api.Assertions.assertThat; + +/** Covers loading a mapper {@code Configuration} from the {@code config.json} test resource and mapping TSV rows to {@code Bean} records through {@code Mapper.of}. */ public class MapperTest { + @Test + public void readConfiguration() { + assertThat( contentOfTestResource( getClass(), "config.json", ofJson( Configuration.class ) ) ) + .isEqualTo( new Configuration( + new Configuration.Column( 0, "a" ), + new Configuration.Column( 1, "b" ) ) + .withColumnsNumber( 3 ) + .withValidateInput( true ) ); + } + + @Test + public void mapToObj() { + assertThat( ContentReader.read( "a\tb\tc\n1\t2\t3\n4\t5\t6", Tsv.tsv.ofSeparatedValues() ) + .withHeaders() + .mapToObj( Mapper.of( Bean.class, + contentOfTestResource( getClass(), "config.json", ofJson( Configuration.class ) ) ) ) ) + .containsExactly( new Bean( 1, 2 ), new Bean( 4, 5 ) ); + } + + /** The input contains a short row (a single value followed by a tab); it is expected to be absent from the mapped result once the configuration has been applied to the stream. */ @Test + public void mapToObjValidations() { + Configuration config = contentOfTestResource( getClass(), "config.json", ofJson( Configuration.class ) ); + assertThat( config.configure( ContentReader.read( "a\tb\tc\n1\t\n1\t2\t3\n4\t5\t6", Tsv.tsv.ofSeparatedValues() ) ) + .mapToObj( Mapper.of( Bean.class, config ) ) ) + .containsExactly( new Bean( 1, 2 ), new Bean( 4, 5 ) ); + } + + record Bean( int a, int b ) {} +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/test/TsvAssertionTest.java b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/test/TsvAssertionTest.java new file mode 100644 index 0000000000..b8c8ce52b8 --- /dev/null +++
b/oap-formats/oap-tsv/oap-tsv-test/src/test/java/oap/tsv/test/TsvAssertionTest.java @@ -0,0 +1,135 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +package oap.tsv.test; + +import oap.io.content.ContentReader; +import oap.tsv.Tsv; +import org.testng.annotations.Test; + +import static oap.tsv.test.TsvAssertion.assertTsv; +import static oap.tsv.test.TsvAssertion.header; +import static oap.tsv.test.TsvAssertion.row; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Self-tests for {@link TsvAssertion}. Negative cases wrap the failing assertion in {@code assertThatThrownBy} and verify the thrown type outside the lambda, so the type check applies to the caught throwable. */ public class TsvAssertionTest { + + @Test + public void containsHeaders() { + assertTsv( "a\tb\tc\n1\t2\t3" ).hasHeaders( "a" ); + + assertThatThrownBy( () -> + assertTsv( "a\tb\tc\n1\t2\t3" ).hasHeaders( "unknown" ) ) + .isInstanceOf( AssertionError.class ); + } + + @Test + public void containsExactlyInAnyOrderEntriesOf() { + String tsv = """ + a\tb\tc + 11\t12\t13 + 21\t22\t23 + """; + assertTsv( tsv ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "b" ), + row( "21", "22" ), + row( "11", "12" ) + ); + + assertThatThrownBy( () -> + assertTsv( tsv ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "b" ), + row( "21", "22" ) + ) ) + .isInstanceOf( AssertionError.class ); + + assertThatThrownBy( () -> + assertTsv( tsv ) + .containsExactlyInAnyOrderEntriesOf( + header( "a", "b" ), + row( "11", "22" ) ) ) + .isInstanceOf( AssertionError.class ); + } + + @Test + public void containsAnyEntriesOf() { + String tsv = """ + a\tb\tc + 11\t12\t13 + 21\t22\t23 + """; + assertTsv( tsv ) + .containsAnyEntriesOf( + header( "a", "b" ), + row( "21", "22" ), + row( "11", "12" ) + ); + + assertTsv( tsv ) + .containsAnyEntriesOf( + header( "a", "b" ), + row( "21", "22" ) + ); + + assertThatThrownBy( () -> + assertTsv( tsv ) + .containsAnyEntriesOf( + header( "a", "b" ), + row( "11", "22" ) ) ) + .isInstanceOf( AssertionError.class ); + } + + @Test + public void doesNotContainAnyEntriesOf() { + String tsv = """ + a\tb\tc + 11\t12\t13 + 21\t22\t23 + """; + assertTsv( tsv ) + .doesNotContainAnyEntriesOf( row( "11", "12", "14" ) ); + + assertThatThrownBy( () -> + assertTsv( tsv ) + .doesNotContainAnyEntriesOf( row( "11", "12", "13" ) ) )
+ .isInstanceOf( AssertionError.class ); + } + + /** {@code isEqualToTsv} ignores row order: the second expectation lists the same rows reversed. */ @Test + public void isEqualTo() { + String tsv = """ + a\tb\tc + 1\t2\t3 + 3\t2\t1 + """; + assertTsv( tsv ).isEqualTo( ContentReader.read( tsv, Tsv.tsv.ofSeparatedValues() ).withHeaders().toTsv() ); + assertTsv( tsv ).isEqualToTsv( """ + a\tb\tc + 3\t2\t1 + 1\t2\t3 + """ ); + } +} diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/logback-test.xml b/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/logback-test.xml new file mode 100644 index 0000000000..63c9d0e094 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/logback-test.xml @@ -0,0 +1,39 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + + diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/oap/tsv/TsvTest/1.tsv.zip b/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/oap/tsv/TsvTest/1.tsv.zip new file mode 100644 index 0000000000..56f8fedf80 Binary files /dev/null and b/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/oap/tsv/TsvTest/1.tsv.zip differ diff --git a/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/oap/tsv/mapper/MapperTest/config.json b/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/oap/tsv/mapper/MapperTest/config.json new file mode 100644 index 0000000000..0ccda0737f --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv-test/src/test/resources/oap/tsv/mapper/MapperTest/config.json @@ -0,0 +1,14 @@ +{ + "columnsNumber": 3, + "validateInput": true, + "columns": [ + { + "index": 0, + "name": "a" + }, + { + "index": 1, + "name": "b" + } + ] +} diff --git a/oap-formats/oap-tsv/oap-tsv/pom.xml b/oap-formats/oap-tsv/oap-tsv/pom.xml new file mode 100644 index 0000000000..6500cc9683 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/pom.xml @@ -0,0 +1,53 @@ + + + + + 4.0.0 + oap-tsv + oap-tsv + + + oap + oap-tsv-parent + ${oap.project.version} + + + + + oap + oap-stdlib + ${oap.project.version} + + + + org.projectlombok + lombok + ${oap.deps.lombok.version}
+ provided + + + diff --git a/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/Printer.java b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/Printer.java new file mode 100644 index 0000000000..c3c5c525e9 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/Printer.java @@ -0,0 +1,70 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package oap.tsv; + +import com.google.common.base.Strings; +import oap.util.Stream; + +import java.util.List; +import java.util.stream.Collectors; + +public class Printer { + + public static String print( Stream> stream, char delimiter ) { + return stream.map( l -> print( l, delimiter ) ).collect( Collectors.joining() ); + } + + public static String print( List list, char delimiter ) { + return print( list, delimiter, false ); + } + + public static String print( List list, char delimiter, boolean quoted ) { + return Stream.of( list ) + .map( e -> escape( String.valueOf( e ), quoted ) ) + .collect( Collectors.joining( String.valueOf( delimiter ) ) ) + "\n"; + } + + public static String escape( String text, boolean quoted ) { + if( Strings.isNullOrEmpty( text ) ) return ""; + + var sb = new StringBuilder(); + if( quoted ) sb.append( '"' ); + for( var i = 0; i < text.length(); i++ ) { + char c = text.charAt( i ); + sb.append( switch( c ) { + case '\n' -> "\\n"; + case '\r' -> "\\r"; + case '\t' -> "\\t"; + case '\\' -> "\\\\"; + case '"' -> quoted ? 
"\"\"" : "\""; + default -> String.valueOf( c ); + } ); + } + if( quoted ) sb.append( '"' ); + + return sb.toString(); + } + +} diff --git a/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/Tokenizer.java b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/Tokenizer.java new file mode 100644 index 0000000000..c34857ecc5 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/Tokenizer.java @@ -0,0 +1,61 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
/**
 * Splits a single line of delimiter-separated text into tokens.
 */
public class Tokenizer {
    /**
     * Splits {@code line} on {@code delimiter} with no token limit and no quote handling.
     *
     * @param line      line to split
     * @param delimiter separator character
     * @return the tokens, in order
     */
    public static List<String> parse( String line, char delimiter ) {
        return parse( line, delimiter, Integer.MIN_VALUE, false );
    }

    /**
     * Splits {@code line} on {@code delimiter} with no token limit.
     *
     * @param line      line to split
     * @param delimiter separator character
     * @param quoted    whether double quotes protect delimiters and wrap tokens
     * @return the tokens, in order
     */
    public static List<String> parse( String line, char delimiter, boolean quoted ) {
        return parse( line, delimiter, Integer.MIN_VALUE, quoted );
    }

    /**
     * Splits {@code line} on {@code delimiter}.
     * <p>
     * When {@code quoted} is set, a delimiter between an opening and a closing double quote does not
     * split, and a token that starts and ends with a double quote is unwrapped with doubled quotes
     * collapsed to one.
     *
     * @param line      line to split
     * @param delimiter separator character
     * @param limit     parsing stops as soon as this many tokens have been collected; a value that
     *                  the token count never reaches (e.g. negative) means no limit
     * @param quoted    whether double quotes protect delimiters and wrap tokens
     * @return the tokens, in order
     */
    public static List<String> parse( String line, char delimiter, int limit, boolean quoted ) {
        List<String> tokens = new LinkedList<>();
        int beginIndex = 0;
        boolean inQuote = false;
        for( int i = 0; i < line.length(); i++ ) {
            char c = line.charAt( i );
            if( c == delimiter && !inQuote ) {
                tokens.add( token( line, beginIndex, i, quoted ) );
                beginIndex = i + 1;
            }
            if( quoted && c == '"' ) inQuote = !inQuote;

            if( tokens.size() == limit ) return tokens;
        }
        // the remainder after the last delimiter; may be empty (empty line or trailing delimiter)
        tokens.add( token( line, beginIndex, line.length(), quoted ) );

        return tokens;
    }

    /**
     * Extracts the token {@code line[begin, end)}, unwrapping it when quoting applies.
     * The length check keeps empty and single-character tokens from being indexed out of range.
     */
    private static String token( String line, int begin, int end, boolean quoted ) {
        boolean wrapped = quoted
            && end - begin >= 2
            && line.charAt( begin ) == '"'
            && line.charAt( end - 1 ) == '"';

        return wrapped
            ? line.substring( begin + 1, end - 1 ).replace( "\"\"", "\"" )
            : line.substring( begin, end );
    }
}
to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv; + +import lombok.EqualsAndHashCode; +import oap.io.IoStreams; +import oap.io.Resources; +import oap.io.content.ContentReader; +import oap.util.Stream; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.List; +import java.util.Optional; +import java.util.function.Consumer; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static oap.io.IoStreams.Encoding.PLAIN; +import static oap.io.IoStreams.lines; + +@EqualsAndHashCode +public class Tsv { + + public static final char DELIMITER_SEMICOLON = ';'; + public static final char DELIMITER_TAB = '\t'; + public static final char DELIMITER_COMMA = ','; + + public static final AbstractParser tsv = new AbstractParser() { + @Override + public List parse( String line ) { + return Tokenizer.parse( line, DELIMITER_TAB, Integer.MAX_VALUE, false ); + } + }; + + public static final AbstractParser csv = new AbstractParser() { + @Override + public List parse( String line ) { + return Tokenizer.parse( line, DELIMITER_COMMA, 
Integer.MAX_VALUE, true ); + } + }; + + public final List headers; + public final List> data; + + public Tsv( List headers, List> data ) { + this.headers = headers; + this.data = data; + } + + public TsvStream stream() { + return TsvStream.of( headers, Stream.of( data ) ); + } + + @Override + public String toString() { + return stream().toTsvString(); + } + + public abstract static class AbstractParser { + @Deprecated + public TsvStream from( byte[] bytes ) { + return from( new ByteArrayInputStream( bytes ) ); + } + + @Deprecated + public TsvStream from( byte[] bytes, int offset, int length ) { + return from( new ByteArrayInputStream( bytes, offset, length ) ); + } + + @Deprecated + public TsvStream from( InputStream inputStream ) { + return from( inputStream, UTF_8 ); + } + + public TsvStream from( InputStream inputStream, Charset charset ) { + return from( new InputStreamReader( inputStream, charset ) ); + } + + public TsvStream from( Reader reader ) { + return fromStream( Stream.of( new BufferedReader( reader ).lines() ) ); + } + + @Deprecated + public TsvStream fromString( String tsv ) { + return from( new StringReader( tsv ) ); + } + + public TsvStream fromStream( Stream stream ) { + return TsvStream.of( stream.map( this::parse ) ); + } + + @Deprecated + public Optional fromResource( Class contextClass, String name ) { + return Resources.url( contextClass, name ).map( this::fromUrl ); + } + + @Deprecated + public TsvStream fromPath( Path path ) { + return fromStream( lines( path ) ); + } + + @Deprecated + public TsvStream fromUrl( URL url ) { + return fromUrl( url, PLAIN, p -> {} ); + } + + public TsvStream fromUrl( URL url, IoStreams.Encoding encoding, + Consumer progressCallback ) { + return fromStream( lines( url, encoding, progressCallback ) ); + } + + public ContentReader ofSeparatedValues() { + return new ContentReader<>() { + @Override + public TsvStream read( InputStream is ) { + return from( is ); + } + }; + } + + public abstract List parse( String 
line ); + } + +} diff --git a/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvArray.java b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvArray.java new file mode 100644 index 0000000000..640311d107 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvArray.java @@ -0,0 +1,103 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package oap.tsv; + +import org.apache.commons.lang3.StringUtils; +import org.joda.time.DateTime; +import org.joda.time.format.DateTimeFormatter; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringJoiner; + +public class TsvArray { + public static String print( List list, DateTimeFormatter dateTimeFormatter ) { + StringJoiner sj = new StringJoiner( ",", "[", "]" ); + + for( var item : list ) { + if( item instanceof String items ) { + sj.add( "'" + escape( items ) + "'" ); + } else if( item instanceof DateTime itemd ) { + sj.add( "'" + dateTimeFormatter.print( itemd ) + "'" ); + } else { + sj.add( String.valueOf( item ) ); + } + } + + return sj.toString(); + } + + private static String escape( String item ) { + return StringUtils.replace( item, "'", "\\'" ); + } + + public static List parse( String item ) { + var array = item.substring( 1, item.length() - 1 ); + List ret = new ArrayList<>(); + + if( array.isEmpty() ) return ret; + + StringBuilder sb = new StringBuilder(); + + boolean strBegin = false; + boolean escape = false; + + for( var i = 0; i < array.length(); i++ ) { + var ch = array.charAt( i ); + switch( ch ) { + case '\'': + if( strBegin && escape ) { + sb.append( '\'' ); + escape = false; + } + strBegin = !strBegin; + break; + case ',': + ret.add( sb.toString() ); + sb.delete( 0, sb.length() ); + break; + case '\\': + if( escape ) { + sb.append( "\\" ); + escape = false; + } else { + escape = true; + } + break; + default: + if( escape ) { + escape = false; + sb.append( "\\" ).append( ch ); + } else { + sb.append( ch ); + } + } + } + + ret.add( sb.toString() ); + + return ret; + } +} diff --git a/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvInputStream.java b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvInputStream.java new file mode 100644 index 0000000000..d792935d83 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvInputStream.java @@ -0,0 +1,102 @@ +package oap.tsv; + +import 
it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import lombok.ToString; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Objects; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class TsvInputStream extends FastBufferedInputStream { + private static final char TAB = '\t'; + private static final char ESCAPE = '\\'; + public final Line line; + + public TsvInputStream( InputStream is, byte[] bytes ) { + super( is ); + + line = new Line( bytes ); + } + + public static void split( byte[] line, int len, IntArrayList list ) { + int i = 0; + boolean escape = false; + while( i < len ) { + var ch = line[i]; + switch( ch ) { + case ESCAPE -> escape = !escape; + case TAB -> { + if( !escape ) list.add( i + 1 ); + escape = false; + } + default -> escape = false; + } + i++; + } + list.add( i + 1 ); + } + + public static void split( String line, List list ) { + Objects.requireNonNull( line ); + + var len = line.length(); + + int start = 0, i = 0; + boolean escape = false; + while( i < len ) { + var ch = line.charAt( i ); + switch( ch ) { + case ESCAPE -> escape = !escape; + case TAB -> { + if( !escape ) list.add( line.substring( start, i ) ); + start = i + 1; + escape = false; + } + default -> escape = false; + } + i++; + } + list.add( line.substring( start, i ) ); + } + + public boolean readCells() throws IOException { + line.cells.clear(); + var buffer = line.buffer; + var len = readLine( buffer ); + + line.len = len; + + if( len <= 0 ) return false; + + split( buffer, len, line.cells ); + + return true; + } + + @ToString + public static class Line { + public final byte[] buffer; + public final IntArrayList cells = new IntArrayList(); + public int len = 0; + + public Line( byte[] buffer ) { + this.buffer = buffer; + } + + public int indexOf( String value ) { + for( int i = 0; i < cells.size(); i++ ) { + var offset = i == 0 ? 
0 : cells.getInt( i - 1 ); + var length = cells.getInt( i ) - offset - 1; + var str = new String( buffer, offset, length, UTF_8 ); + + if( value.equals( str ) ) return i; + } + + return -1; + } + } +} diff --git a/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvStream.java b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvStream.java new file mode 100644 index 0000000000..43313c0b66 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/TsvStream.java @@ -0,0 +1,197 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package oap.tsv; + +import lombok.EqualsAndHashCode; +import lombok.ToString; +import oap.util.Arrays; +import oap.util.IndexTranslatingList; +import oap.util.Lists; +import oap.util.Stream; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.UncheckedIOException; +import java.util.Iterator; +import java.util.List; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collector; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static oap.tsv.Printer.print; +import static oap.tsv.Tsv.DELIMITER_COMMA; +import static oap.tsv.Tsv.DELIMITER_TAB; + +public class TsvStream { + + private final List headers; + private final Stream> data; + + protected TsvStream( Stream> data ) { + this( List.of(), data ); + } + + + protected TsvStream( List headers, Stream> data ) { + this.headers = headers; + this.data = data; + } + + public static TsvStream of( Stream> data ) { + return new TsvStream( data ); + } + + public static TsvStream of( List headers, Stream> data ) { + return new TsvStream( headers, data ); + } + + public TsvStream withHeaders() { + if( !headers.isEmpty() ) return this; + Iterator> iterator = data.iterator(); + if( iterator.hasNext() ) return new TsvStream( iterator.next(), Stream.of( iterator ) ); + return of( Stream.of( iterator ) ); + } + + public List headers() { + return headers; + } + + public TsvStream select( int... columns ) { + return new TsvStream( + this.headers.isEmpty() ? this.headers : new IndexTranslatingList<>( this.headers, columns ), + data.map( line -> new IndexTranslatingList<>( line, columns ) ) ); + } + + public TsvStream select( String... 
headers ) { + TsvStream tsv = withHeaders(); + return tsv.select( Lists.indices( tsv.headers, headers ) ); + } + + public TsvStream select( List headers ) { + return select( Arrays.of( String.class, headers ) ); + } + + public TsvStream select( Header header ) { + return select( header.cols ); + } + + public TsvStream stripHeaders() { + return headers.isEmpty() ? new TsvStream( List.of(), withHeaders().data ) + : new TsvStream( List.of(), this.data ); + } + + public TsvStream filter( Predicate> filter ) { + return new TsvStream( this.headers, data.filter( filter ) ); + } + + public Stream mapToObj( Function, ? extends E> mapper ) { + return data.map( mapper ); + } + + public Stream> toStream() { + return headers.isEmpty() ? data : Stream.of( List.of( headers ) ).concat( data ); + } + + public List> toList() { + return collect( java.util.stream.Collectors.toList() ); + } + + public String toTsvString() { + return collect( Collectors.toTsvString() ); + } + + public String toCsvString() { + return collect( Collectors.toCsvString() ); + } + + public String toCsvString( boolean quoted ) { + return collect( Collectors.toCsvString( quoted ) ); + } + + public Tsv toTsv() { + return new Tsv( headers, data.toList() ); + } + + public R collect( Collector, A, R> collector ) { + var container = collector.supplier().get(); + if( !headers.isEmpty() ) collector.accumulator().accept( container, headers ); + return data.collect( Collector.of( () -> container, + collector.accumulator(), + collector.combiner(), + collector.finisher(), + collector.characteristics().toArray( new Collector.Characteristics[0] ) ) ); + } + + + public static class Collectors { + public static Collector, ?, OutputStream> toTsvOutputStream( OutputStream os ) { + return Collector.of( + () -> os, + ( out, line ) -> { + try { + out.write( print( line, DELIMITER_TAB ).getBytes( UTF_8 ) ); + } catch( IOException e ) { + throw new UncheckedIOException( e ); + } + }, + ( out, outIgnored ) -> out ); + } + + public 
static Collector, ?, String> toCsvString() { + return toCsvString( true ); + } + + public static Collector, ?, String> toCsvString( boolean quoted ) { + return toXsv( line -> print( line, DELIMITER_COMMA, quoted ) ); + } + + public static Collector, ?, String> toTsvString() { + return toXsv( line -> print( line, DELIMITER_TAB ) ); + } + + private static Collector, ?, String> toXsv( Function, String> joiner ) { + return Collector.of( + StringBuilder::new, + ( sb, line ) -> sb.append( joiner.apply( line ) ), + StringBuilder::append, + StringBuilder::toString ); + } + } + + @ToString + @EqualsAndHashCode + public static class Header { + public final List cols; + + public Header( String... cols ) { + this.cols = List.of( cols ); + } + + public int size() { + return cols.size(); + } + } +} diff --git a/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/mapper/Configuration.java b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/mapper/Configuration.java new file mode 100644 index 0000000000..d2ebf8df45 --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/mapper/Configuration.java @@ -0,0 +1,83 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv.mapper; + +import lombok.EqualsAndHashCode; +import lombok.ToString; +import lombok.extern.slf4j.Slf4j; +import oap.tsv.TsvStream; + +import java.util.ArrayList; +import java.util.List; + +@EqualsAndHashCode +@ToString +@Slf4j +public class Configuration { + public List columns = new ArrayList<>(); + public boolean hasHeaders = true; + public boolean skipErrors = true; + public int columnsNumber = 0; + public boolean validateInput = false; + + public Configuration() { + } + + public Configuration( List columns ) { + this.columns = columns; + } + + public Configuration( Column... 
columns ) { + this( List.of( columns ) ); + } + + public Configuration withColumnsNumber( int columnsNumber ) { + this.columnsNumber = columnsNumber; + return this; + } + + public Configuration withValidateInput( boolean validateInput ) { + this.validateInput = validateInput; + return this; + } + + public TsvStream configure( TsvStream stream ) { + var result = stream; + if( hasHeaders ) result = result.withHeaders(); + if( validateInput ) { + result = result.filter( line -> { + if( line.size() != columnsNumber ) { + log.error( "erroneous line in configuration '{}'", line ); + if( skipErrors ) return false; + throw new IllegalArgumentException( "erroneous line " + line ); + } + return true; + } ); + } + return result; + } + + public record Column( int index, String name ) {} +} diff --git a/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/mapper/Mapper.java b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/mapper/Mapper.java new file mode 100644 index 0000000000..eadee26efc --- /dev/null +++ b/oap-formats/oap-tsv/oap-tsv/src/main/java/oap/tsv/mapper/Mapper.java @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) Open Application Platform Authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package oap.tsv.mapper; + +import oap.reflect.Reflect; +import oap.util.Stream; + +import java.util.List; +import java.util.function.Function; + +import static oap.util.Pair.__; + +public class Mapper implements Function, E> { + private final Class clazz; + private final Configuration config; + + public Mapper( Class clazz, Configuration config ) { + this.clazz = clazz; + this.config = config; + } + + public static Mapper of( Class clazz, Configuration config ) { + return new Mapper<>( clazz, config ); + } + + @Override + public E apply( List line ) { + return Reflect.reflect( clazz ).newInstance( Stream.of( config.columns ) + .mapToPairs( f -> __( f.name(), line.get( f.index() ) ) ) + .toMap() ); + } +} diff --git a/oap-formats/oap-tsv/pom.xml b/oap-formats/oap-tsv/pom.xml new file mode 100644 index 0000000000..b5a4e3fffb --- /dev/null +++ b/oap-formats/oap-tsv/pom.xml @@ -0,0 +1,18 @@ + + + 4.0.0 + + + oap + oap-formats-parent + ${oap.project.version} + + + pom + oap-tsv-parent + + + oap-tsv + oap-tsv-test + + \ No newline at end of file diff --git a/oap-formats/pom.xml b/oap-formats/pom.xml new file mode 100644 index 0000000000..a4ecea627a --- /dev/null +++ b/oap-formats/pom.xml @@ -0,0 +1,17 @@ + + + 4.0.0 + + + oap + oap + ${oap.project.version} + + + pom + oap-formats-parent + + + oap-tsv + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index e1831d4435..70c516eb91 100644 --- a/pom.xml +++ b/pom.xml @@ -21,6 +21,7 @@ oap-jpath oap-pnio oap-hadoop + 
oap-formats oap-maven-plugin @@ -45,7 +46,7 @@ - 21.3.2 + 21.3.3 21.0.0