Skip to content

Commit

Permalink
oap-tsv
Browse files Browse the repository at this point in the history
  • Loading branch information
galaxina committed Jan 26, 2024
1 parent 90edd01 commit 7bcd4a7
Show file tree
Hide file tree
Showing 26 changed files with 1,926 additions and 1 deletion.
12 changes: 12 additions & 0 deletions oap-formats/oap-tsv/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# oap-tsv

TSV is a Tab Separated Value format. Unlike Comma Separated Value (CSV), it uses ONLY the tab character to distinguish columns.
CSV may be delimited by comma, tab, semicolon, pipe, etc.
CSV also has some rules for separating columns whose data contains special characters (a.k.a. escaping).
TSV always wraps data in quotes if the separator is a comma.
Like [1..3] becomes '"1","2","3"' (with comma)
and [1..3] becomes '1 2 3' (with tabs)

Strict rules also give TSV the ability to be a little bit faster than CSV.

See https://github.com/eBay/tsv-utils/blob/master/docs/comparing-tsv-and-csv.md
59 changes: 59 additions & 0 deletions oap-formats/oap-tsv/oap-tsv-test/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ The MIT License (MIT)
~
~ Copyright (c) Open Application Platform Authors
~
~ Permission is hereby granted, free of charge, to any person obtaining a copy
~ of this software and associated documentation files (the "Software"), to deal
~ in the Software without restriction, including without limitation the rights
~ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
~ copies of the Software, and to permit persons to whom the Software is
~ furnished to do so, subject to the following conditions:
~
~ The above copyright notice and this permission notice shall be included in all
~ copies or substantial portions of the Software.
~
~ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
~ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
~ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
~ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
~ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
~ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
~ SOFTWARE.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor of the oap-tsv-test module. -->
<modelVersion>4.0.0</modelVersion>
<name>oap-tsv-test</name>
<artifactId>oap-tsv-test</artifactId>

<!-- groupId and version are not declared on this module, so they come from the oap-tsv-parent POM. -->
<parent>
<groupId>oap</groupId>
<artifactId>oap-tsv-parent</artifactId>
<version>${oap.project.version}</version>
</parent>

<dependencies>
<!-- The oap-tsv module, pinned to the same version as this module. -->
<dependency>
<groupId>oap</groupId>
<artifactId>oap-tsv</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Default (compile) scope, so the artifact is also visible to consumers of this module. -->
<dependency>
<groupId>oap</groupId>
<artifactId>oap-stdlib-test</artifactId>
<version>${oap.project.version}</version>
</dependency>

<!-- "provided" scope: Lombok is on the compile classpath only and is not passed on transitively. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${oap.deps.lombok.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
/*
* The MIT License (MIT)
*
* Copyright (c) Open Application Platform Authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package oap.tsv.test;

import lombok.EqualsAndHashCode;
import lombok.ToString;
import oap.io.Files;
import oap.io.content.ContentReader;
import oap.tsv.Tsv;
import oap.tsv.TsvStream.Header;
import oap.util.Lists;
import org.assertj.core.api.AbstractAssert;

import java.io.File;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;

import static oap.io.content.ContentReader.ofString;
import static org.assertj.core.api.Assertions.assertThat;

public class TsvAssertion extends AbstractAssert<TsvAssertion, Tsv> {
protected TsvAssertion( String value ) {
this( value, true );
}

protected TsvAssertion( String value, boolean withHeaders ) {
this( withHeaders ? ContentReader.read( value, Tsv.tsv.ofSeparatedValues() ).withHeaders().toTsv()
: ContentReader.read( value, Tsv.tsv.ofSeparatedValues() ).toTsv() );
}

protected TsvAssertion( Tsv value ) {
super( value, TsvAssertion.class );
}

public static TsvAssertion assertTsv( String tsv ) {
return assertTsv( tsv, true );
}

public static TsvAssertion assertTsv( String tsv, boolean withHeaders ) {
return new TsvAssertion( tsv, withHeaders );
}

public static TsvAssertion assertTsv( Tsv tsv ) {
return new TsvAssertion( tsv );
}

public static TsvAssertion assertTsv( Path path ) {
return assertTsv( path, true );
}

public static TsvAssertion assertTsv( Path path, boolean withHeaders ) {
return assertTsv( Files.read( path, ofString() ), withHeaders );
}

public static TsvAssertion assertTsv( File file ) {
return assertTsv( file, true );
}

public static TsvAssertion assertTsv( File file, boolean withHeaders ) {
return assertTsv( Files.read( file.toPath(), ofString() ), withHeaders );
}

public static TsvAssertion assertTsv( InputStream is ) {
return assertTsv( is, true );
}

public static TsvAssertion assertTsv( InputStream is, boolean withHeaders ) {
return assertTsv( ContentReader.read( is, ofString() ), withHeaders );
}

public static Row row( String... cols ) {
return new Row( cols );
}

public static Header header( String... cols ) {
return new Header( cols );
}

public TsvAssertion hasHeaders( String... headers ) {
assertThat( actual.headers ).contains( headers );
return this;
}

public TsvAssertion hasHeaders( Iterable<String> headers ) {
assertThat( actual.headers ).containsAll( headers );
return this;
}

public TsvAssertion hasHeaders( Header header ) {
assertThat( actual.headers ).containsAll( header.cols );
return this;
}

public TsvAssertion containOnlyHeaders( String... headers ) {
assertThat( actual.headers ).containsOnly( headers );
return this;
}

@SafeVarargs
public final TsvAssertion containsExactlyInAnyOrderEntriesOf( List<String>... entries ) {
assertThat( actual.data )
.containsExactlyInAnyOrderElementsOf( List.of( entries ) );
return this;
}

public TsvAssertion containsExactlyInAnyOrderEntriesOf( Header header, Row... rows ) {
hasHeaders( header );
for( var row : rows ) {
assertThat( row.cols )
.withFailMessage( "entries length doesnt match headers" )
.hasSize( header.size() );
}
assertThat( actual.stream()
.select( header )
.stripHeaders()
.toTsv()
.data )
.containsExactlyInAnyOrderElementsOf( Lists.map( rows, r -> r.cols ) );

return this;
}

public TsvAssertion containsAnyEntriesOf( Header header, Row... rows ) {
hasHeaders( header.cols );
for( var row : rows ) {
assertThat( row.cols )
.withFailMessage( "entries length doesnt match headers" )
.hasSize( header.size() );
}

assertThat( actual.stream()
.select( header )
.stripHeaders()
.toTsv()
.data )
.containsAnyElementsOf( Lists.map( rows, r -> r.cols ) );
return this;
}

public TsvAssertion containsOnlyOnceEntriesOf( Header header, Row... rows ) {
hasHeaders( header );
for( var row : rows ) {
assertThat( row.cols )
.withFailMessage( "entries length doesnt match headers" )
.hasSize( header.size() );
}
assertThat( actual.stream()
.select( header )
.stripHeaders()
.toTsv()
.data ).containsOnlyOnceElementsOf( Lists.map( rows, r -> r.cols ) );
return this;
}

public TsvAssertion doesNotContainAnyEntriesOf( Header header, Row... rows ) {
hasHeaders( header );
for( var row : rows ) {
assertThat( row.cols )
.withFailMessage( "entries length doesnt match headers" )
.hasSize( header.size() );
}

assertThat( actual.stream()
.select( header )
.stripHeaders()
.toTsv()
.data ).doesNotContainAnyElementsOf( Lists.map( rows, r -> r.cols ) );
return this;
}

public TsvAssertion doesNotContainAnyEntriesOf( Row... rows ) {
assertThat( actual.headers )
.withFailMessage( "tsv must contain headers" )
.isNotEmpty();
for( var row : rows ) {
assertThat( row.cols )
.withFailMessage( "entries length doesnt match headers" )
.hasSize( actual.headers.size() );
}
assertThat( actual.data ).doesNotContainAnyElementsOf( Lists.map( rows, r -> r.cols ) );
return this;
}

public TsvAssertion isNotEmpty() {
assertThat( actual.data ).isNotEmpty();
return this;
}

public TsvAssertion isEqualToTsv( String tsv ) {
Tsv expected = ContentReader.read( tsv, Tsv.tsv.ofSeparatedValues() ).withHeaders().toTsv();
hasHeaders( expected.headers );
assertThat( this.actual.data ).containsExactlyInAnyOrderElementsOf( expected.data );
return this;
}

public TsvAssertion isEqualToTsv( Path tsv ) {
return isEqualToTsv( Files.read( tsv, ofString() ) );
}

@ToString
@EqualsAndHashCode
public static class Row {
private final List<String> cols;

public Row( String... cols ) {
this.cols = List.of( cols );
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* The MIT License (MIT)
*
* Copyright (c) Open Application Platform Authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package oap.tsv;

import org.testng.annotations.Test;

import java.util.List;

import static oap.testng.Asserts.assertString;

public class PrinterTest {
    /**
     * Checks the text produced by {@code Printer.print} for a single row:
     * tab-delimited output, comma-delimited output with the boolean flag set, and
     * comma-delimited output for a value that itself contains a quote and a backslash.
     */
    @Test
    public void print() {
        // Tab delimiter, two-argument form: values are emitted bare and the row ends with '\n'.
        var tabDelimited = Printer.print( List.of( 1, 2, 3 ), Tsv.DELIMITER_TAB );
        assertString( tabDelimited )
            .isEqualTo( "1\t2\t3\n" );

        // Comma delimiter with the flag set: every value is expected in double quotes.
        var commaDelimited = Printer.print( List.of( 1, 2, 3 ), Tsv.DELIMITER_COMMA, true );
        assertString( commaDelimited )
            .isEqualTo( "\"1\",\"2\",\"3\"\n" );

        // Embedded quote and backslash: both are expected doubled inside the quoted value.
        var withSpecialChars = Printer.print( List.of( 1, "\"2\\\"", 3 ), Tsv.DELIMITER_COMMA, true );
        assertString( withSpecialChars )
            .isEqualTo( "\"1\",\"\"\"2\\\\\"\"\",\"3\"\n" );
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* The MIT License (MIT)
*
* Copyright (c) Open Application Platform Authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package oap.tsv;

import org.apache.commons.lang3.StringUtils;
import org.testng.annotations.Test;

import static oap.benchmark.Benchmark.benchmark;
import static oap.tsv.Tokenizer.parse;
import static oap.tsv.Tsv.DELIMITER_TAB;
import static org.assertj.core.api.Assertions.assertThat;

public class TokenizerPerformance {
    /** Sample line of 18 tab-separated fields used by both benchmarks. */
    private static final String LINE =
        "aaaa\tbbbb\txxxx\tddd\t19/11/2011\t33.3\taaaa\t11\txxx\tvvvv\tS\tS\t444\txxx\t4444\t1234\tN\tN";

    /** Second argument given to each benchmark; presumably the number of iterations (confirm against Benchmark). */
    private static final int SAMPLES = 1000000;

    /**
     * Runs the commons-lang split and {@code Tokenizer.parse} over the same line under the
     * benchmark harness, after a sanity check that the tokenizer yields 18 elements.
     */
    @Test
    public void perf() {
        // Guard: make sure the tokenizer actually splits the line before timing it.
        var tokens = parse( LINE, DELIMITER_TAB );
        assertThat( tokens ).hasSize( 18 );

        var splitBenchmark = benchmark( "split", SAMPLES,
            () -> StringUtils.splitByWholeSeparatorPreserveAllTokens( LINE, "\t" ) );
        splitBenchmark.run();

        var tokenizerBenchmark = benchmark( "tokenizer", SAMPLES,
            () -> parse( LINE, DELIMITER_TAB ) );
        tokenizerBenchmark.run();
    }
}
Loading

0 comments on commit 7bcd4a7

Please sign in to comment.