From bb1838da9a163f4d8ad3f68c18ca7c09ad395766 Mon Sep 17 00:00:00 2001 From: Sebastian Riedel Date: Tue, 30 Jan 2024 15:57:17 +0100 Subject: [PATCH] Add learning data conversion tool --- lib/Cavil/Command/learn.pm | 37 ++++++++++++++++++++++++++++--------- t/command_learn.t | 22 ++++++++++++++++++++++ 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/lib/Cavil/Command/learn.pm b/lib/Cavil/Command/learn.pm index 525a43f32d..d36976778f 100644 --- a/lib/Cavil/Command/learn.pm +++ b/lib/Cavil/Command/learn.pm @@ -24,11 +24,13 @@ has usage => sub ($self) { $self->extract_usage }; sub run ($self, @args) { getopt \@args, - 'i|input=s' => \my $input, - 'o|output=s' => \my $output, - 'p|patterns' => \my $patterns; - die 'Input or output directory is required' unless (defined $output || defined $input); + 'c|convert=s' => \my $convert, + 'i|input=s' => \my $input, + 'o|output=s' => \my $output, + 'p|patterns' => \my $patterns; + die 'Input or output directory is required' unless (defined $output || defined $input || defined $convert); + return $self->_convert($convert) if $convert; return $self->_output($output, $patterns) if $output; return $self->_input($input); } @@ -42,6 +44,20 @@ sub _classify ($db, $name, $license) { $license, $checksum)->rows; } +sub _convert ($self, $convert) { + my $patterns = $self->app->patterns; + my $dir = path($convert); + + for my $old ($dir->list->each) { + my $content = $old->slurp; + my $checksum = $patterns->checksum($content); + my $new = $old->sibling("$checksum.txt"); + $new->spew($content); + $old->remove; + say "Converted @{[$old->basename]} to @{[$new->basename]}"; + } +} + sub _input ($self, $input) { my $db = $self->app->pg->db; @@ -144,12 +160,15 @@ Cavil::Command::learn - Cavil learn command script/cavil learn -o ./ml-data script/cavil learn -p -o ./ml-data script/cavil learn -i ./ml-data + script/cavil learn -c ./text-files Options: - -i, --input Import snippet classifications from training data - -o, --output 
Export snippets for training machine learning models - -p, --patterns Convert license patterns into snippets and export - those instead - -h, --help Show this summary of available options + -c, --convert Convert a directory with arbitrary text files into + training data (this is a destructive operation) + -i, --input Import snippet classifications from training data + -o, --output Export snippets for training machine learning models + -p, --patterns Convert license patterns into snippets and export + those instead + -h, --help Show this summary of available options =cut diff --git a/t/command_learn.t b/t/command_learn.t index 15d6a2823a..1c43376479 100644 --- a/t/command_learn.t +++ b/t/command_learn.t @@ -129,6 +129,28 @@ subtest 'Snippets added' => sub { } like $buffer, qr/Imported 0 snippet classifications/, 'no snippets imported'; }; + + subtest 'Convert arbitrary text files' => sub { + my $dir = $tmp->child('convert')->make_path; + $dir->child('test.txt')->spew("Hello\nCavil\n"); + $dir->child('test2')->spew("Hello\nAgain\n"); + ok -e $dir->child('test.txt'), 'file exists'; + ok -e $dir->child('test2'), 'file exists'; + + my $buffer = ''; + { + open my $handle, '>', \$buffer; + local *STDOUT = $handle; + $app->start('learn', '-c', "$dir"); + } + like $buffer, qr/Converted test.txt to c512411bea5f292484180fb72e5ea0f9.txt/, 'first file'; + like $buffer, qr/Converted test2 to 00911bf540aebe36e7c2908760515b25.txt/, 'second file'; + + ok !-e $dir->child('test.txt'), 'file no longer exists'; + ok !-e $dir->child('test2'), 'file no longer exists'; + is $dir->child('c512411bea5f292484180fb72e5ea0f9.txt')->slurp, "Hello\nCavil\n", 'right content'; + is $dir->child('00911bf540aebe36e7c2908760515b25.txt')->slurp, "Hello\nAgain\n", 'right content'; + }; }; done_testing();