Skip to content

Commit

Permalink
Add learning data conversion tool
Browse files Browse the repository at this point in the history
  • Loading branch information
kraih committed Jan 30, 2024
1 parent a0b1c6e commit bb1838d
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 9 deletions.
37 changes: 28 additions & 9 deletions lib/Cavil/Command/learn.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ has usage => sub ($self) { $self->extract_usage };

# Entry point of the learn command: parses the command line options out of
# @args and dispatches to exactly one of _convert, _output or _input.
sub run ($self, @args) {
getopt \@args,
# NOTE(review): this listing is a diff hunk rendered without +/- markers; the
# next four lines look like the pre-change option list and check, followed by
# their replacements -- confirm against the real file
'i|input=s' => \my $input,
'o|output=s' => \my $output,
'p|patterns' => \my $patterns;
die 'Input or output directory is required' unless (defined $output || defined $input);
'c|convert=s' => \my $convert,
'i|input=s' => \my $input,
'o|output=s' => \my $output,
'p|patterns' => \my $patterns;
# At least one of the three directory options has to be given
die 'Input or output directory is required' unless (defined $output || defined $input || defined $convert);

# Dispatch order: convert first, then output, with input as the fallback. The
# dispatch tests truthiness while the check above tests definedness, so a
# false but defined value such as "0" falls through to _input
return $self->_convert($convert) if $convert;
return $self->_output($output, $patterns) if $output;
return $self->_input($input);
}
Expand All @@ -42,6 +44,20 @@ sub _classify ($db, $name, $license) {
$license, $checksum)->rows;
}

# Convert a directory of text files into training data (destructive).
#
# Every entry returned by listing $convert is read, a checksum of its content
# is computed by the application's patterns object, and the content is stored
# in a sibling file named "<checksum>.txt". The original file is then removed
# and one "Converted <old> to <new>" line is printed per converted file.
#
# Files whose name already equals "<checksum>.txt" are left untouched, which
# makes it safe to run the command repeatedly on the same directory.
sub _convert ($self, $convert) {
  my $patterns = $self->app->patterns;
  my $dir      = path($convert);

  for my $old ($dir->list->each) {
    my $content  = $old->slurp;
    my $checksum = $patterns->checksum($content);
    my $new      = $old->sibling("$checksum.txt");

    # Source and target are siblings, so equal basenames mean the very same
    # file; rewriting it and then removing "$old" would delete the data
    next if $new->basename eq $old->basename;

    $new->spew($content);
    $old->remove;
    say "Converted @{[$old->basename]} to @{[$new->basename]}";
  }
}

sub _input ($self, $input) {
my $db = $self->app->pg->db;

Expand Down Expand Up @@ -144,12 +160,15 @@ Cavil::Command::learn - Cavil learn command
script/cavil learn -o ./ml-data
script/cavil learn -p -o ./ml-data
script/cavil learn -i ./ml-data
script/cavil learn -c ./text-files
Options:
-i, --input <dir> Import snippet classifications from training data
-o, --output <dir> Export snippets for training machine learning models
-p, --patterns Convert license patterns into snippets and export
those instead
-h, --help Show this summary of available options
-c, --convert <dir> Convert a directory with arbitrary text files into
training data (this is a destructive operation)
-i, --input <dir> Import snippet classifications from training data
-o, --output <dir> Export snippets for training machine learning models
-p, --patterns Convert license patterns into snippets and export
those instead
-h, --help Show this summary of available options
=cut
22 changes: 22 additions & 0 deletions t/command_learn.t
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,28 @@ subtest 'Snippets added' => sub {
}
like $buffer, qr/Imported 0 snippet classifications/, 'no snippets imported';
};

# Run "learn -c <dir>" against a scratch directory containing two text files
# and verify that each one is replaced by a file named "<checksum>.txt" with
# unchanged content, and that the command reports every conversion on STDOUT.
subtest 'Convert arbitrary text files' => sub {
  my $dir = $tmp->child('convert')->make_path;
  $dir->child('test.txt')->spew("Hello\nCavil\n");
  $dir->child('test2')->spew("Hello\nAgain\n");
  ok -e $dir->child('test.txt'), 'first file exists';
  ok -e $dir->child('test2'),    'second file exists';

  # Capture everything the command prints to STDOUT in $buffer
  my $buffer = '';
  {
    open my $handle, '>', \$buffer or die "Can't open in-memory handle: $!";
    local *STDOUT = $handle;
    $app->start('learn', '-c', "$dir");
  }

  # File names are quoted so the dots match literally instead of any character
  like $buffer, qr/Converted \Qtest.txt\E to \Qc512411bea5f292484180fb72e5ea0f9.txt\E/, 'first file converted';
  like $buffer, qr/Converted test2 to \Q00911bf540aebe36e7c2908760515b25.txt\E/,        'second file converted';

  ok !-e $dir->child('test.txt'), 'first file no longer exists';
  ok !-e $dir->child('test2'),    'second file no longer exists';
  is $dir->child('c512411bea5f292484180fb72e5ea0f9.txt')->slurp, "Hello\nCavil\n", 'right content for first file';
  is $dir->child('00911bf540aebe36e7c2908760515b25.txt')->slurp, "Hello\nAgain\n", 'right content for second file';
};
};

done_testing();

0 comments on commit bb1838d

Please sign in to comment.