Skip to content

Commit

Permalink
Add ability to reimport training data classifications
Browse files Browse the repository at this point in the history
  • Loading branch information
kraih committed Jan 12, 2024
1 parent 8d0c282 commit 7ce6dec
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 8 deletions.
36 changes: 32 additions & 4 deletions lib/Cavil/Command/learn.pm
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,41 @@ has description => 'Training data for machine learning';
has usage => sub ($self) { $self->extract_usage };

sub run ($self, @args) {
getopt \@args, 'e|export=s' => \my $export;
die 'Export directory is required' unless defined $export;
getopt \@args,
'i|input=s' => \my $input,
'o|output=s' => \my $output;
die 'Input or output directory is required' unless (defined $output || defined $input);

my $app = $self->app;
my $db = $app->pg->db;

my $root = path($export);
return _output($db, $output) if $output;
return _input($db, $input);
}

# Apply a classification from a training data file to the matching snippet.
#
# $db      - database handle; must provide query(), whose result must provide rows()
# $name    - file name, expected to look like "<hash>.txt"
# $license - value stored in the "license" column of the matching snippet
#
# Returns 0 if $name is not of the form "<hash>.txt"; otherwise returns what rows() reports for the
# UPDATE (the caller sums these values as the number of imported classifications). Snippets that are
# already approved are never modified, because the UPDATE is restricted to "approved = false".
sub _classify ($db, $name, $license) {

  # The dot must be escaped and the match anchored with \A/\z: an unescaped "." accepts any character
  # before "txt" (e.g. "abc123_txt"), and "$" tolerates a trailing newline, both of which would cause
  # an unrelated file name to be treated as the snippet hash "abc123".
  return 0 unless $name =~ /\A(\w+)\.txt\z/;
  my $hash = $1;

  return $db->query(
    'UPDATE snippets SET license = ?, classified = true, approved = true WHERE hash = ? AND approved = false',
    $license, $hash)->rows;
}

# Import snippet classifications from a training data directory.
#
# $db    - database handle, passed through to _classify
# $input - directory expected to contain "good" and "bad" subdirectories
#
# Every file in "good" is handed to _classify with license 1, every file in "bad" with license 0.
# If either subdirectory is missing, nothing is imported and nothing is printed. Otherwise the sum
# of all _classify results is printed as the number of imported classifications.
sub _input ($db, $input) {
  my $root = path($input);
  my ($good, $bad) = map { $root->child($_) } qw(good bad);

  return unless -d $good && -d $bad;

  # Directory/license pairs, processed in order: "good" first, then "bad"
  my $count = 0;
  for my $set ([$good, 1], [$bad, 0]) {
    my ($dir, $license) = @$set;
    $count += _classify($db, $_->basename, $license) for $dir->list->each;
  }

  say "Imported $count snippet classifications";
}

sub _output ($db, $output) {
my $root = path($output);
my $good = $root->child('good')->make_path;
my $bad = $root->child('bad')->make_path;

Expand Down Expand Up @@ -72,7 +100,7 @@ Cavil::Command::learn - Cavil learn command
script/cavil learn -o ./input
Options:
-e, --export <dir> Export snippets for training machine learning models
-i, --input <dir> Import snippet classifications from training data
-o, --output <dir> Export snippets for training machine learning models
-h, --help Show this summary of available options
=cut
47 changes: 43 additions & 4 deletions t/command_learn.t
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ subtest 'Empty database' => sub {
{
open my $handle, '>', \$buffer;
local *STDOUT = $handle;
$app->start('learn', '-e', "$dir");
$app->start('learn', '-o', "$dir");
}
like $buffer, qr/Exported 0 snippets/, 'no snippets';
ok -e $dir->child('good'), 'directory exists';
Expand All @@ -54,14 +54,14 @@ subtest 'Snippets added' => sub {
$db->query('UPDATE snippets SET license = false, approved = true WHERE id = 1');
$db->query('UPDATE snippets SET license = true, approved = true WHERE id = 2');
$db->query('UPDATE snippets SET license = true, approved = false WHERE id = 3');
my $dir = $tmp->child('two');

subtest 'Export snippets' => sub {
my $dir = $tmp->child('two');
subtest 'Output snippets' => sub {
my $buffer = '';
{
open my $handle, '>', \$buffer;
local *STDOUT = $handle;
$app->start('learn', '-e', "$dir");
$app->start('learn', '-o', "$dir");
}
like $buffer, qr/Exporting snippet 1/, 'first snippet';
like $buffer, qr/Exporting snippet 2/, 'second snippet';
Expand All @@ -74,6 +74,45 @@ subtest 'Snippets added' => sub {
is $good->size, 1, 'one file';
like $good->first->slurp, qr/Copyright Holder/, 'right content';
};

$db->query('UPDATE snippets SET license = true, approved = false WHERE id = 1');
$db->query('UPDATE snippets SET license = false, approved = false WHERE id = 2');
$dir->child('good', 'doesnotexist.txt')->spew('Whatever');
$dir->child('bad', 'doesnotexist.txt')->spew('Whatever');

subtest 'Input snippets' => sub {
my $buffer = '';
{
open my $handle, '>', \$buffer;
local *STDOUT = $handle;
$app->start('learn', '-i', "$dir");
}
like $buffer, qr/Imported 2 snippet classifications/, 'two snippets imported';

my $first = $db->select('snippets', '*', {id => 1})->hash;
is $first->{license}, 0, 'is not a license';
is $first->{classified}, 1, 'is classified';
is $first->{approved}, 1, 'is approved';

my $second = $db->select('snippets', '*', {id => 2})->hash;
is $second->{license}, 1, 'is license';
is $second->{approved}, 1, 'is approved';
is $second->{classified}, 1, 'is classified';

my $third = $db->select('snippets', '*', {id => 3})->hash;
is $third->{approved}, 0, 'not approved';
is $third->{classified}, 0, 'not classified';
};

subtest 'Input snippets (repeat does nothing)' => sub {
my $buffer = '';
{
open my $handle, '>', \$buffer;
local *STDOUT = $handle;
$app->start('learn', '-i', "$dir");
}
like $buffer, qr/Imported 0 snippet classifications/, 'no snippets imported';
};
};

done_testing();

0 comments on commit 7ce6dec

Please sign in to comment.