Skip to content

Commit

Permalink
Add scripts for downloading open source corpora. (#4193)
Browse files Browse the repository at this point in the history
Add scripts for downloading open source corpora.
  • Loading branch information
munificent authored Dec 4, 2024
1 parent cdee9bc commit 1b0e339
Show file tree
Hide file tree
Showing 9 changed files with 429 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/dart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ jobs:
- name: dart pub get (working/macros/example)
run: dart pub get
working-directory: working/macros/example
- name: dart pub get (tools/corpus)
run: dart pub get
working-directory: tools/corpus/scripts
- name: dart pub get (accepted/2.3/spread-collections/benchmarks)
run: dart pub get
working-directory: accepted/2.3/spread-collections/benchmarks
Expand Down
3 changes: 3 additions & 0 deletions tools/corpus/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Don't commit the downloaded files.
download/
out/
25 changes: 25 additions & 0 deletions tools/corpus/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
This directory contains a package with scripts for downloading corpora of open
source Dart code for automated analysis. There are a few scripts for
downloading from various places:

* `clone_flutter_apps.dart`: Clones GitHub repositories linked to from
[github.com/tortuvshin/open-source-flutter-apps](https://github.com/tortuvshin/open-source-flutter-apps), which is a registry of open source Flutter apps.
Downloads them to `download/apps`.

* `clone_widgets.dart`: Clones GitHub repositories referenced by
[itsallwidgets.com](https://itsallwidgets.com/), which is a collection of
open source Flutter apps and widgets. Downloads them to `download/widgets`.

* `download_packages.dart`: Downloads recent packages from
[pub.dev](https://pub.dev/). Downloads to `download/pub`.

Once a corpus is downloaded, there is another script that copies over just the
`.dart` files while discarding "uninteresting" files like generated ones:

* `copy_corpus.dart`: Copies `.dart` files from one of the download
directories. Pass `apps`, `widgets`, `pub`, etc. Can also copy sources from
the Dart SDK repo (`dart`) or Flutter repo (`flutter`). For that to work,
those repos must be in directories next to the language repo.

You can pass `--sample=<percent>` to take a random sample of a corpus. For
example, `--sample=5` will copy over only 5% of the files, chosen randomly.
39 changes: 39 additions & 0 deletions tools/corpus/scripts/bin/clone_flutter_apps.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import 'package:corpus/utils.dart';

/// Matches Markdown link targets that point directly at a GitHub repository.
///
/// Capture group 1 is the user (or organization) name and group 2 is the
/// repository name. The pattern requires a closing ")" (after an optional
/// trailing "/") so that it only finds Markdown link URIs that point at a repo
/// itself and not at paths within a repo, like the images in the header.
///
/// The dot in the host name is escaped so that it only matches a literal "."
/// instead of any character.
final _gitHubRepoPattern =
    RegExp(r'https://github\.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)/?\)');

/// Raw URL of the README.md that lists the open source Flutter apps.
const _readmeUri =
    'https://raw.githubusercontent.com/tortuvshin/open-source-flutter-apps/'
    'refs/heads/master/README.md';

/// Clones every GitHub repo linked from the README of:
///
///     https://github.com/tortuvshin/open-source-flutter-apps
///
/// The repos are cloned under `download/apps`.
void main(List<String> arguments) async {
  clean('download/apps');

  print('Getting README.md...');
  final readmeText = await httpGet(_readmeUri);

  // The README links back to its own repo; that one is not an app.
  const registryRepo = (user: 'tortuvshin', repo: 'open-source-flutter-apps');

  // Collect into a set so that each repo is only cloned once. Set literals
  // iterate in insertion order, so repos are visited in README order.
  final targets = <({String user, String repo})>{
    for (final match in _gitHubRepoPattern.allMatches(readmeText))
      (user: match[1]!, repo: match[2]!),
  }..remove(registryRepo);

  final downloader =
      Downloader(totalResources: targets.length, concurrency: 5);
  for (final target in targets) {
    downloader.cloneGitHubRepo('apps', target.user, target.repo);
  }
}
34 changes: 34 additions & 0 deletions tools/corpus/scripts/bin/clone_widgets.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import 'package:corpus/utils.dart';

/// Matches URIs that point to GitHub repos.
///
/// Capture group 1 is the user (or organization) name and group 2 is the
/// repository name. The pattern is not anchored, so a URI to a path inside a
/// repo still yields that repo's user and name.
///
/// The dot in the host name is escaped so that it only matches a literal "."
/// instead of any character.
final _gitHubRepoPattern =
    RegExp(r'https://github\.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)');

/// Clones the GitHub repos of the open source apps listed in the
/// itsallwidgets.com feed into `download/widgets`.
void main(List<String> arguments) async {
  clean('download/widgets');

  print('Getting page feed...');
  final feed =
      await httpGetJson('https://itsallwidgets.com/feed?open_source=true');

  // A set, so that a repo referenced by several entries is cloned only once.
  final targets = <({String user, String repo})>{};
  for (final entry in feed as List<Object?>) {
    final fields = entry as Map<String, Object?>;

    // Only entries of type "app" that name a repository are of interest.
    if (fields['type'] != 'app') continue;
    final repoUrl = fields['repo_url'] as String?;
    if (repoUrl == null) continue;

    // Only know how to download from GitHub. There are a couple of BitBucket
    // ones in there, which are skipped here.
    final match = _gitHubRepoPattern.firstMatch(repoUrl);
    if (match == null) continue;

    targets.add((user: match[1]!, repo: match[2]!));
  }

  final downloader =
      Downloader(totalResources: targets.length, concurrency: 10);
  for (final target in targets) {
    downloader.cloneGitHubRepo('widgets', target.user, target.repo);
  }
}
92 changes: 92 additions & 0 deletions tools/corpus/scripts/bin/copy_corpus.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import 'dart:io';
import 'dart:math';

import 'package:args/args.dart';
import 'package:path/path.dart' as p;

/// Percentage of candidate files to copy, used to take a random sample of a
/// corpus.
///
/// Starts at 100 and is overwritten from the `--sample` option in [main].
var _samplePercent = 100;

/// Random source consulted once per candidate file when sampling.
final Random _random = Random();

/// Path prefixes, relative to a corpus root, whose files are never copied.
///
/// Entries are compared with `startsWith`, so they can name a directory
/// ("out/") or just the beginning of a directory name ("analyzer-").
const List<String> _ignoreDirs = <String>[
  'pkg/dev_compiler/gen/',
  'tests/co19/',
  'third_party/observatory_pub_packages/',
  'tools/sdks/',
  'out/',
  'xcodebuild/',

  // Redundant stuff in Flutter.
  'bin/cache/',

  // Redundant packages that are in the SDK.
  'analyzer-',
  'compiler_unsupported-',
  'dev_compiler-',
];

/// The known corpora, as `(name, source directory)` pairs.
///
/// The name is what is passed on the command line and is also used as the
/// output directory name under `out/`.
///
/// Note! The "dart" and "flutter" entries assume that the Dart SDK and Flutter
/// repos have been cloned in directories next to this repo. All paths are
/// relative to the directory that the script is run from.
// NOTE(review): the relative paths look like they expect the working directory
// to be `tools/corpus` (where `download/` and `out/` are ignored) - confirm.
const List<(String, String)> _corpora = <(String, String)>[
  ('apps', 'download/apps'),
  ('dart', '../../../dart/sdk'),
  ('flutter', '../../../flutter'),
  ('pub', 'download/pub'),
  ('widgets', 'download/widgets'),
];

/// File name suffixes, presumably those of generated Dart files.
// NOTE(review): nothing in the visible part of this script reads this list, so
// files with these suffixes are currently copied like any other - confirm
// whether they were meant to be filtered out.
final List<String> generatedSuffixes = <String>['.g.dart', '.freezed.dart'];

/// Copies the corpora named on the command line into `out/`.
///
/// Positional arguments select entries of [_corpora] by name. The `--sample`
/// (`-s`) option sets [_samplePercent] and must be an integer from 0 to 100.
///
/// On a malformed command line (unknown option, invalid `--sample` value, or
/// no known corpus name), prints a message and usage to stderr and sets the
/// exit code to 64 instead of crashing or silently doing nothing.
void main(List<String> arguments) async {
  var argParser = ArgParser();
  // Accepted for compatibility; its value is not read by this function.
  argParser.addFlag('omit-slow');
  argParser.addOption('sample', abbr: 's', defaultsTo: '100');

  var corpusNames = [for (var (name, _) in _corpora) name];
  var usage = 'Usage: copy_corpus [--sample=<percent>] '
      '<${corpusNames.join('|')}>...';

  void fail(String message) {
    stderr.writeln(message);
    stderr.writeln(usage);
    exitCode = 64;
  }

  ArgResults argResults;
  try {
    argResults = argParser.parse(arguments);
  } on FormatException catch (error) {
    // Unknown options and missing option values end up here.
    fail(error.message);
    return;
  }

  var sampleText = argResults['sample'] as String;
  var sample = int.tryParse(sampleText);
  if (sample == null || sample < 0 || sample > 100) {
    fail('--sample must be an integer from 0 to 100, got "$sampleText".');
    return;
  }
  _samplePercent = sample;

  // Match corpus names against the parsed positional arguments only, so that
  // option values are never mistaken for corpus names.
  var requested = argResults.rest;
  var copiedAny = false;
  for (var (name, directory) in _corpora) {
    if (!requested.contains(name)) continue;
    await copyDir(directory, name);
    copiedAny = true;
  }

  if (!copiedAny) fail('No known corpus was named.');
}

/// Copies the `.dart` files under [fromDirectory] into `out/<toDirectory>`,
/// keeping each file's path relative to [fromDirectory].
///
/// When [_samplePercent] is not 100, the output directory name gets a
/// `-<percent>` suffix and each candidate file is copied only if a random
/// draw falls below that percentage.
///
/// The following entries are skipped:
///
/// * Anything that is not a regular file whose path ends in `.dart`. Links
///   are not followed.
/// * Paths that start with one of the prefixes in [_ignoreDirs].
/// * Paths where any component starts with ".".
///
/// The returned future completes only after every selected file has been
/// copied, and completes with an error if listing or copying fails.
Future<void> copyDir(String fromDirectory, String toDirectory) async {
  // If we're taking a random sample, put that in a separate directory.
  if (_samplePercent != 100) {
    toDirectory += '-$_samplePercent';
  }

  var copied = 0;
  var inDir = Directory(fromDirectory);

  // Use `await for` instead of `listen()` with an async callback. With
  // `listen()`, the subscription's future completes as soon as the listing is
  // done, while the copies started by the callbacks are still running, all at
  // once, with their errors not reaching the caller.
  await for (var entry in inDir.list(recursive: true, followLinks: false)) {
    // Since links are not followed, they are reported as [Link] objects and
    // are rejected by the [File] check too.
    if (entry is! File || !entry.path.endsWith('.dart')) continue;

    var relative = p.relative(entry.path, from: inDir.path);

    // Skip redundant stuff.
    if (_ignoreDirs.any(relative.startsWith)) continue;

    if (_random.nextInt(100) >= _samplePercent) continue;

    // If the path is in a subdirectory starting with '.', ignore it.
    if (p.split(relative).any((part) => part.startsWith('.'))) continue;

    var outPath = p.join('out', toDirectory, relative);

    // A recursive create does nothing if the directory already exists.
    await Directory(p.dirname(outPath)).create(recursive: true);
    await entry.copy(outPath);

    // Show some progress without flooding the terminal.
    copied++;
    if (copied % 100 == 0) print(relative);
  }
}
57 changes: 57 additions & 0 deletions tools/corpus/scripts/bin/download_packages.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import 'dart:io';

import 'package:corpus/utils.dart';

/// How many packages to download.
const _totalPackages = 2000;

/// Downloads the latest version of packages from the pub package index and
/// extracts each one to `download/pub/<name>-<version>`.
///
/// Walks the index pages until [_totalPackages] downloads have been scheduled
/// or the index has no next page. Failures for an individual package are
/// logged and do not stop the other downloads.
void main(List<String> arguments) async {
  clean('download/pub');

  // Iterate through the pages (which are in most recent order) until we get
  // enough packages.
  var packagePage = 'https://pub.dev/api/packages';

  // The page number is only used for logging. It is tracked separately from
  // the number of scheduled packages so that the log shows 1, 2, 3, ...
  var pageNumber = 1;

  // Counts from zero so that exactly [_totalPackages] downloads are
  // scheduled, matching the total given to the [Downloader].
  var scheduled = 0;

  var downloader = Downloader(totalResources: _totalPackages);
  for (;;) {
    downloader.log('Getting index page $pageNumber...');
    var packages = await httpGetJson(packagePage);

    for (var package in packages['packages']) {
      downloader.withResource((logger) async {
        var name = package['name'] as String;
        var version = package['latest']['version'] as String;
        var archiveUrl = package['latest']['archive_url'] as String;

        try {
          logger.begin('Downloading $archiveUrl...');
          var archiveBytes = await httpGetBytes(archiveUrl);
          var tarFile = 'download/pub/$name-$version.tar.gz';
          await File(tarFile).writeAsBytes(archiveBytes);

          logger.log('Extracting $tarFile...');
          var outputDir = 'download/pub/$name-$version';
          await Directory(outputDir).create(recursive: true);
          var result =
              await Process.run('tar', ['-xf', tarFile, '-C', outputDir]);

          if (result.exitCode != 0) {
            // Keep the archive around so that the failure can be inspected.
            logger.end('Could not extract $tarFile:\n${result.stderr}');
          } else {
            await File(tarFile).delete();
            logger.end('Finished $outputDir');
          }
        } catch (error) {
          // Best effort: report the failure and move on to other packages.
          logger.end('Error downloading $archiveUrl:\n$error');
        }
      });

      scheduled++;
      if (scheduled >= _totalPackages) return;
    }

    // The last page has no link to a next one.
    var nextUrl = packages['next_url'];
    if (nextUrl is! String) break;
    packagePage = nextUrl;
    pageNumber++;
  }
}
Loading

0 comments on commit 1b0e339

Please sign in to comment.