From 1b0e339c52e73acc875bd68451b46cef9391f028 Mon Sep 17 00:00:00 2001 From: Bob Nystrom Date: Wed, 4 Dec 2024 10:30:20 -0800 Subject: [PATCH] Add scripts for downloading open source corpora. (#4193) Add scripts for downloading open source corpora. --- .github/workflows/dart.yml | 3 + tools/corpus/.gitignore | 3 + tools/corpus/README.md | 25 +++ .../scripts/bin/clone_flutter_apps.dart | 39 ++++ tools/corpus/scripts/bin/clone_widgets.dart | 34 ++++ tools/corpus/scripts/bin/copy_corpus.dart | 92 ++++++++++ .../corpus/scripts/bin/download_packages.dart | 57 ++++++ tools/corpus/scripts/lib/utils.dart | 168 ++++++++++++++++++ tools/corpus/scripts/pubspec.yaml | 8 + 9 files changed, 429 insertions(+) create mode 100644 tools/corpus/.gitignore create mode 100644 tools/corpus/README.md create mode 100644 tools/corpus/scripts/bin/clone_flutter_apps.dart create mode 100644 tools/corpus/scripts/bin/clone_widgets.dart create mode 100644 tools/corpus/scripts/bin/copy_corpus.dart create mode 100644 tools/corpus/scripts/bin/download_packages.dart create mode 100644 tools/corpus/scripts/lib/utils.dart create mode 100644 tools/corpus/scripts/pubspec.yaml diff --git a/.github/workflows/dart.yml b/.github/workflows/dart.yml index 539c767145..0c6b69b264 100644 --- a/.github/workflows/dart.yml +++ b/.github/workflows/dart.yml @@ -24,6 +24,9 @@ jobs: - name: dart pub get (working/macros/example) run: dart pub get working-directory: working/macros/example + - name: dart pub get (tools/corpus) + run: dart pub get + working-directory: tools/corpus/scripts - name: dart pub get (accepted/2.3/spread-collections/benchmarks) run: dart pub get working-directory: accepted/2.3/spread-collections/benchmarks diff --git a/tools/corpus/.gitignore b/tools/corpus/.gitignore new file mode 100644 index 0000000000..53719d8b80 --- /dev/null +++ b/tools/corpus/.gitignore @@ -0,0 +1,3 @@ +# Don't commit the downloaded files. 
+download/ +out/ diff --git a/tools/corpus/README.md b/tools/corpus/README.md new file mode 100644 index 0000000000..64c548826b --- /dev/null +++ b/tools/corpus/README.md @@ -0,0 +1,25 @@ +This directory contains a package with scripts for downloading corpora of open +source Dart code for automated analysis. There are a few scripts for +downloading from various places: + +* `clone_flutter_apps.dart`: Clones GitHub repositories linked to from + [github.com/tortuvshin/open-source-flutter-apps](https://github.com/tortuvshin/open-source-flutter-apps), which is a registry of open source Flutter apps. + Downloads them to `download/apps`. + +* `clone_widgets.dart`: Clones GitHub repositories referenced by + [itsallwidgets.com](https://itsallwidgets.com/), which is a collection of + open source Flutter apps and widgets. Downloads them to `download/widgets`. + +* `download_packages.dart`: Downloads recent packages from + [pub.dev](https://pub.dev/). Downloads to `download/pub`. + +Once a corpus is downloaded, there is another script that copies over just the +`.dart` files while discarding "uninteresting" files like generated ones: + +* `copy_corpus.dart`: Copies `.dart` files from one of the download + directories. Pass `apps`, `widgets`, `pub`, etc. Can also copy sources from + the Dart SDK repo (`dart`) or Flutter repo (`flutter`). For that to work, + those repos must be in directories next to the language repo. + + You can pass `--sample=` to take a random sample of a corpus. For + example, `--sample=5` will copy over only 5% of the files, chosen randomly. diff --git a/tools/corpus/scripts/bin/clone_flutter_apps.dart b/tools/corpus/scripts/bin/clone_flutter_apps.dart new file mode 100644 index 0000000000..25e3fd1094 --- /dev/null +++ b/tools/corpus/scripts/bin/clone_flutter_apps.dart @@ -0,0 +1,39 @@ +import 'package:corpus/utils.dart'; + +/// Match URIs that point to GitHub repos. 
Look for a trailing ")" (after an +/// allowed trailing "/") in order to only find Markdown link URIs that are +/// directly to repos and not to paths within them like the images in the +/// header. +final _gitHubRepoPattern = + RegExp(r'https://github.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)/?\)'); + +const _readmeUri = + 'https://raw.githubusercontent.com/tortuvshin/open-source-flutter-apps/' + 'refs/heads/master/README.md'; + +/// Clones the GitHub repos listed on: +/// +/// https://github.com/tortuvshin/open-source-flutter-apps +/// +/// Downloads them to download/apps. +void main(List arguments) async { + clean('download/apps'); + + print('Getting README.md...'); + var readme = await httpGet(_readmeUri); + + // Find all the repo URLs and remove the duplicates. + var repoPaths = _gitHubRepoPattern + .allMatches(readme) + .map((match) => (user: match[1]!, repo: match[2]!)) + .toSet() + .toList(); + + // Skip the reference to the repo itself. + repoPaths.remove((user: 'tortuvshin', repo: 'open-source-flutter-apps')); + + var downloader = Downloader(totalResources: repoPaths.length, concurrency: 5); + for (var (:user, :repo) in repoPaths) { + downloader.cloneGitHubRepo('apps', user, repo); + } +} diff --git a/tools/corpus/scripts/bin/clone_widgets.dart b/tools/corpus/scripts/bin/clone_widgets.dart new file mode 100644 index 0000000000..51b4568476 --- /dev/null +++ b/tools/corpus/scripts/bin/clone_widgets.dart @@ -0,0 +1,34 @@ +import 'package:corpus/utils.dart'; + +/// Match URIs that point to GitHub repos. +final _gitHubRepoPattern = + RegExp(r'https://github.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)'); + +/// Download open source apps from itsallwidgets.com. 
+void main(List arguments) async { + clean("download/widgets"); + + print('Getting page feed...'); + var feed = + await httpGetJson('https://itsallwidgets.com/feed?open_source=true'); + + var repos = <({String user, String repo})>{}; + for (var entry in (feed as List)) { + var entryMap = entry as Map; + if (entryMap['type'] != 'app') continue; + + var repo = entryMap['repo_url'] as String?; + if (repo == null) continue; + + // Only know how to download from GitHub. There are a couple of BitBucket + // ones in there. + if (_gitHubRepoPattern.firstMatch(repo) case var match?) { + repos.add((user: match[1]!, repo: match[2]!)); + } + } + + var downloader = Downloader(totalResources: repos.length, concurrency: 10); + for (var (:user, :repo) in repos) { + downloader.cloneGitHubRepo('widgets', user, repo); + } +} diff --git a/tools/corpus/scripts/bin/copy_corpus.dart b/tools/corpus/scripts/bin/copy_corpus.dart new file mode 100644 index 0000000000..8e0bdb5555 --- /dev/null +++ b/tools/corpus/scripts/bin/copy_corpus.dart @@ -0,0 +1,92 @@ +import 'dart:io'; +import 'dart:math'; + +import 'package:args/args.dart'; +import 'package:path/path.dart' as p; + +/// What percentage of files should be copied over. Used to take a random +/// sample of a corpus. +int _samplePercent = 100; + +final _random = Random(); + +const _ignoreDirs = [ + 'pkg/dev_compiler/gen/', + 'tests/co19/', + 'third_party/observatory_pub_packages/', + 'tools/sdks/', + 'out/', + 'xcodebuild/', + + // Redundant stuff in Flutter. + 'bin/cache/', + + // Redundant packages that are in the SDK. + 'analyzer-', + 'compiler_unsupported-', + 'dev_compiler-', +]; + +// Note! Assumes the Dart SDK and Flutter repos have been cloned in +// directories next to the corpus repo. Also assumes this script has been run +// from the root directory of this repo. 
+const _corpora = [ + ('apps', 'download/apps'), + ('dart', '../../../dart/sdk'), + ('flutter', '../../../flutter'), + ('pub', 'download/pub'), + ('widgets', 'download/widgets'), +]; + +final generatedSuffixes = ['.g.dart', '.freezed.dart']; + +void main(List arguments) async { + var argParser = ArgParser(); + argParser.addFlag('omit-slow'); + argParser.addOption('sample', abbr: 's', defaultsTo: '100'); + + var argResults = argParser.parse(arguments); + _samplePercent = int.parse(argResults['sample']); + + for (var (name, directory) in _corpora) { + if (arguments.contains(name)) await copyDir(directory, name); + } +} + +Future copyDir(String fromDirectory, String toDirectory) async { + // If we're taking a random sample, put that in a separate directory. + if (_samplePercent != 100) { + toDirectory += '-$_samplePercent'; + } + + var i = 0; + var inDir = Directory(fromDirectory); + + await inDir.list(recursive: true, followLinks: false).listen((entry) async { + var relative = p.relative(entry.path, from: inDir.path); + + if (entry is Link) return; + if (entry is! File || !entry.path.endsWith('.dart')) return; + + // Skip redundant stuff. + for (var ignore in _ignoreDirs) { + if (relative.startsWith(ignore)) return; + } + + if (_random.nextInt(100) >= _samplePercent) return; + + // If the path is in a subdirectory starting with '.', ignore it. 
+ var parts = p.split(relative); + if (parts.any((part) => part.startsWith('.'))) return; + + var outPath = p.join('out', toDirectory, relative); + + var outDir = Directory(p.dirname(outPath)); + if (!await outDir.exists()) await outDir.create(recursive: true); + + await entry.copy(outPath); + + i++; + if (i % 100 == 0) print(relative); + }).asFuture(); +} diff --git a/tools/corpus/scripts/bin/download_packages.dart b/tools/corpus/scripts/bin/download_packages.dart new file mode 100644 index 0000000000..d07946a230 --- /dev/null +++ b/tools/corpus/scripts/bin/download_packages.dart @@ -0,0 +1,57 @@ +import 'dart:io'; + +import 'package:corpus/utils.dart'; + +const _totalPackages = 2000; + +void main(List arguments) async { + clean('download/pub'); + + // Iterate through the pages (which are in most recent order) until we get + // enough packages. + var packagePage = 'http://pub.dartlang.org/api/packages'; + var downloaded = 1; + + var downloader = Downloader(totalResources: _totalPackages); + for (;;) { + downloader.log('Getting index page $downloaded...'); + var packages = await httpGetJson(packagePage); + + for (var package in packages['packages']) { + downloader.withResource((logger) async { + var name = package['name'] as String; + var version = package['latest']['version'] as String; + var archiveUrl = package['latest']['archive_url'] as String; + + try { + logger.begin('Downloading $archiveUrl...'); + var archiveBytes = await httpGetBytes(archiveUrl); + var tarFile = 'download/pub/$name-$version.tar.gz'; + await File(tarFile).writeAsBytes(archiveBytes); + + logger.log('Extracting $tarFile...'); + var outputDir = 'download/pub/$name-$version'; + await Directory(outputDir).create(recursive: true); + var result = + await Process.run('tar', ['-xf', tarFile, '-C', outputDir]); + + if (result.exitCode != 0) { + logger.end('Could not extract $tarFile:\n${result.stderr}'); + } else { + await File(tarFile).delete(); + logger.end('Finished $outputDir'); + } + } catch 
(error) { + logger.end('Error downloading $archiveUrl:\n$error'); + } + }); + + downloaded++; + if (downloaded >= _totalPackages) return; + } + + var nextUrl = packages['next_url']; + if (nextUrl is! String) break; + packagePage = nextUrl; + } +} diff --git a/tools/corpus/scripts/lib/utils.dart b/tools/corpus/scripts/lib/utils.dart new file mode 100644 index 0000000000..b9133d905e --- /dev/null +++ b/tools/corpus/scripts/lib/utils.dart @@ -0,0 +1,168 @@ +import 'dart:convert'; +import 'dart:io'; + +import 'package:http/http.dart' as http; +import 'package:pool/pool.dart'; + +final _client = http.Client(); + +/// Creates an empty directory at [dirPath]. +/// +/// If a directory is already there, removes it first. +void clean(String dirPath) { + var directory = Directory(dirPath); + if (directory.existsSync()) { + print('Deleting $dirPath...'); + directory.deleteSync(recursive: true); + } + directory.createSync(recursive: true); +} + +Future cloneGitHubRepo(String destination, String user, String repo, + {String prefix = ''}) async { + print('${prefix}Cloning $user/$repo...'); + try { + var gitHubUri = 'https://github.com/$user/$repo.git'; + + var outputDir = 'download/$destination/$user-$repo'; + var result = await Process.run( + 'git', ['clone', '--depth', '1', gitHubUri, outputDir]); + if (result.exitCode != 0) { + print('${prefix}Could not clone $gitHubUri:\n${result.stderr}'); + } else { + print('${prefix}Cloned $outputDir'); + } + } catch (error) { + print('${prefix}Error cloning $user/$repo:\n$error'); + } +} + +/// Gets the body of the HTTP response to sending a GET to [uri]. +Future httpGet(String uri) async { + return (await _client.get(Uri.parse(uri))).body; +} + +/// Gets the body, as raw bytes, of the HTTP response to sending a GET to [uri]. +Future> httpGetBytes(String uri) async { + return (await _client.get(Uri.parse(uri))).bodyBytes; +} + +/// Gets the body of the HTTP response to sending a GET to [uri], decoded as JSON. 
+Future httpGetJson(String uri) async { + return jsonDecode(await httpGet(uri)); +} + +class Downloader { + /// The total number of resources that will be downloaded using this pool. + final int _totalResources; + + /// The maximum number of concurrent downloads. + final int _maxConcurrency; + + final Pool _pool; + + /// The number of operations that have finished. + int _completedResources = 0; + + /// Which "slots" are currently in use for drawing the ongoing download bars. + final _slots = {}; + + Downloader({required int totalResources, int concurrency = 20}) + : _totalResources = totalResources, + _maxConcurrency = concurrency, + _pool = Pool(concurrency); + + void log(String message) { + _log(-1, '', message); + } + + void withResource(Future Function(Logger) callback) { + var logger = Logger._(this); + + _pool.withResource(() async { + await callback(logger); + }); + } + + void cloneGitHubRepo(String destination, String user, String repo) { + withResource((logger) async { + logger.begin('Cloning $user/$repo...'); + try { + var gitHubUri = 'https://github.com/$user/$repo.git'; + var outputDir = 'download/$destination/$user-$repo'; + var result = await Process.run( + 'git', ['clone', '--depth', '1', gitHubUri, outputDir]); + if (result.exitCode != 0) { + logger.end('Could not clone $gitHubUri:\n${result.stderr}'); + } else { + logger.end('Cloned $outputDir'); + } + } catch (error) { + logger.end('Error cloning $user/$repo:\n$error'); + } + }); + } + + void _log(int slot, String marker, String message) { + var buffer = StringBuffer(); + + // Show the overall progress. + var width = _totalResources.toString().length; + buffer.write('['); + buffer.write(_completedResources.toString().padLeft(width)); + buffer.write('/'); + buffer.write(_totalResources.toString().padLeft(width)); + buffer.write(']'); + + // Show the slot bars. 
+ for (var i = 0; i < _maxConcurrency; i++) { + buffer.write(switch ((i == slot, _slots.contains(i))) { + (true, _) => marker, + (_, true) => '│', + _ => ' ' + }); + } + + buffer.write(' '); + buffer.write(message); + print(buffer); + } + + /// Find an unused slot for this operation. + int _claimSlot() { + for (var i = 0; i < _maxConcurrency; i++) { + if (!_slots.contains(i)) { + _slots.add(i); + return i; + } + } + + throw StateError('Unreachable.'); + } + + void _releaseSlot(int slot) { + _slots.remove(slot); + } +} + +class Logger { + final Downloader _pool; + late final int _slot; + + Logger._(this._pool); + + void begin(String message) { + _slot = _pool._claimSlot(); + _pool._log(_slot, '┌', message); + } + + void log(String message) { + _pool._log(_slot, '├', message); + } + + void end(String message) { + _pool._completedResources++; + _pool._log(_slot, '└', message); + _pool._releaseSlot(_slot); + } +} diff --git a/tools/corpus/scripts/pubspec.yaml b/tools/corpus/scripts/pubspec.yaml new file mode 100644 index 0000000000..ec9cf6c25a --- /dev/null +++ b/tools/corpus/scripts/pubspec.yaml @@ -0,0 +1,8 @@ +name: corpus +environment: + sdk: '^3.0.0' +dependencies: + args: '^2.6.0' + http: '^1.2.2' + path: '^1.9.1' + pool: '^1.5.1'