discojs-core/tasks: add wikitext

epfml · Mar 4, 2024 · 682cc60 · 682cc60
1 parent e7da809
commit 682cc60
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 2 deletions.
diff --git a/discojs/discojs-core/src/default_tasks/index.ts b/discojs/discojs-core/src/default_tasks/index.ts
@@ -1,7 +1,8 @@
 export { cifar10 } from './cifar10'
+export { geotags } from './geotags'
 export { lusCovid } from './lus_covid'
 export { mnist } from './mnist'
-export { titanic } from './titanic'
 export { simpleFace } from './simple_face'
-export { geotags } from './geotags'
 export { skinMnist } from './skin_mnist'
+export { titanic } from './titanic'
+export { wikitext } from './wikitext'
diff --git a/discojs/discojs-core/src/default_tasks/wikitext.ts b/discojs/discojs-core/src/default_tasks/wikitext.ts
@@ -0,0 +1,46 @@
+import type { Model, Task, TaskProvider } from '..'
+import { TrainingSchemes, models } from '..'
+
+export const wikitext: TaskProvider = { // task provider for the 'wikitext-103' text task
+  getTask (): Task { // returns the static task description: display texts plus training settings
+    return {
+      id: 'wikitext-103',
+      displayInformation: {
+        taskTitle: 'Wikitext 103 Raw',
+        summary: {
+          preview:
+                        'In this challenge, we ask you to do next word prediction on a dataset of Wikipedia articles.',
+          overview:
+                        'Wikitext-103-raw is a dataset comprising unprocessed text excerpts from Wikipedia articles, designed for tasks related to natural language processing and language modeling.'
+        },
+        limitations:
+                    'The dataset may contain noise, inconsistencies, and unstructured content due to its raw nature, potentially posing challenges for certain NLP tasks.',
+        tradeoffs:
+                    'The raw format may lack structured annotations and may require additional preprocessing for specific applications.',
+        dataFormatInformation:
+                    'The dataset is organized as a large text file, with each line representing a segment of raw text from Wikipedia articles.',
+        dataExampleText:
+                    'An example excerpt from the dataset could be: "The history of artificial intelligence dates back to ancient times, with philosophical discussions on the nature of thought and reasoning."'
+      },
+      trainingInformation: {
+        dataType: 'text',
+        modelID: 'wikitext-103-raw-model',
+        validationSplit: 0.2, // TODO: confirm whether this is used anywhere; the dataset already ships with separate train, eval and test splits
+        epochs: 10_000,
+        // Batches are assembled automatically inside the dataset to make things faster,
+        // so the batch size is faked as 1 here.
+        batchSize: 1,
+        scheme: TrainingSchemes.DECENTRALIZED,
+        noiseScale: undefined, // explicitly left unset
+        decentralizedSecure: true,
+        minimumReadyPeers: 3, // NOTE(review): presumably the peer count required before a round starts; confirm against the decentralized trainer
+        maxShareValue: 100,
+        roundDuration: 10 // NOTE(review): unit is not visible here; confirm against the Task type definition
+      }
+    }
+  },
+
+  async getModel (): Promise<Model> { // builds a new models.GPT instance using its no-argument constructor
+    return new models.GPT()
+  }
+}