Skip to content

Commit

Permalink
discojs-core/tasks: add wikitext
Browse files Browse the repository at this point in the history
  • Loading branch information
tharvik committed Mar 4, 2024
1 parent e7da809 commit 682cc60
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 2 deletions.
5 changes: 3 additions & 2 deletions discojs/discojs-core/src/default_tasks/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
// Barrel file for the default tasks: each task provider is re-exported exactly
// once, in alphabetical order. A name must not be re-exported twice, since
// duplicate exported names are rejected by the compiler / module loader.
export { cifar10 } from './cifar10'
export { geotags } from './geotags'
export { lusCovid } from './lus_covid'
export { mnist } from './mnist'
export { simpleFace } from './simple_face'
export { skinMnist } from './skin_mnist'
export { titanic } from './titanic'
export { wikitext } from './wikitext'
46 changes: 46 additions & 0 deletions discojs/discojs-core/src/default_tasks/wikitext.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import type { Model, Task, TaskProvider } from '..'
import { TrainingSchemes, models } from '..'

/**
 * Task provider for the 'wikitext-103' task.
 *
 * `getTask` assembles the static task descriptor (display texts and training
 * settings); `getModel` resolves to a newly constructed `models.GPT`.
 */
export const wikitext: TaskProvider = {
  /**
   * Build the task descriptor.
   *
   * @returns an object holding the task id, its display information and its
   * training information
   */
  getTask (): Task {
    // Human-readable texts describing the task and its dataset.
    const displayInformation: Task['displayInformation'] = {
      taskTitle: 'Wikitext 103 Raw',
      summary: {
        preview:
          'In this challenge, we ask you to do next word prediction on a dataset of Wikipedia articles.',
        overview:
          'Wikitext-103-raw is a dataset comprising unprocessed text excerpts from Wikipedia articles, designed for tasks related to natural language processing and language modeling.'
      },
      limitations:
        'The dataset may contain noise, inconsistencies, and unstructured content due to its raw nature, potentially posing challenges for certain NLP tasks.',
      tradeoffs:
        'The raw format may lack structured annotations and may require additional preprocessing for specific applications.',
      dataFormatInformation:
        'The dataset is organized as a large text file, with each line representing a segment of raw text from Wikipedia articles.',
      dataExampleText:
        'An example excerpt from the dataset could be: "The history of artificial intelligence dates back to ancient times, with philosophical discussions on the nature of thought and reasoning."'
    }

    // Settings driving the training itself.
    const trainingInformation: Task['trainingInformation'] = {
      dataType: 'text',
      modelID: 'wikitext-103-raw-model',
      // TODO: check whether this split is used anywhere, as the dataset
      // already comes divided into train, eval and test parts
      validationSplit: 0.2,
      epochs: 10_000,
      // Batches are built by the dataset itself (to make things faster),
      // hence the placeholder batch size of 1.
      batchSize: 1,
      scheme: TrainingSchemes.DECENTRALIZED,
      noiseScale: undefined,
      decentralizedSecure: true,
      minimumReadyPeers: 3,
      maxShareValue: 100,
      roundDuration: 10
    }

    return {
      id: 'wikitext-103',
      displayInformation,
      trainingInformation
    }
  },

  /**
   * Create the model for this task.
   *
   * @returns a promise resolving to a new `models.GPT` instance
   */
  async getModel (): Promise<Model> {
    const gpt = new models.GPT()
    return gpt
  }
}

0 comments on commit 682cc60

Please sign in to comment.