Support layer parallelism in transformer application (#2420)
This PR adds support for layer parallelism in transformers, a variable-length version of the pretokenized Pile dataset, updates to the LBANN graph visualizer script, and some minor tweaks to the weights layer.
Showing 18 changed files with 660 additions and 128 deletions.
applications/nlp/transformer/datasets/pretokenize/varlen/pretokenize-validation.py (34 additions, 0 deletions)
@@ -0,0 +1,34 @@
from tqdm import trange
from multiprocessing import Pool
import numpy as np
import pickle


class Processor:

    def __init__(self, total_threads: int):
        self.threads = total_threads

    def __call__(self, tid: int):
        import thepile as dataset
        num_samples = dataset.num_val_samples()
        filename = f'/p/vast1/data/datasets/the-pile-huggingface/pretokenized-varlen/val.bin'
        len_filename = f'/p/vast1/data/datasets/the-pile-huggingface/pretokenized-varlen/val-seqlen.bin'

        with open(filename, 'ab') as fp:
            with open(len_filename, 'ab') as slfp:
                for i in trange(num_samples):
                    text = dataset.dataset_val[i]['text']
                    tokenized = dataset.tokenize(text)
                    sample = np.array(tokenized, dtype=np.uint16)
                    sample_len = np.array([len(sample)], dtype=np.uint32)
                    sample.tofile(fp)
                    sample_len.tofile(slfp)

        print('Done')


if __name__ == '__main__':
    threads = 1
    with Pool(threads) as pool:
        pool.map(Processor(threads), range(threads))
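The script above defines the variable-length layout used throughout this PR: val.bin is the raw uint16 token stream of all samples concatenated back to back, and val-seqlen.bin holds one uint32 length per sample. Below is a minimal sketch of how a single sample can be read back; the loader added later in this PR does the same thing with memory-mapped arrays, and the directory and index here are only illustrative.

import numpy as np

# Assumed path; substitute the directory used by the pretokenizer above.
data_dir = '/p/vast1/data/datasets/the-pile-huggingface/pretokenized-varlen'

# One uint32 length per sample, in write order.
lengths = np.fromfile(f'{data_dir}/val-seqlen.bin', dtype=np.uint32).astype(np.uint64)
# The starting offset of each sample is the cumulative sum of the preceding lengths.
offsets = np.zeros_like(lengths)
offsets[1:] = np.cumsum(lengths)[:-1]

# Memory-map the concatenated uint16 token stream and slice out sample i.
tokens = np.memmap(f'{data_dir}/val.bin', dtype=np.uint16, mode='r')
i = 0
sample = np.array(tokens[offsets[i]:offsets[i] + lengths[i]])
print(len(sample), 'tokens in sample', i)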
applications/nlp/transformer/datasets/pretokenize/varlen/pretokenize.py (78 additions, 0 deletions)
@@ -0,0 +1,78 @@
from tqdm import trange
from multiprocessing import Pool
import numpy as np
import os
import argparse
from pathlib import Path


class Processor:

    def __init__(self, total_threads: int):
        self.threads = total_threads

    def __call__(self, tid: int):
        import thepile as dataset
        num_samples = dataset.num_train_samples()
        np.random.seed(20231023)
        indices = np.random.permutation(num_samples)
        local_samples = num_samples // self.threads
        offset = tid * local_samples
        # Add remainder
        if tid == self.threads - 1:
            local_samples += num_samples % self.threads
        section = indices[offset:offset + local_samples]
        filename = f'/p/vast1/data/datasets/the-pile-huggingface/pretokenized-varlen/train-pretokenized-{tid:02d}-of-{self.threads}.bin'
        len_filename = f'/p/vast1/data/datasets/the-pile-huggingface/pretokenized-varlen/train-seqlen-{tid:02d}-of-{self.threads}.bin'

        # Create file
        if not os.path.isfile(filename):
            Path(filename).touch()
        if not os.path.isfile(len_filename):
            Path(len_filename).touch()

        sz = os.path.getsize(len_filename)
        assert sz % 4 == 0
        sequences_processed = sz // 4
        print(tid, ': Size in bytes:', sz, '. Sequences processed:',
              sequences_processed)

        with open(filename, 'ab') as fp:
            with open(len_filename, 'ab') as slfp:
                for i in trange(sequences_processed,
                                section.shape[0],
                                desc=f'Thread {tid}'):
                    text = dataset.dataset_train[int(section[i])]['text']
                    sample = dataset.tokenize(text)
                    sample = np.array(sample, dtype=np.uint16)
                    sample.tofile(fp)
                    sample_len = np.array([len(sample)], dtype=np.uint32)
                    sample_len.tofile(slfp)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-j',
                        action='store',
                        default=0,
                        type=int,
                        help='Threads (default 0 = number of cores)')
    parser.add_argument('-t',
                        action='store',
                        default=0,
                        type=int,
                        help='Total chunks (default 0 = number of threads)')
    parser.add_argument('-o',
                        action='store',
                        default=0,
                        type=int,
                        help='Chunk offset (default 0)')
    args = parser.parse_args()

    threads = args.j or os.cpu_count()
    total_chunks = args.t or threads
    offset = args.o
    assert offset + threads <= total_chunks
    with Pool(threads) as pool:
        pool.map(Processor(total_chunks), range(offset, offset + threads))
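The training pretokenizer writes one pair of chunk files per worker (train-pretokenized-{tid}-of-{N}.bin and train-seqlen-{tid}-of-{N}.bin), while the reader added below expects a single train.bin/train-seqlen.bin pair. The merge step is not part of this diff; what follows is only an assumed sketch of one way to do it, concatenating the chunks in the same order for both streams so the token data and the lengths stay aligned. The directory and chunk count are placeholders.

import shutil

# Assumed names and chunk count; adjust to match the pretokenizer invocation.
data_dir = '/p/vast1/data/datasets/the-pile-huggingface/pretokenized-varlen'
total_chunks = 16

with open(f'{data_dir}/train.bin', 'wb') as out_tok, \
     open(f'{data_dir}/train-seqlen.bin', 'wb') as out_len:
    for tid in range(total_chunks):
        # Append token chunks and length chunks in the same (tid) order so
        # offsets computed from the lengths still point at the right tokens.
        with open(f'{data_dir}/train-pretokenized-{tid:02d}-of-{total_chunks}.bin', 'rb') as f:
            shutil.copyfileobj(f, out_tok)
        with open(f'{data_dir}/train-seqlen-{tid:02d}-of-{total_chunks}.bin', 'rb') as f:
            shutil.copyfileobj(f, out_len)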
applications/nlp/transformer/datasets/thepile_pretokenized_varlen.py (105 additions, 0 deletions)
@@ -0,0 +1,105 @@
"""
The Pile dataset, stored as pre-tokenized binary files for optimized processing.
"""
import os
import os.path

import numpy as np

# ----------------------------------------------
# Options
# ----------------------------------------------

sequence_length = int(os.getenv('THE_PILE_SEQUENCE_LENGTH', default='512'))

# ----------------------------------------------
# Setup
# ----------------------------------------------

# Load the datasets
data_dir = os.getenv('THE_PILE_DATA_DIR',
                     '/p/vast1/data/datasets/the-pile-pretokenized')
dataset_train = np.memmap(os.path.join(data_dir, 'train.bin'),
                          dtype=np.uint16,
                          mode='r')
sample_lengths_train = np.fromfile(os.path.join(data_dir, 'train-seqlen.bin'),
                                   dtype=np.uint32).astype(np.uint64)
sample_offsets_train = np.zeros_like(sample_lengths_train)
sample_offsets_train[1:] = np.cumsum(sample_lengths_train)[:-1]
dataset_val = np.memmap(os.path.join(data_dir, 'val.bin'),
                        dtype=np.uint16,
                        mode='r')
sample_lengths_val = np.fromfile(os.path.join(data_dir, 'val-seqlen.bin'),
                                 dtype=np.uint32).astype(np.uint64)
sample_offsets_val = np.zeros_like(sample_lengths_val)
sample_offsets_val[1:] = np.cumsum(sample_lengths_val)[:-1]

# Uses the definition from the GPT-NeoX-20B tokenizer
pad_index = 1  # '<|padding|>'
_vocab_size = 50277

# ----------------------------------------------
# Sample access functions
# ----------------------------------------------


def trim_and_pad(sample, random: bool):
    # Trim long sequences
    if len(sample) > sequence_length:
        if random:
            pos = np.random.rand()
            offset = (len(sample) - sequence_length + 1) * pos
            offset = int(np.floor(offset))
            sample = sample[offset:offset + sequence_length]
        else:
            sample = sample[0:sequence_length]

    # Left-pad short sequences
    if len(sample) < sequence_length:
        sample_pad = np.full(sequence_length, pad_index, dtype=np.int32)
        if len(sample) > 0:
            sample_pad[-len(sample):] = sample
        return sample_pad

    return sample


def get_train_sample(index: int):
    sample = np.copy(
        dataset_train[sample_offsets_train[index]:sample_offsets_train[index] +
                      sample_lengths_train[index]]).astype(np.int32)
    return trim_and_pad(sample, True)


def get_val_sample(index):
    sample = np.copy(
        dataset_val[sample_offsets_val[index]:sample_offsets_val[index] +
                    sample_lengths_val[index]]).astype(np.int32)
    return trim_and_pad(sample, False)


def num_train_samples():
    return sample_lengths_train.shape[0]


def num_val_samples():
    return sample_lengths_val.shape[0]


def sample_dims():
    return (sequence_length, )


def vocab_size():
    return _vocab_size


if __name__ == '__main__':
    print('Training samples:', num_train_samples())
    print('Validation samples:', num_val_samples())
    from tokenizers import Tokenizer
    tokenizer = Tokenizer.from_file(
        os.path.join(data_dir, '20B_tokenizer.json'))
    print('Training sample 101:')
    print(tokenizer.decode(get_train_sample(101)))
    print('Validation sample 233:')
    print(tokenizer.decode(get_val_sample(233)))
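As a quick sanity check of the reader above, the module can be imported directly and exercised through its own access functions; the snippet below is only an illustrative sketch and assumes it is run from the datasets directory with THE_PILE_DATA_DIR pointing at the variable-length files.

import numpy as np
import thepile_pretokenized_varlen as ds

print('Sample dims:', ds.sample_dims(), 'vocab size:', ds.vocab_size())
for i in range(3):
    s = ds.get_val_sample(i)
    # Every sample comes back trimmed or left-padded to a fixed length.
    assert s.shape == ds.sample_dims()
    pad_fraction = np.mean(s == ds.pad_index)
    print(f'val[{i}]: {s.shape[0]} positions, {pad_fraction:.1%} padding')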