Merge pull request #57 from ReDeiPirati/algorithmic_reverse_nlplike_2
algorithmic_reverse_nlplike generator
lukaszkaiser authored Jun 29, 2017
2 parents 06df1d4 + 31f5dfa commit a2a6178
Showing 6 changed files with 99 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
# Compiled python modules.
*.pyc
# Byte-compiled
__pycache__/

# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
10 changes: 10 additions & 0 deletions tensor2tensor/bin/t2t-datagen
100644 → 100755
@@ -87,6 +87,16 @@ _SUPPORTED_PROBLEM_GENERATORS = {
"algorithmic_multiplication_decimal40": (
lambda: algorithmic.multiplication_generator(10, 40, 100000),
lambda: algorithmic.multiplication_generator(10, 400, 10000)),
"algorithmic_reverse_nlplike_decimal8K": (
lambda: algorithmic.reverse_generator_nlplike(8000, 70, 100000,
10, 1.300),
lambda: algorithmic.reverse_generator_nlplike(8000, 700, 10000,
10, 1.300)),
"algorithmic_reverse_nlplike_decimal32K": (
lambda: algorithmic.reverse_generator_nlplike(32000, 70, 100000,
10, 1.050),
lambda: algorithmic.reverse_generator_nlplike(32000, 700, 10000,
10, 1.050)),
"algorithmic_algebra_inverse": (
lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000),
lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)),
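Each registry entry pairs two zero-argument callables: the first produces training cases, the second dev cases, and t2t-datagen invokes them lazily when the problem is requested. As a rough illustration (not part of the commit), the new 8K training generator can be spot-checked directly with the same arguments; the islice cap is only there to keep the check small:

from itertools import islice

from tensor2tensor.data_generators import algorithmic

# Pull three cases from the 8K training generator registered above.
train_gen = algorithmic.reverse_generator_nlplike(8000, 70, 100000, 10, 1.300)
for case in islice(train_gen, 3):
  # Each case pairs a Zipf-distributed input with its reversal plus EOS (1).
  assert case["targets"] == list(reversed(case["inputs"])) + [1]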
Empty file modified tensor2tensor/bin/t2t-trainer
100644 → 100755
Empty file.
69 changes: 69 additions & 0 deletions tensor2tensor/data_generators/algorithmic.py
@@ -93,6 +93,75 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
"targets": list(reversed(inputs)) + [1]} # [1] for EOS


def zipf_distribution(nbr_symbols, alpha):
  """Helper function: create a Zipf distribution.

  Args:
    nbr_symbols: number of symbols to use in the distribution.
    alpha: float, Zipf's Law distribution parameter. Default = 1.5.
      Values in the range [1.1, 1.6] are typical for modelling natural text.

  Returns:
    distr_map: list of float, cumulative Zipf distribution over nbr_symbols.
  """
  tmp = np.power(np.arange(1, nbr_symbols + 1), -alpha)
  zeta = np.r_[0.0, np.cumsum(tmp)]
  return [x / zeta[-1] for x in zeta]


def zipf_random_sample(distr_map, sample_len):
  """Helper function: generate a random Zipf sample of a given length.

  Args:
    distr_map: list of float, cumulative Zipf distribution over nbr_symbols.
    sample_len: integer, length of sequence to generate.

  Returns:
    sample: list of integer, Zipf random sample over nbr_symbols.
  """
  u = np.random.random(sample_len)
  # np.random.random draws from [0.0, 1.0). searchsorted returns index 0 only
  # for an exact 0.0 draw, which is extremely unlikely but possible, so that
  # case is mapped to 2 explicitly. Otherwise, shifting by t + 1 keeps the
  # sample clear of the reserved symbols PAD (0) and EOS (1).
  return [t + 1 if t > 0 else t + 2 for t in np.searchsorted(distr_map, u)]


def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases,
                              scale_std_dev=100, alpha=1.5):
  """Generator for the reversing nlp-like task on sequences of symbols.

  The length of each sequence is drawn from a Gaussian (Normal) distribution
  centered at max_length / 2 with standard deviation max_length / scale_std_dev
  (the absolute value is taken and 1 added, so lengths are at least 1), then
  symbols are drawn from a Zipf distribution (skipping the reserved PAD and
  EOS ids) until nbr_cases sequences have been produced.

  Args:
    nbr_symbols: integer, number of symbols in the vocabulary.
    max_length: integer, maximum length of sequences to generate.
    nbr_cases: the number of cases to generate.
    scale_std_dev: float, divisor applied to max_length to obtain the Normal
      distribution's standard deviation. Default = 100, i.e. 1% of max_length.
    alpha: float, Zipf's Law distribution parameter. Default = 1.5.
      Values in the range [1.1, 1.6] are typical for modelling natural text.

  Yields:
    A dictionary {"inputs": input-list, "targets": target-list} where
    target-list is input-list reversed.
  """
  std_dev = max_length / scale_std_dev
  distr_map = zipf_distribution(nbr_symbols, alpha)
  for _ in xrange(nbr_cases):
    l = int(abs(np.random.normal(loc=max_length / 2, scale=std_dev)) + 1)
    inputs = zipf_random_sample(distr_map, l)
    yield {"inputs": inputs,
           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS


def lower_endian_to_number(l, base):
  """Helper function: convert a list of digits in the given base to a number."""
  return sum([d * (base**i) for i, d in enumerate(l)])
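Taken together, the three new helpers form a small pipeline: zipf_distribution builds a cumulative probability map, zipf_random_sample turns uniform draws into symbol ids with np.searchsorted, and reverse_generator_nlplike first draws a sequence length from a Normal distribution and then samples the symbols. A minimal sketch of that pipeline on a toy five-symbol vocabulary (illustrative only, assuming the module is importable at this revision):

from tensor2tensor.data_generators import algorithmic

distr_map = algorithmic.zipf_distribution(5, 1.5)        # 6 cumulative values, 0.0 .. 1.0
sample = algorithmic.zipf_random_sample(distr_map, 10)    # ids in [2, 6]; 0 and 1 stay reserved
case = next(algorithmic.reverse_generator_nlplike(5, 8, 1))
assert case["targets"] == list(reversed(case["inputs"])) + [1]  # EOS appended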
16 changes: 16 additions & 0 deletions tensor2tensor/data_generators/algorithmic_test.py
@@ -41,6 +41,22 @@ def testReverseGenerator(self):
      self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
    self.assertEqual(counter, 10)

  def testZipfDistribution(self):
    # Following Zipf's Law with alpha close to 1: the first symbol in rank is
    # about two times more probable/frequent than the second in rank, three
    # times more probable/frequent than the third in rank, and so on.
    d = algorithmic.zipf_distribution(10, 1.0001)
    for i in xrange(len(d[1:]) - 1):
      self.assertEqual("%.4f" % (abs(d[i + 1] - d[i + 2]) * (i + 2)),
                       "%.4f" % d[1])

  def testReverseGeneratorNlpLike(self):
    counter = 0
    for d in algorithmic.reverse_generator_nlplike(3, 8, 10):
      counter += 1
      self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
    self.assertEqual(counter, 10)

  def testLowerEndianToNumber(self):
    self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0)
    self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0)
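The property asserted by testZipfDistribution can be checked by hand: with alpha close to 1 and ten symbols, the normalizer is the harmonic number H_10 of roughly 2.929, so p(1) is about 0.341, p(2) about 0.171, p(3) about 0.114, and p(k) * k stays near 0.341 for every rank, which is what the test compares against d[1] on the cumulative map. A standalone numeric restatement, independent of the test harness:

import numpy as np

p = 1.0 / np.arange(1, 11)   # unnormalized Zipf probabilities with alpha = 1
p /= p.sum()                 # normalizer H_10 ~ 2.929, so p[0] ~ 0.3414
print([round(p[k] * (k + 1), 4) for k in range(10)])  # every entry ~ 0.3414 = p[0]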
2 changes: 2 additions & 0 deletions tensor2tensor/data_generators/problem_hparams.py
@@ -682,6 +682,8 @@ def image_mscoco_tokens(model_hparams, vocab_count):
"algorithmic_multiplication_decimal40": lambda p: algorithmic(12, p),
"algorithmic_reverse_binary40": lambda p: algorithmic(4, p),
"algorithmic_reverse_decimal40": lambda p: algorithmic(12, p),
"algorithmic_reverse_nlplike_decimal8K": lambda p: algorithmic(8002, p),
"algorithmic_reverse_nlplike_decimal32K": lambda p: algorithmic(32002, p),
"algorithmic_shift_decimal40": lambda p: algorithmic(22, p),
"audio_timit_characters_tune": audio_timit_characters,
"audio_timit_characters_test": audio_timit_characters,
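The vocabulary sizes registered here account for the two reserved ids: the generator shifts its Zipf samples to start at 2 so that PAD (0) and EOS (1) are never emitted, which means an nbr_symbols-symbol problem needs nbr_symbols + 2 vocabulary entries. In short (names below are only for illustration):

nbr_symbols = 8000
vocab_size = nbr_symbols + 2   # ids 0 and 1 plus symbols 2..8001 -> 8002, as registered above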
