Skip to content

Commit

Permalink
Support position-dependent weighting with fastText CBOW and negatives
Browse files Browse the repository at this point in the history
  • Loading branch information
Witiko committed Jul 29, 2020
1 parent c0e0169 commit dc56ae2
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 21 deletions.
42 changes: 39 additions & 3 deletions gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
max_final_vocab=None):
max_final_vocab=None, position_dependent_weights=0):
"""Train, use and evaluate word representations learned using the method
described in `Enriching Word Vectors with Subword Information <https://arxiv.org/abs/1607.04606>`_,
aka FastText.
Expand Down Expand Up @@ -421,6 +421,14 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
``min_count``. If the specified ``min_count`` is more than the
automatically calculated ``min_count``, the former will be used.
Set to ``None`` if not required.
position_dependent_weights : {1,0}, optional
If 1, position vectors are computed alongside the word and n-gram vectors, and are used to weight the
context words during training. If 0, all context words are weighted uniformly.
Notes
-----
Positional vectors are only implemented for CBOW with negative sampling, not SG or hierarchical softmax.
Locking positional vectors is not supported. BLAS primitives are not used by the implementation.
Examples
--------
Expand Down Expand Up @@ -451,6 +459,10 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
self.callbacks = callbacks
if word_ngrams != 1:
raise NotImplementedError("Gensim's FastText implementation does not yet support word_ngrams != 1.")
if position_dependent_weights and (sg or hs):
raise NotImplementedError("Gensim's FastText implementation does not yet support position-dependent "
"weighting with SG or hierarchical softmax")
self.position_dependent_weights = position_dependent_weights
self.word_ngrams = word_ngrams
if max_n < min_n:
# with no eligible char-ngram lengths, no buckets need be allocated
Expand All @@ -468,7 +480,8 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha)

def prepare_weights(self, update=False):
"""In addition to superclass allocations, compute ngrams of all words present in vocabulary.
"""In addition to superclass allocations, compute ngrams of all words present in vocabulary
and initialize positional vectors.
Parameters
----------
Expand All @@ -479,6 +492,8 @@ def prepare_weights(self, update=False):
super(FastText, self).prepare_weights(update=update)
if not update:
self.wv.init_ngrams_weights(self.seed)
if self.position_dependent_weights:
self.wv.init_positional_weights(self.seed, self.window)
# EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
# advanced users should directly resize/adjust as necessary
self.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
Expand Down Expand Up @@ -570,6 +585,8 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog
"""
if not update:
self.wv.init_ngrams_weights(self.seed)
if self.position_dependent_weights:
self.wv.init_positional_weights(self.seed, self.window)
elif not len(self.wv):
raise RuntimeError(
"You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
Expand Down Expand Up @@ -1190,6 +1207,7 @@ def __init__(self, vector_size, min_n, max_n, bucket):
self.vectors_vocab = None # fka syn0_vocab
self.vectors_ngrams = None # fka syn0_ngrams
self.buckets_word = None
self.vectors_positions = None
self.min_n = min_n
self.max_n = max_n
self.bucket = bucket # count of buckets, fka num_ngram_vectors
Expand Down Expand Up @@ -1329,7 +1347,6 @@ def init_ngrams_weights(self, seed):
vocab_shape = (len(self), self.vector_size)
ngrams_shape = (self.bucket, self.vector_size)
self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)

#
# We could have initialized vectors_ngrams at construction time, but we
# do it here for two reasons:
Expand All @@ -1341,6 +1358,25 @@ def init_ngrams_weights(self, seed):
#
self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL)

def init_positional_weights(self, seed, window):
    """Allocate and randomly initialize the positional vectors before training.

    Stores a matrix with ``2 * window`` rows and ``vector_size`` columns in
    ``self.vectors_positions``. The values are drawn uniformly between
    ``-1 / vector_size`` and ``1 / vector_size`` and cast to ``REAL``.

    Parameters
    ----------
    seed : int
        The seed for the PRNG.
    window : int
        The size of the window used during the training.

    """
    # NOTE(review): callers pass the same seed they give to init_ngrams_weights;
    # if that method seeds its generator identically, these values repeat the
    # start of its random stream -- confirm this is intended.
    bound = 1.0 / self.vector_size
    generator = np.random.default_rng(seed=seed)  # numpy's recommended generator/algorithm
    self.vectors_positions = generator.uniform(
        -bound, bound, (2 * window, self.vector_size),
    ).astype(REAL)

def update_ngrams_weights(self, seed, old_vocab_len):
"""Update the vocabulary weights for training continuation.
Expand Down
11 changes: 6 additions & 5 deletions gensim/models/fasttext_inner.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,18 @@ cdef struct FastTextConfig:
#
# Model parameters. These get copied as-is from the Python model.
#
int sg, hs, negative, sample, size, window, cbow_mean, workers
int sg, hs, pdw, negative, sample, size, window, cbow_mean, workers
REAL_t alpha

#
# The syn0_vocab and syn0_ngrams arrays store vectors for vocabulary terms
# and ngrams, respectively, as 1D arrays in scanline order. For example,
# syn0_vocab[i * size : (i + 1) * size] contains the elements for the ith
# vocab term.
# The syn0_vocab, syn0_ngrams, and syn0_positions arrays store vectors for
# vocabulary terms, ngrams, and positions, respectively, as 1D arrays in
# scanline order. For example, syn0_vocab[i * size : (i + 1) * size]
# contains the elements for the ith vocab term.
#
REAL_t *syn0_vocab
REAL_t *syn0_ngrams
REAL_t *syn0_positions

#
# EXPERIMENTAL
Expand Down
52 changes: 39 additions & 13 deletions gensim/models/fasttext_inner.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -242,22 +242,32 @@ cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k

cdef long long row2
cdef unsigned long long modulo = 281474976710655ULL
cdef REAL_t f, g, count, inv_count = 1.0, label, f_dot
cdef REAL_t f, g, count, inv_count = 1.0, label, f_dot, positional_feature
cdef np.uint32_t target_index, word_index
cdef int d, m
cdef int d, m, n, o

word_index = c.indexes[i]

memset(c.neu1, 0, c.size * cython.sizeof(REAL_t))
count = <REAL_t>0.0
n = j - i + c.window
for m in range(j, k):
if m == i:
continue
count += ONEF
our_saxpy(&c.size, &ONEF, &c.syn0_vocab[c.indexes[m] * c.size], &ONE, c.neu1, &ONE)
for d in range(c.subwords_idx_len[m]):
count += ONEF
our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[c.subwords_idx[m][d] * c.size], &ONE, c.neu1, &ONE)
if c.pdw:
for d in range(c.size): # TODO make into a Hadamard product using a BLAS primitive: SSBMV, followed by SAXPY
c.neu1[d] += c.syn0_vocab[c.indexes[m] * c.size + d] * c.syn0_positions[n * c.size + d]
for o in range(c.subwords_idx_len[m]):
count += ONEF
for d in range(c.size): # TODO make into a Hadamard product using a BLAS primitive: SSBMV, followed by SAXPY
c.neu1[d] += c.syn0_ngrams[c.subwords_idx[m][o] * c.size + d] * c.syn0_positions[n * c.size + d]
else:
our_saxpy(&c.size, &ONEF, &c.syn0_vocab[c.indexes[m] * c.size], &ONE, c.neu1, &ONE)
for o in range(c.subwords_idx_len[m]):
count += ONEF
our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[c.subwords_idx[m][o] * c.size], &ONE, c.neu1, &ONE)
n += 1

if count > (<REAL_t>0.5):
inv_count = ONEF / count
Expand Down Expand Up @@ -293,16 +303,29 @@ cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k
if not c.cbow_mean: # divide error over summed window vectors
sscal(&c.size, &inv_count, c.work, &ONE)

for m in range(j,k):
n = j - i + c.window
for m in range(j, k):
if m == i:
continue
our_saxpy(
&c.size, &c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len], c.work, &ONE,
&c.syn0_vocab[c.indexes[m]*c.size], &ONE)
for d in range(c.subwords_idx_len[m]):
if c.pdw:
for d in range(c.size): # TODO make into a Hadamard product using a BLAS primitive: SSBMV, followed by SAXPY
positional_feature = c.syn0_positions[n * c.size + d]
c.syn0_positions[n * c.size + d] += c.work[d] * c.syn0_vocab[c.indexes[m] * c.size + d]
c.syn0_vocab[c.indexes[m] * c.size + d] += c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len] * c.work[d] * positional_feature
for o in range(c.subwords_idx_len[m]):
for d in range(c.size): # TODO make into two Hadamard products using a BLAS primitive: SSBMV, followed by SAXPY
positional_feature = c.syn0_positions[n * c.size + d]
c.syn0_positions[n * c.size + d] += c.work[d] * c.syn0_ngrams[c.subwords_idx[m][o] * c.size + d]
c.syn0_ngrams[c.subwords_idx[m][o] * c.size + d] += c.ngrams_lockf[c.subwords_idx[m][o] % c.ngrams_lockf_len] * c.work[d] * positional_feature
else:
our_saxpy(
&c.size, &c.ngrams_lockf[c.subwords_idx[m][d] % c.ngrams_lockf_len], c.work, &ONE,
&c.syn0_ngrams[c.subwords_idx[m][d]*c.size], &ONE)
&c.size, &c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len], c.work, &ONE,
&c.syn0_vocab[c.indexes[m] * c.size], &ONE)
for o in range(c.subwords_idx_len[m]):
our_saxpy(
&c.size, &c.ngrams_lockf[c.subwords_idx[m][o] % c.ngrams_lockf_len], c.work, &ONE,
&c.syn0_ngrams[c.subwords_idx[m][o] * c.size], &ONE)
n += 1


cdef void fasttext_fast_sentence_cbow_hs(FastTextConfig *c, int i, int j, int k) nogil:
Expand Down Expand Up @@ -398,9 +421,12 @@ cdef void init_ft_config(FastTextConfig *c, model, alpha, _work, _neu1):
c.cbow_mean = model.cbow_mean
c.window = model.window
c.workers = model.workers
c.pdw = model.position_dependent_weights

c.syn0_vocab = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_vocab))
c.syn0_ngrams = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_ngrams))
if c.pdw:
c.syn0_positions = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_positions))

# EXPERIMENTAL lockf scaled suppression/enablement of training
c.vocab_lockf = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_vocab_lockf))
Expand Down

0 comments on commit dc56ae2

Please sign in to comment.