diff --git a/keras/src/layers/attention/additive_attention.py b/keras/src/layers/attention/additive_attention.py index 787dd50e71a9..b56185f85bc2 100644 --- a/keras/src/layers/attention/additive_attention.py +++ b/keras/src/layers/attention/additive_attention.py @@ -10,7 +10,7 @@ class AdditiveAttention(Attention): Inputs are a list with 2 or 3 elements: 1. A `query` tensor of shape `(batch_size, Tq, dim)`. 2. A `value` tensor of shape `(batch_size, Tv, dim)`. - 3. A optional `key` tensor of shape `(batch_size, Tv, dim)`. If none + 3. An optional `key` tensor of shape `(batch_size, Tv, dim)`. If none supplied, `value` will be used as `key`. The calculation follows the steps: @@ -33,8 +33,8 @@ class AdditiveAttention(Attention): - `query`: Query tensor of shape `(batch_size, Tq, dim)`. - `value`: Value tensor of shape `(batch_size, Tv, dim)`. - `key`: Optional key tensor of shape `(batch_size, Tv, dim)`. If - not given, will use `value` for both `key` and `value`, which is - the most common case. + not given, will use the `value` for both `key` and `value`, + which is the most common case. mask: List of the following tensors: - `query_mask`: A boolean mask tensor of shape `(batch_size, Tq)`. If given, the output will be zero at the positions where