A decoder is the neural network component that converts internal model representations into the desired output. In sequence models such as machine translation systems, chatbots, and text generators, the decoder produces the output sequence one element at a time, conditioning on both the encoded input and the tokens it has already generated.
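The loop below is a minimal sketch of that idea; model.decode_step is a hypothetical method (not any particular library's API) that returns next-token logits given the encoder output and the tokens generated so far.

import torch

def greedy_decode(model, encoder_output, start_id, end_id, max_length=50):
    # Begin with the start token and extend the sequence one token per step
    generated = [start_id]
    for _ in range(max_length):
        # The decoder conditions on the encoded input and everything generated so far
        logits = model.decode_step(encoder_output, torch.tensor(generated))
        next_id = int(logits[-1].argmax())
        generated.append(next_id)
        # Stop once the end-of-sequence token is produced
        if next_id == end_id:
            break
    return generated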
Core Architecture
Basic Decoder Structure
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleDecoder(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        # Core components
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_token, hidden_state):
        # Convert token ids to embeddings: (seq_len, batch) -> (seq_len, batch, hidden)
        embedded = self.embedding(input_token)
        # Process through the GRU
        output, hidden = self.rnn(embedded, hidden_state)
        # Project to a distribution over the vocabulary
        predictions = self.output(output)
        return predictions, hidden
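A single decoding step with this module might look like the following; the sizes and the token id are arbitrary illustration values, and the initial hidden state would normally come from an encoder.

decoder = SimpleDecoder(hidden_size=256, vocab_size=10000)
input_token = torch.tensor([[3]])               # (seq_len=1, batch=1) token id
hidden = torch.zeros(1, 1, 256)                 # initial hidden state, e.g. an encoder's final state
logits, hidden = decoder(input_token, hidden)   # logits: (1, 1, 10000)
next_token = logits.argmax(dim=-1)              # greedy choice for the next token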
Advanced Decoder Components
Input Processing:
class DecoderInput:
    def __init__(self, max_length=100):
        self.max_length = max_length

    def prepare_input(self, target_sequence):
        # Prepend the start-of-sequence token
        decoder_input = ['<START>'] + target_sequence
        # Add positional encoding (helper assumed to be defined elsewhere)
        positions = create_positional_encoding(len(decoder_input))
        # Create the causal attention mask (a sketch of this helper follows below)
        attention_mask = create_causal_mask(len(decoder_input))
        return {
            'input_ids': decoder_input,
            'positions': positions,
            'attention_mask': attention_mask
        }
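prepare_input calls create_causal_mask without defining it; below is a minimal sketch of one common convention, where True marks future positions that a token must not attend to.

import torch

def create_causal_mask(seq_len):
    # Upper triangle (above the diagonal) marks future positions as masked
    return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

# Position i may only attend to positions <= i
print(create_causal_mask(3))
# tensor([[False,  True,  True],
#         [False, False,  True],
#         [False, False, False]])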
Output Generation:
class DecoderOutput:
    def process_output(self, logits, temperature=1.0):
        # Apply temperature scaling
        scaled_logits = logits / temperature
        # Convert to probabilities
        probs = F.softmax(scaled_logits, dim=-1)
        # Sample from the distribution
        next_token = torch.multinomial(probs, 1)
        return next_token
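With some made-up logits, the effect of temperature is easy to see: values below 1 sharpen the distribution toward the greedy choice, while values above 1 flatten it and increase diversity.

processor = DecoderOutput()
logits = torch.tensor([2.0, 1.0, 0.1])
print(F.softmax(logits / 0.5, dim=-1))   # peaked: roughly [0.86, 0.12, 0.02]
print(F.softmax(logits / 2.0, dim=-1))   # flatter: roughly [0.50, 0.30, 0.19]
next_token = processor.process_output(logits, temperature=0.7)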
Types of Decoders
1. Autoregressive Decoder
Used in text generation, where each output depends on previous outputs:
class AutoregressiveDecoder(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.transformer_layer = nn.TransformerDecoderLayer(
            d_model=hidden_size,
            nhead=8
        )
        self.output_layer = nn.Linear(hidden_size, vocab_size)

    def generate(self, encoded_input, max_length=50):
        output_sequence = ['<START>']
        for i in range(max_length):
            # Predict the next token from the encoder output and everything generated so far
            next_token = self.predict_next(
                encoded_input,
                output_sequence
            )
            # Stop if the end token is generated
            if next_token == '<END>':
                break
            output_sequence.append(next_token)
        return output_sequence

    def predict_next(self, encoded_input, current_output):
        # Embed the tokens generated so far (token-to-embedding helper omitted for brevity)
        tgt = self.embed(current_output)
        # A single TransformerDecoderLayer call applies masked self-attention over the target
        # and cross-attention to the encoder output; a causal tgt_mask would be passed in practice
        decoded = self.transformer_layer(tgt, encoded_input)
        # Project the final position to next-token logits
        next_token_logits = self.output_layer(decoded[-1])
        return self.sample_token(next_token_logits)
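Assuming the embedding and sampling helpers referenced in predict_next are filled in, usage would look roughly like this (the shapes and sizes are illustrative):

decoder = AutoregressiveDecoder(hidden_size=512, vocab_size=32000)
encoded = torch.randn(20, 1, 512)    # encoder output: (source_len, batch, hidden_size)
tokens = decoder.generate(encoded, max_length=50)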
2. Non-autoregressive Decoder
Generates all outputs in parallel, useful for tasks like image segmentation:
class NonAutoregressiveDecoder(nn.Module):
    def __init__(self, input_channels, output_channels):
        super().__init__()
        self.layers = nn.Sequential(
            nn.ConvTranspose2d(input_channels, 512, 4, 2, 1),
            nn.ReLU(),
            nn.ConvTranspose2d(512, 256, 4, 2, 1),
            nn.ReLU(),
            nn.ConvTranspose2d(256, output_channels, 4, 2, 1)
        )

    def forward(self, encoded_input):
        # Generate the entire output in one pass
        return self.layers(encoded_input)
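Each ConvTranspose2d with kernel size 4, stride 2, and padding 1 doubles the spatial resolution, so an 8x8 feature map is upsampled to 64x64 in a single forward pass; the channel counts below are arbitrary examples.

decoder = NonAutoregressiveDecoder(input_channels=1024, output_channels=21)
features = torch.randn(1, 1024, 8, 8)    # encoder output: (batch, channels, height, width)
logits = decoder(features)               # all positions generated in parallel
print(logits.shape)                      # torch.Size([1, 21, 64, 64])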
Advanced Techniques
1. Beam Search Implementation
class BeamSearchDecoder:
    def __init__(self, model, beam_width=5):
        self.model = model
        self.beam_width = beam_width

    def decode(self, encoded_input):
        # Initialize beams with the start token: (sequence, cumulative log-probability)
        # START_TOKEN and the _completed helpers are assumed to be defined elsewhere
        beams = [([START_TOKEN], 0.0)]
        while not self._all_completed(beams):
            candidates = []
            for sequence, score in beams:
                # Completed beams are carried over unchanged
                if self._is_completed(sequence):
                    candidates.append((sequence, score))
                    continue
                # Get next-token predictions for this beam
                logits = self.model.predict_next(encoded_input, sequence)
                probs = F.softmax(logits, dim=-1)
                # Expand the beam with its top-k continuations
                top_k = torch.topk(probs, self.beam_width)
                for prob, token in zip(top_k.values, top_k.indices):
                    new_sequence = sequence + [token]
                    new_score = score + torch.log(prob).item()
                    candidates.append((new_sequence, new_score))
            # Keep only the beam_width highest-scoring candidates
            beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:self.beam_width]
        return beams[0][0]  # Best (highest log-probability) sequence
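One detail the class above omits: cumulative log-probabilities are negative and grow more negative with each token, so raw beam scores favor short sequences. A common remedy is to rescore beams with a length penalty before picking the winner, e.g. the GNMT-style normalization sketched here:

def length_normalized_score(score, sequence, alpha=0.7):
    # Divide the cumulative log-probability by a length penalty
    penalty = ((5 + len(sequence)) / 6) ** alpha
    return score / penalty

best_sequence = max(beams, key=lambda b: length_normalized_score(b[1], b[0]))[0]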
2. Copy Mechanism
Allows the decoder to copy tokens directly from the input:
class CopyDecoder(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.attention = BahdanauAttention(hidden_size)
        self.copy_gate = nn.Linear(hidden_size * 2, 1)

    def forward(self, decoder_state, encoder_states, encoder_tokens):
        # Regular vocabulary distribution (projection helper omitted here)
        vocab_dist = self.get_vocab_distribution(decoder_state)
        # Attention over encoder states
        attention_weights = self.attention(decoder_state, encoder_states)
        # Context vector: attention-weighted sum of encoder states
        context = attention_weights @ encoder_states
        # Copy distribution: scatter attention mass onto the source token ids
        copy_dist = torch.zeros(self.vocab_size)
        copy_dist.scatter_add_(0, encoder_tokens, attention_weights)
        # Gate decides how much to generate from the vocabulary vs. copy from the source
        gate = torch.sigmoid(self.copy_gate(
            torch.cat([decoder_state, context], dim=-1)
        ))
        final_distribution = gate * vocab_dist + (1 - gate) * copy_dist
        return final_distribution
Optimization Techniques
1. Teacher Forcing
import random

def train_with_teacher_forcing(model, encoder_hidden, target_seq, teacher_forcing_ratio=0.5):
    # Start from the encoder's final hidden state and the first target token (e.g. <START>)
    hidden = encoder_hidden
    decoder_input = target_seq[0]
    outputs = []
    for t in range(1, len(target_seq)):
        # Generate a prediction for the next token
        output, hidden = model(decoder_input, hidden)
        outputs.append(output)
        # Decide whether to feed the ground-truth token or the model's own prediction
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        decoder_input = target_seq[t] if use_teacher_forcing else output.argmax(dim=-1)
    return outputs
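Tying this back to the SimpleDecoder defined earlier, a training step might look roughly like the following; the token ids and the zero encoder state are placeholders for a real encoder and dataset.

decoder = SimpleDecoder(hidden_size=256, vocab_size=10000)
encoder_hidden = torch.zeros(1, 1, 256)                     # stand-in for an encoder's final state
target_seq = torch.tensor([1, 42, 17, 2]).view(-1, 1, 1)    # (target_len, batch=1, 1) token ids
outputs = train_with_teacher_forcing(decoder, encoder_hidden, target_seq)
loss = sum(F.cross_entropy(out.view(1, -1), tgt.view(1))
           for out, tgt in zip(outputs, target_seq[1:]))
loss.backward()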
2. Scheduled Sampling
import math
import random

class ScheduledSamplingTrainer:
    def __init__(self, model, schedule='linear', total_epochs=100):
        self.model = model
        self.schedule = schedule
        self.total_epochs = total_epochs

    def get_sampling_probability(self, epoch):
        # Probability of feeding the ground-truth token; decays as training progresses
        if self.schedule == 'linear':
            return max(0.0, 1.0 - epoch / self.total_epochs)
        elif self.schedule == 'exponential':
            return math.exp(-epoch / self.total_epochs)
        raise ValueError(f"Unknown schedule: {self.schedule}")

    def train_step(self, input_seq, target_seq, epoch):
        sampling_prob = self.get_sampling_probability(epoch)
        decoder_input = target_seq[0]
        outputs = []
        for t in range(1, len(target_seq)):
            output = self.model(decoder_input)
            outputs.append(output)
            # Probabilistically choose between the true target and the predicted token
            use_target = random.random() < sampling_prob
            decoder_input = target_seq[t] if use_target else output.argmax()
        return outputs
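As a quick check of the schedules (with a hypothetical ten-epoch run; no model is needed, since only the schedule is being evaluated):

trainer = ScheduledSamplingTrainer(model=None, schedule='linear', total_epochs=10)
print([round(trainer.get_sampling_probability(epoch), 2) for epoch in range(10)])
# [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]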
Performance Metrics
import evaluate

class DecoderEvaluator:
    def __init__(self):
        self.metrics = {
            'bleu': evaluate.load('bleu'),
            'rouge': evaluate.load('rouge'),
            'meteor': evaluate.load('meteor')
        }

    def compute_metrics(self, predictions, references):
        results = {}
        # Calculate BLEU score
        results['bleu'] = self.metrics['bleu'].compute(
            predictions=predictions,
            references=references
        )
        # Calculate ROUGE scores
        results['rouge'] = self.metrics['rouge'].compute(
            predictions=predictions,
            references=references
        )
        # Calculate METEOR score
        results['meteor'] = self.metrics['meteor'].compute(
            predictions=predictions,
            references=references
        )
        return results
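A quick usage example with made-up strings (the metrics are loaded from the Hugging Face evaluate library, which downloads them on first use):

evaluator = DecoderEvaluator()
predictions = ["the cat sat on the mat"]
references = [["the cat is sitting on the mat"]]
scores = evaluator.compute_metrics(predictions, references)
print(scores['bleu']['bleu'], scores['rouge']['rougeL'], scores['meteor']['meteor'])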