LayerNorm
import torch
from torch import nn


class LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)
        output = self.gamma * x_normalized + self.beta
        return output


def test_layer_norm():
    batch_size = 2
    seq_len = 4
    hidden_size = 8
    x = torch.randn(batch_size, seq_len, hidden_size)
    layer_norm = LayerNorm(hidden_size)
    output = layer_norm(x)
    print("Input shape:", x.shape)
    print("Output shape:", output.shape)


if __name__ == "__main__":
    test_layer_norm()
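A quick sanity check (a sketch, assuming the LayerNorm class above is in scope): with the default gamma=1 and beta=0 the output should match PyTorch's built-in nn.LayerNorm, which also uses the biased variance.

def check_layer_norm():
    x = torch.randn(2, 4, 8)
    custom = LayerNorm(8)
    reference = nn.LayerNorm(8, eps=1e-6)
    # default affine parameters (weight=1, bias=0) match gamma/beta above
    print("Matches nn.LayerNorm:", torch.allclose(custom(x), reference(x), atol=1e-5))


check_layer_norm()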
BatchNorm
import torch
from torch import nn


class BatchNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5, momentum=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.momentum = momentum
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))
        # running statistics are state, not parameters, so register them as buffers
        self.register_buffer("running_mean", torch.zeros(hidden_size))
        self.register_buffer("running_var", torch.ones(hidden_size))

    def forward(self, x):
        # x: (batch_size, seq_len, hidden_size); statistics are per hidden feature
        if self.training:
            batch_mean = x.mean(dim=(0, 1), keepdim=False)
            batch_var = x.var(dim=(0, 1), keepdim=False, unbiased=False)
            # update running statistics without tracking gradients
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
            mean = batch_mean
            variance = batch_var
        else:
            mean = self.running_mean
            variance = self.running_var
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)
        output = self.gamma * x_normalized + self.beta
        return output


def test_batch_norm():
    batch_size = 2
    seq_len = 4
    hidden_size = 8
    x = torch.randn(batch_size, seq_len, hidden_size)
    batch_norm = BatchNorm(hidden_size)
    output = batch_norm(x)
    print("Input shape:", x.shape)
    print("Output shape:", output.shape)


if __name__ == "__main__":
    test_batch_norm()
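As a hedged cross-check (a sketch, assuming the BatchNorm class above is in scope): in training mode the output should match nn.BatchNorm1d applied over the feature dimension, since both normalize each hidden feature over all batch and sequence positions.

def check_batch_norm():
    x = torch.randn(2, 4, 8)
    custom = BatchNorm(8)
    reference = nn.BatchNorm1d(8, eps=1e-5, momentum=0.1)
    custom.train()
    reference.train()
    out_custom = custom(x)
    # nn.BatchNorm1d expects (batch, channels, length), so move hidden_size to dim 1
    out_reference = reference(x.transpose(1, 2)).transpose(1, 2)
    print("Matches nn.BatchNorm1d:", torch.allclose(out_custom, out_reference, atol=1e-4))


check_batch_norm()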
Dropout
import torch
from torch import nn


class Dropout(nn.Module):
    def __init__(self, dropout_prob=0.1):
        super().__init__()
        self.dropout_prob = dropout_prob

    def forward(self, x):
        if self.training:
            # inverted dropout: zero out elements with probability dropout_prob and
            # rescale the survivors so the expected activation is unchanged;
            # rand_like keeps the mask on the same device and dtype as x
            mask = (torch.rand_like(x) > self.dropout_prob).float()
            output = mask * x / (1.0 - self.dropout_prob)
        else:
            output = x
        return output


def test_dropout():
    batch_size = 2
    seq_len = 4
    hidden_size = 8
    x = torch.randn(batch_size, seq_len, hidden_size)
    dropout = Dropout(dropout_prob=0.1)
    dropout.train()
    output_train = dropout(x)
    dropout.eval()
    output_eval = dropout(x)
    print("Input shape:", x.shape)
    print("Output shape during training:", output_train.shape)
    print("Output shape during evaluation:", output_eval.shape)


if __name__ == "__main__":
    test_dropout()
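A quick statistical check of the inverted-dropout scaling (a sketch, assuming the Dropout class above is in scope): roughly dropout_prob of the entries should be zeroed, and the mean should stay close to the input mean.

def check_dropout_stats():
    x = torch.ones(1000, 1000)
    dropout = Dropout(dropout_prob=0.5)
    dropout.train()
    y = dropout(x)
    print("Zero fraction:", (y == 0).float().mean().item())   # ~0.5
    print("Mean after rescaling:", y.mean().item())           # ~1.0


check_dropout_stats()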
Transformer Positional Encoding
import torch


def sinusoidal_position_embedding(batch_size, nums_head, max_len, output_dim, device):
    # positions: (max_len, 1); frequencies theta: (output_dim // 2,)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(-1)
    ids = torch.arange(0, output_dim // 2, dtype=torch.float)
    theta = torch.pow(10000, -2 * ids / output_dim)
    embeddings = position * theta
    # interleave sin and cos along the last dimension: (max_len, output_dim // 2, 2)
    embeddings = torch.stack([torch.sin(embeddings), torch.cos(embeddings)], dim=-1)
    embeddings = embeddings.repeat((batch_size, nums_head, *([1] * len(embeddings.shape))))
    embeddings = torch.reshape(embeddings, (batch_size, nums_head, max_len, output_dim))
    embeddings = embeddings.to(device)
    return embeddings
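A quick shape check (a sketch; the sizes are illustrative):

pe = sinusoidal_position_embedding(batch_size=2, nums_head=8, max_len=10, output_dim=64, device="cpu")
print("Position embedding shape:", pe.shape)  # torch.Size([2, 8, 10, 64])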
RoPE
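A minimal sketch of applying rotary position embeddings (RoPE) to query/key tensors, reusing the sinusoidal_position_embedding helper above. The function name rotary_position_embedding and the even/odd pairing of feature dimensions are assumptions of this sketch, not a fixed API.

import torch


def rotary_position_embedding(q, k):
    # q, k: (batch_size, nums_head, max_len, output_dim)
    batch_size, nums_head, max_len, output_dim = q.shape
    pos_emb = sinusoidal_position_embedding(batch_size, nums_head, max_len, output_dim, q.device)
    # in pos_emb the even feature indices hold sin and the odd indices hold cos;
    # repeat each value so it lines up with the (even, odd) feature pairs of q and k
    cos_pos = pos_emb[..., 1::2].repeat_interleave(2, dim=-1)
    sin_pos = pos_emb[..., ::2].repeat_interleave(2, dim=-1)
    # build the rotated counterparts (-q1, q0, -q3, q2, ...) and apply the rotation
    q2 = torch.stack([-q[..., 1::2], q[..., ::2]], dim=-1).reshape(q.shape)
    k2 = torch.stack([-k[..., 1::2], k[..., ::2]], dim=-1).reshape(k.shape)
    q = q * cos_pos + q2 * sin_pos
    k = k * cos_pos + k2 * sin_pos
    return q, k


q = torch.randn(2, 8, 10, 64)
k = torch.randn(2, 8, 10, 64)
q_rot, k_rot = rotary_position_embedding(q, k)
print("Rotated q shape:", q_rot.shape)
print("Rotated k shape:", k_rot.shape)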
Self-attention
from math import sqrt
import torch
import torch.nn as nn


class Self_Attention(nn.Module):
    def __init__(self, input_dim, dim_k, dim_v):
        super(Self_Attention, self).__init__()
        self.q = nn.Linear(input_dim, dim_k)
        self.k = nn.Linear(input_dim, dim_k)
        self.v = nn.Linear(input_dim, dim_v)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        Q = self.q(x)
        K = self.k(x)
        V = self.v(x)
        atten = torch.bmm(Q, K.permute(0, 2, 1)) * self._norm_fact
        atten = torch.softmax(atten, dim=-1)
        output = torch.bmm(atten, V)
        return output
input_dim = 64
dim_k = 32
dim_v = 32
self_attention = Self_Attention(input_dim, dim_k, dim_v)
batch_size = 2
seq_len = 10
x = torch.randn(batch_size, seq_len, input_dim)
output = self_attention(x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)
Scaled Dot-Product Attention
import torch
from torch import nn


class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, query, key, value, attention_mask=None):
        d_k = query.size(-1)
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
        if attention_mask is not None:
            # additive mask convention: positions marked 1 receive a large negative bias
            attention_scores += attention_mask * -1e9
        attention_probs = torch.softmax(attention_scores, dim=-1)
        attention_output = torch.matmul(attention_probs, value)
        return attention_output


def test_attn():
    batch_size = 128
    seq_len = 512
    hidden_size = 1024
    query = torch.randn(batch_size, seq_len, hidden_size)
    key = torch.randn(batch_size, seq_len, hidden_size)
    value = torch.randn(batch_size, seq_len, hidden_size)
    sdpa = ScaledDotProductAttention()
    output = sdpa(query, key, value)
    print("Query shape:", query.shape)
    print("Key shape:", key.shape)
    print("Value shape:", value.shape)
    print("Output shape:", output.shape)


if __name__ == "__main__":
    test_attn()
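A quick cross-check against PyTorch's built-in kernel (a sketch, assuming PyTorch 2.x, where torch.nn.functional.scaled_dot_product_attention is available, and the class above in scope):

import torch.nn.functional as F


def check_sdpa():
    q = torch.randn(2, 8, 16)
    k = torch.randn(2, 8, 16)
    v = torch.randn(2, 8, 16)
    ours = ScaledDotProductAttention()(q, k, v)
    ref = F.scaled_dot_product_attention(q, k, v)
    print("Matches built-in:", torch.allclose(ours, ref, atol=1e-5))


check_sdpa()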
MHA
import torch
from torch import nn


class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        self.o_linear = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_state, attention_mask=None):
        batch_size = hidden_state.size(0)
        query = self.q_linear(hidden_state)
        key = self.k_linear(hidden_state)
        value = self.v_linear(hidden_state)
        # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
        query = self.split_head(query)
        key = self.split_head(key)
        value = self.split_head(value)
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        if attention_mask is not None:
            # additive mask convention: positions marked 1 receive a large negative bias
            attention_scores += attention_mask * -1e9
        attention_probs = torch.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_probs, value)
        # (batch, heads, seq, head_dim) -> (batch, seq, hidden)
        output = output.transpose(1, 2).reshape(batch_size, -1, self.head_dim * self.num_heads)
        output = self.o_linear(output)
        return output

    def split_head(self, x):
        batch_size = x.size(0)
        return x.reshape(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)


def test_MHA():
    batch_size = 128
    seq_len = 512
    hidden_size = 1024
    num_heads = 8
    hidden_state = torch.randn(batch_size, seq_len, hidden_size)
    mha = MultiHeadAttention(hidden_size, num_heads)
    output = mha(hidden_state)
    print("Input shape:", hidden_state.shape)
    print("Output shape:", output.shape)


if __name__ == "__main__":
    test_MHA()
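The attention_mask argument uses an additive convention in which a value of 1 marks a position to suppress. As an illustration (a sketch; the causal-mask construction below is an assumption, not part of the original test), a decoder-style causal mask can be passed like this:

def test_MHA_causal():
    batch_size = 2
    seq_len = 6
    hidden_size = 1024
    num_heads = 8
    hidden_state = torch.randn(batch_size, seq_len, hidden_size)
    mha = MultiHeadAttention(hidden_size, num_heads)
    # upper-triangular ones mark future positions; they receive a large negative bias
    causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
    output = mha(hidden_state, attention_mask=causal_mask)
    print("Causal output shape:", output.shape)


test_MHA_causal()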
Softmax
import torch


def softmax(x):
    # subtract the row-wise max for numerical stability before exponentiating
    x_max = torch.max(x, dim=-1, keepdim=True).values
    exp_x = torch.exp(x - x_max)
    sum_exp_x = torch.sum(exp_x, dim=-1, keepdim=True)
    softmax_x = exp_x / sum_exp_x
    return softmax_x
x = torch.tensor([1.0, 2.0, 3.0])
softmax_x = softmax(x)
print(softmax_x)
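A quick cross-check against the built-in (assuming torch.softmax as reference):

print("Matches torch.softmax:", torch.allclose(softmax_x, torch.softmax(x, dim=-1)))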
MSE
import torch


def mse_loss(y_true, y_pred):
    squared_diff = (y_true - y_pred) ** 2
    return torch.mean(squared_diff)
y_true = torch.tensor([3.0, -0.5, 2.0, 7.0])
y_pred = torch.tensor([2.5, 0.0, 2.0, 8.0])
loss = mse_loss(y_true, y_pred)
print(f"Mean Squared Error: {loss.item()}")
Cross entropy
import torch


def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-12
    y_pred = torch.clamp(y_pred, epsilon, 1. - epsilon)
    ce_loss = -torch.sum(y_true * torch.log(y_pred), dim=-1)
    return torch.mean(ce_loss)
y_true = torch.tensor([[1, 0, 0], [0, 1, 0]], dtype=torch.float32)
y_pred = torch.tensor([[0.8, 0.1, 0.1], [0.2, 0.7, 0.1]], dtype=torch.float32)
loss = cross_entropy_loss(y_true, y_pred)
print(f"Cross-Entropy Loss: {loss.item()}")