Generate the neural-network training details and the training-process plot contained in the run results of the following code:

# 1. Install dependencies
!pip install jieba nltk
!python -m spacy download en_core_web_sm
# 2. Import libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import jieba
import spacy
from collections import Counter
import matplotlib.pyplot as plt
import time
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Load the spaCy English tokenizer
nlp_en = spacy.load("en_core_web_sm")
# 3. Data preprocessing class and functions
class TranslationDataset(Dataset):
def __init__(self, src_tokens, tgt_tokens, src_vocab, tgt_vocab, max_len=50):
self.src_tokens = src_tokens
self.tgt_tokens = tgt_tokens
self.src_vocab = src_vocab
self.tgt_vocab = tgt_vocab
self.max_len = max_len
self.sos_idx = tgt_vocab["<SOS>"]
self.eos_idx = tgt_vocab["<EOS>"]
self.pad_idx = tgt_vocab["<PAD>"]
def __len__(self):
return len(self.src_tokens)
def __getitem__(self, idx):
        # Build the source-language input
src_ids = [self.src_vocab.get(t, self.src_vocab["<UNK>"]) for t in self.src_tokens[idx]]
src_ids = [self.src_vocab["<SOS>"]] + src_ids + [self.src_vocab["<EOS>"]]
src_ids = src_ids[:self.max_len]
src_ids += [self.src_vocab["<PAD>"]] * (self.max_len - len(src_ids))
        # Build the target-language decoder input and target
tgt_ids_input = [self.sos_idx] + [self.tgt_vocab.get(t, self.tgt_vocab["<UNK>"]) for t in self.tgt_tokens[idx]]
tgt_ids_target = [self.tgt_vocab.get(t, self.tgt_vocab["<UNK>"]) for t in self.tgt_tokens[idx]] + [self.eos_idx]
        # Truncate and pad
tgt_ids_input = tgt_ids_input[:self.max_len]
tgt_ids_input += [self.pad_idx] * (self.max_len - len(tgt_ids_input))
tgt_ids_target = tgt_ids_target[:self.max_len]
tgt_ids_target += [self.pad_idx] * (self.max_len - len(tgt_ids_target))
return (torch.tensor(src_ids, dtype=torch.long),
torch.tensor(tgt_ids_input, dtype=torch.long),
torch.tensor(tgt_ids_target, dtype=torch.long))
def load_data(file_path):
"""加载中英平行语料(强制转换为字符串并去空格,处理空值)"""
cn_sents, en_sents = [], []
error_lines = []
with open(file_path, 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
            line = str(line).strip()  # coerce to string and strip whitespace
if not line:
continue
try:
parts = line.split('\t')
if len(parts) < 2:
raise ValueError(f"至少需要2部分,实际{len(parts)}: {line}")
en, cn = parts[:2]
                # coerce to string, strip whitespace, and handle empty values
en_str = str(en).strip() or "<empty>"
cn_str = str(cn).strip() or "<empty>"
en_sents.append(en_str)
cn_sents.append(cn_str)
except Exception as e:
error_lines.append(f"行 {i+1}: {str(e)}")
if error_lines:
print(f"警告: 发现 {len(error_lines)} 个格式错误(前10条):")
for error in error_lines[:10]:
print(error)
return cn_sents, en_sents
def tokenize_cn(text):
"""中文分词(返回分词列表)"""
return list(jieba.cut(text))
def tokenize_en(text):
"""英文分词(返回分词列表)"""
return [token.text for token in nlp_en(text)]
def build_vocab(tokens_list, min_freq=5):
"""构建词表"""
counter = Counter()
for tokens in tokens_list:
counter.update(tokens)
vocab = [t for t, cnt in counter.items() if cnt >= min_freq]
special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]
vocab = special_tokens + vocab
return {t: i for i, t in enumerate(vocab)}
# 4. Transformer model components
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super().__init__()
self.dropout = nn.Dropout(dropout)
position = torch.arange(max_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
pe = torch.zeros(1, max_len, d_model)
pe[0, :, 0::2] = torch.sin(position * div_term)
pe[0, :, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:, :x.size(1)]
return self.dropout(x)
class TransformerEncoder(nn.Module):
def __init__(self, vocab_size, embed_dim=256, num_heads=8,
num_layers=3, dim_feedforward=1024, dropout=0.1):
super().__init__()
self.embed_dim = embed_dim
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.pos_encoder = PositionalEncoding(embed_dim, dropout)
encoder_layer = nn.TransformerEncoderLayer(
d_model=embed_dim, nhead=num_heads,
dim_feedforward=dim_feedforward, dropout=dropout
)
self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
def forward(self, src):
src = src.transpose(0, 1)
src_emb = self.embedding(src) * np.sqrt(self.embed_dim)
src_emb = self.pos_encoder(src_emb)
return self.transformer_encoder(src_emb)
class TransformerDecoder(nn.Module):
def __init__(self, vocab_size, embed_dim=256, num_heads=8,
num_layers=3, dim_feedforward=1024, dropout=0.1):
super().__init__()
self.embed_dim = embed_dim
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.pos_encoder = PositionalEncoding(embed_dim, dropout)
decoder_layer = nn.TransformerDecoderLayer(
d_model=embed_dim, nhead=num_heads,
dim_feedforward=dim_feedforward, dropout=dropout
)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
self.fc_out = nn.Linear(embed_dim, vocab_size)
def forward(self, tgt, memory, tgt_mask=None, memory_key_padding_mask=None):
tgt = tgt.transpose(0, 1)
tgt_emb = self.embedding(tgt) * np.sqrt(self.embed_dim)
tgt_emb = self.pos_encoder(tgt_emb)
output = self.transformer_decoder(
tgt_emb, memory,
tgt_mask=tgt_mask,
memory_key_padding_mask=memory_key_padding_mask
)
return self.fc_out(output).transpose(0, 1)
class Seq2Seq(nn.Module):
def __init__(self, cn_vocab_size, en_vocab_size, **kwargs):
super().__init__()
self.encoder = TransformerEncoder(cn_vocab_size, **kwargs)
self.decoder = TransformerDecoder(en_vocab_size, **kwargs)
self.src_pad_idx = 0
self.tgt_pad_idx = 0
def generate_tgt_mask(self, seq_len):
mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
return mask.to(next(self.parameters()).device)
def forward(self, src, tgt):
        # Encoding stage
memory = self.encoder(src)
        # Build the source padding mask
src_padding_mask = (src == self.src_pad_idx)
if src_padding_mask.dim() == 2:
src_padding_mask = src_padding_mask.to(next(self.parameters()).device)
        # Decoding stage
tgt_seq_len = tgt.size(1)
tgt_mask = self.generate_tgt_mask(tgt_seq_len)
output = self.decoder(
tgt, memory,
tgt_mask=tgt_mask,
memory_key_padding_mask=src_padding_mask
)
return output
# 5. Training and evaluation functions
def train_epoch(model, dataloader, criterion, optimizer, device):
model.train()
total_loss = 0
for src_ids, tgt_ids_input, tgt_ids_target in dataloader:
src_ids = src_ids.to(device)
tgt_ids_input = tgt_ids_input.to(device)
tgt_ids_target = tgt_ids_target.to(device)
optimizer.zero_grad()
output = model(src_ids, tgt_ids_input)
loss = criterion(
output.reshape(-1, output.size(-1)),
tgt_ids_target.reshape(-1)
)
loss.backward()
optimizer.step()
total_loss += loss.item()
return total_loss / len(dataloader)
def _clean_text(text):
"""清洗文本:去除不可见字符,统一空格,转为小写"""
text = str(text).strip()
# 移除特殊不可见字符
text = ''.join(c for c in text if c.isprintable())
text = ' '.join(text.split()) # 合并多个空格为一个
return text
def evaluate_bleu(model, test_src_sents, test_tgt_sents, src_vocab, tgt_vocab, tokenize_src, tokenize_tgt, max_len=50, device='cpu'):
"""评估模型的BLEU分数(使用nltk的BLEU实现)"""
model.eval()
bleu_scores = []
error_count = 0
    smoothing = SmoothingFunction().method1  # smoothing method 1
with torch.no_grad():
for i, (src_text, ref_text) in enumerate(zip(test_src_sents, test_tgt_sents)):
            # generate a translation and join it into a string
translation_tokens = translate(model, src_text, src_vocab, tgt_vocab, tokenize_src, max_len, device)
translation = ' '.join(translation_tokens) if translation_tokens else ""
            # strictly clean both texts
ref_text_clean = _clean_text(ref_text)
translation_clean = _clean_text(translation)
            # tokenize (nltk's BLEU expects lists of tokens)
ref_tokens = ref_text_clean.split()
trans_tokens = translation_clean.split()
            # debug output (printed every 100 samples)
            if i % 100 == 0:
                print(f"\nSample {i+1}:")
                print(f"  Source: {src_text}")
                print(f"  Reference: {ref_text_clean}")
                print(f"  Translation: {translation_clean}")
                print(f"  Reference tokens: {ref_tokens}")
                print(f"  Translation tokens: {trans_tokens}")
            # validate token-list types
            if not isinstance(ref_tokens, list) or not all(isinstance(t, str) for t in ref_tokens):
                print(f"Warning: reference tokens of sample {i+1} are malformed!")
                ref_tokens = []
            if not isinstance(trans_tokens, list) or not all(isinstance(t, str) for t in trans_tokens):
                print(f"Warning: translation tokens of sample {i+1} are malformed!")
                trans_tokens = []
            # skip empty translations (avoids a division-by-zero in BLEU)
            if not trans_tokens:
                print(f"Warning: sample {i+1} produced an empty translation")
bleu_scores.append(0.0)
continue
            # compute the BLEU score
            try:
                # nltk's sentence_bleu expects: references = [[tok1, tok2, ...]], hypothesis = [tok1, tok2, ...]
score = sentence_bleu([ref_tokens], trans_tokens, smoothing_function=smoothing)
bleu_scores.append(score)
except Exception as e:
print(f"样本 {i+1} 计算BLEU时出错: {str(e)}")
print(f" 参考类型: {type(ref_tokens)}, 长度: {len(ref_tokens)}")
print(f" 翻译类型: {type(trans_tokens)}, 长度: {len(trans_tokens)}")
if ref_tokens:
print(f" 参考前5词: {ref_tokens[:5]}")
if trans_tokens:
print(f" 翻译前5词: {trans_tokens[:5]}")
error_count += 1
bleu_scores.append(0.0)
print(f"\nBLEU评估完成,共处理 {len(bleu_scores)} 个样本,{error_count} 个样本计算出错")
return sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
def translate(model, text, src_vocab, tgt_vocab, tokenize_func, max_len=50, device='cpu'):
"""贪心解码生成翻译(返回分词列表)"""
model.eval()
tokens = tokenize_func(text)
ids = [src_vocab.get(t, src_vocab["<UNK>"]) for t in tokens]
ids = [src_vocab["<SOS>"]] + ids + [src_vocab["<EOS>"]]
ids = ids[:max_len]
ids += [src_vocab["<PAD>"]] * (max_len - len(ids))
tensor = torch.tensor([ids], dtype=torch.long).to(device)
    # initialize the target sequence with <SOS>
tgt_ids = [tgt_vocab["<SOS>"]]
with torch.no_grad():
for _ in range(max_len):
tgt_tensor = torch.tensor([tgt_ids], dtype=torch.long).to(device)
output = model(tensor, tgt_tensor)
next_idx = output.argmax(-1)[:, -1].item()
tgt_ids.append(next_idx)
if next_idx == tgt_vocab["<EOS>"]:
break
    # map generated ids back to tokens in generation order, skipping special tokens
    inv_vocab = {v: k for k, v in tgt_vocab.items()}
    special = {tgt_vocab["<SOS>"], tgt_vocab["<EOS>"], tgt_vocab["<PAD>"]}
    tokens = [inv_vocab[i] for i in tgt_ids if i not in special]
    return tokens
# 6. Main program
if __name__ == "__main__":
    # Hyperparameters
BATCH_SIZE = 64
MAX_LEN = 50
EPOCHS = 20
EMBED_DIM = 256
NUM_HEADS = 8
NUM_LAYERS = 3
DIM_FEEDFORWARD = 1024
DROPOUT = 0.1
LEARNING_RATE = 1e-3
MIN_FREQ = 5
    # Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
    # Data loading and preprocessing
    file_path = "/content/sample_data/cmn.txt"  # adjust to the actual dataset location
cn_sents, en_sents = load_data(file_path)
    # Verify that data loaded correctly
if not cn_sents or not en_sents:
print("错误: 未能加载有效数据,使用示例数据 fallback...")
# 示例数据(确保为字符串)
cn_sents = [
"你好", "我爱自然语言处理", "今天天气怎么样?",
"我正在学习Transformer模型", "神经网络可以处理各种复杂任务"
]
en_sents = [
"Hello", "I love natural language processing", "What's the weather like today?",
"I'm learning the Transformer model", "Neural networks can handle various complex tasks"
]
else:
print(f"成功加载 {len(cn_sents)} 条平行语料")
# 划分训练集和测试集
train_size = int(len(cn_sents) * 0.9)
train_cn_sents, test_cn_sents = cn_sents[:train_size], cn_sents[train_size:]
train_en_sents, test_en_sents = en_sents[:train_size], en_sents[train_size:]
    # Tokenize (each sentence becomes a list of tokens)
train_cn_tokens = [tokenize_cn(sent) for sent in train_cn_sents]
train_en_tokens = [tokenize_en(sent) for sent in train_en_sents]
test_cn_tokens = [tokenize_cn(sent) for sent in test_cn_sents]
test_en_tokens = [tokenize_en(sent) for sent in test_en_sents]
    # Build vocabularies
cn_vocab = build_vocab(train_cn_tokens, min_freq=MIN_FREQ)
en_vocab = build_vocab(train_en_tokens, min_freq=MIN_FREQ)
print(f"中文词表大小: {len(cn_vocab)}")
print(f"英文词表大小: {len(en_vocab)}")
    # Build the dataset
dataset_cn2en = TranslationDataset(
train_cn_tokens, train_en_tokens,
cn_vocab, en_vocab,
max_len=MAX_LEN
)
dataloader_cn2en = DataLoader(dataset_cn2en, batch_size=BATCH_SIZE, shuffle=True)
    # Initialize the model
model_cn2en = Seq2Seq(
len(cn_vocab), len(en_vocab),
embed_dim=EMBED_DIM,
num_heads=NUM_HEADS,
num_layers=NUM_LAYERS,
dim_feedforward=DIM_FEEDFORWARD,
dropout=DROPOUT
).to(device)
    # Train the model
    print("=== Training the Chinese→English model ===")
criterion = nn.CrossEntropyLoss(ignore_index=en_vocab["<PAD>"])
optimizer = optim.Adam(model_cn2en.parameters(), lr=LEARNING_RATE)
train_losses = []
best_bleu = 0.0
best_model = None
for epoch in range(EPOCHS):
start_time = time.time()
train_loss = train_epoch(model_cn2en, dataloader_cn2en, criterion, optimizer, device)
train_losses.append(train_loss)
if (epoch + 1) % 5 == 0:
bleu_score = evaluate_bleu(
model_cn2en, test_cn_sents[:100], test_en_sents[:100],
cn_vocab, en_vocab, tokenize_cn, tokenize_en,
max_len=MAX_LEN, device=device
)
print(f"Epoch {epoch+1:2d} | Loss: {train_loss:.4f} | BLEU: {bleu_score:.2f} | Time: {time.time()-start_time:.2f}s")
if bleu_score > best_bleu:
best_bleu = bleu_score
                best_model = {k: v.detach().clone() for k, v in model_cn2en.state_dict().items()}  # clone tensors so later updates don't overwrite the snapshot
print(f"Best model saved with BLEU: {best_bleu:.2f}")
else:
print(f"Epoch {epoch+1:2d} | Loss: {train_loss:.4f} | Time: {time.time()-start_time:.2f}s")
    # Load the best model
if best_model is not None:
model_cn2en.load_state_dict(best_model)
print(f"Loaded best model with BLEU: {best_bleu:.2f}")
    # Translation examples
    print("\n=== Translation examples ===")
examples = [
"你好",
"我爱自然语言处理",
"今天天气怎么样?"
]
for text in examples:
translation_tokens = translate(
model_cn2en, text, cn_vocab, en_vocab, tokenize_cn,
max_len=MAX_LEN, device=device
)
translation = ' '.join(translation_tokens)
print(f"输入:{text}")
print(f"翻译:{translation}")
print("-" * 30)
    # Final evaluation
if len(test_cn_sents) > 0 and len(test_en_sents) > 0:
final_bleu = evaluate_bleu(
model_cn2en, test_cn_sents, test_en_sents,
cn_vocab, en_vocab, tokenize_cn, tokenize_en,
max_len=MAX_LEN, device=device
)
print(f"\n最终模型 BLEU 分数: {final_bleu:.2f}")
else:
print("\n警告: 测试集为空,无法进行最终评估")Requirement already satisfied: jieba in /usr/local/lib/python3.11/dist-packages (0.42.1)
Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)
Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk) (8.2.1)
Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk) (1.5.1)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk) (2024.11.6)
Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk) (4.67.1)
Collecting en-core-web-sm==3.8.0
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 121.4 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Using device: cuda
Loaded 29909 parallel sentence pairs
Chinese vocab size: 2815
English vocab size: 2249
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/transformer.py:385: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)
warnings.warn(
=== Training the Chinese→English model ===
Epoch 1 | Loss: 3.4688 | Time: 24.53s
Epoch 2 | Loss: 2.5631 | Time: 24.15s
Epoch 3 | Loss: 2.1103 | Time: 24.96s
Epoch 4 | Loss: 1.8082 | Time: 25.14s
Sample 1:
  Source: 完全忽视汤姆不会是明智的。
  Reference: It wouldn't be wise to ignore Tom completely.
  Translation: . Tom n't right is all
  Reference tokens: ['It', "wouldn't", 'be', 'wise', 'to', 'ignore', 'Tom', 'completely.']
  Translation tokens: ['.', 'Tom', "n't", 'right', 'is', 'all']
BLEU evaluation finished: 100 samples processed, 0 errors
Epoch 5 | Loss: 1.5904 | BLEU: 0.03 | Time: 29.20s
Best model saved with BLEU: 0.03
Epoch 6 | Loss: 1.4257 | Time: 24.68s
Epoch 7 | Loss: 1.3020 | Time: 24.82s
Epoch 8 | Loss: 1.1967 | Time: 24.80s
Epoch 9 | Loss: 1.1096 | Time: 24.73s
Sample 1:
  Source: 完全忽视汤姆不会是明智的。
  Reference: It wouldn't be wise to ignore Tom completely.
  Translation: <UNK> . Tom n't is the of
...
BLEU evaluation finished: 2991 samples processed, 0 errors
Final model BLEU score: 0.03
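
The request at the top also asks for a training-process plot, yet the listing imports matplotlib without ever plotting the collected train_losses, and no figure appears in the captured output. Below is a minimal, hypothetical sketch of how those per-epoch losses could be visualized; the function name plot_training_curve and the save path are illustrative assumptions, not part of the original run.

# Hypothetical addition (not executed above): plot the training-loss curve.
import matplotlib.pyplot as plt

def plot_training_curve(train_losses, save_path="training_loss.png"):
    """Plot per-epoch training loss; train_losses is the list built in the main loop."""
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(8, 5))
    plt.plot(epochs, train_losses, marker="o", label="training loss")
    plt.xlabel("Epoch")
    plt.ylabel("Cross-entropy loss")
    plt.title("Chinese→English Transformer: training loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path, dpi=150)
    plt.show()

# Usage (after the training loop): plot_training_curve(train_losses)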