|
|
@@ -0,0 +1,320 @@
|
|
|
+# 简化版本 - 使用标准的Hugging Face方法
|
|
|
+import torch
|
|
|
+import pandas as pd
|
|
|
+from torch.utils.data import random_split
|
|
|
+from torch.utils.data import Dataset
|
|
|
+from torch.utils.data import DataLoader
|
|
|
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
|
|
|
+from torch.optim.lr_scheduler import ReduceLROnPlateau
|
|
|
+from tqdm import tqdm
|
|
|
+import os
|
|
|
+from torch.utils.data import WeightedRandomSampler
|
|
|
+import numpy as np
|
|
|
+from collections import Counter
|
|
|
+import time
|
|
|
+import gc
|
|
|
+
|
|
|
class MyDataset(Dataset):
    """CSV-backed text-classification dataset.

    Expects a CSV file with a 'record' column (text) and a 'label' column
    (integer class id). Rows containing NaN are dropped. On load, prints
    basic statistics (text length distribution, label distribution).
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.data = pd.read_csv(file_path)
        # Drop rows with missing values so every sample is usable.
        self.data = self.data.dropna()
        # Print basic dataset info.
        print("\n数据集信息:")
        print(f" 总样本数: {len(self.data)}")
        print(f" 列名: {self.data.columns.tolist()}")

        # Text length distribution (helps choose a tokenizer max_length).
        if 'record' in self.data.columns:
            text_lengths = self.data['record'].astype(str).apply(len)
            print(" 文本长度统计:")
            print(f" 最小: {text_lengths.min()}")
            print(f" 最大: {text_lengths.max()}")
            print(f" 平均: {text_lengths.mean():.1f}")
            print(f" 中位数: {text_lengths.median():.1f}")
            print(f" 95%分位数: {text_lengths.quantile(0.95):.1f}")

        # Label distribution (reveals class imbalance).
        if 'label' in self.data.columns:
            label_counts = Counter(self.data['label'])
            print(" 标签分布:")
            for label, count in sorted(label_counts.items()):
                print(f" 类别 {label}: {count} ({count / len(self.data) * 100:.1f}%)")

    def __getitem__(self, item):
        """Return the (text, label) pair at positional index *item*."""
        # Fetch the row once instead of two separate .iloc lookups.
        row = self.data.iloc[item]
        return row['record'], row['label']

    def __len__(self):
        return len(self.data)
|
|
|
+
|
|
|
+
|
|
|
class EarlyStopping:
    """Stop training when a monitored metric stops improving.

    Tracks the best score seen so far; after `patience` consecutive calls
    without an improvement of more than `min_delta`, `__call__` returns
    True and `early_stop` is latched. `mode='min'` means lower is better
    (e.g. loss); `mode='max'` means higher is better (e.g. accuracy).
    """

    def __init__(self, patience: int = 3, min_delta: float = 0, mode: str = 'min'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score):
        """Record *score*; return True when training should stop."""
        # First observation becomes the baseline — never stops.
        if self.best_score is None:
            self.best_score = score
            return False

        # Improvement must exceed min_delta in the monitored direction.
        if self.mode == 'min':
            improved = score < self.best_score - self.min_delta
        else:  # mode == 'max'
            improved = score > self.best_score + self.min_delta

        if improved:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1

        if self.counter >= self.patience:
            self.early_stop = True
            return True
        return False
|
|
|
+
|
|
|
+
|
|
|
class SimpleTrainer:
    """Fine-tune a pretrained BERT sequence classifier on a CSV text dataset.

    Handles class imbalance via WeightedRandomSampler, uses AdamW with a
    linear warmup/decay LR schedule, gradient clipping, checkpoints the
    best-accuracy and best-loss models, and applies early stopping on
    validation loss.
    """

    def __init__(self, train_csv_file,
                 batch_size: int = 32,
                 val_csv_file: str = None,
                 is_english: bool = False,
                 num_classes: int = 2,
                 max_length: int = 256):
        # Standard pretrained BERT checkpoint, chosen by language.
        if is_english:
            self.model_name = "bert-base-uncased"
        else:
            self.model_name = "bert-base-chinese"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"使用设备: {self.device}")
        print(f"使用模型: {self.model_name}")

        self.batch_size = batch_size
        self.max_length = max_length
        self.__global_step = 0  # reserved step counter (currently unused)
        self.best_val_acc = 0.0
        self.best_val_loss = float('inf')

        # Checkpoint directory; exist_ok avoids the exists()/makedirs() race.
        self.checkpoint_root_path = './checkpoints'
        os.makedirs(self.checkpoint_root_path, exist_ok=True)

        self.best_acc_model_path = os.path.join(self.checkpoint_root_path, f'{self.model_name}_best_acc.pth')
        self.best_loss_model_path = os.path.join(self.checkpoint_root_path, f'{self.model_name}_best_loss.pth')
        # Checkpoint path for resuming interrupted training.
        self.latest_checkpoint_path = os.path.join(self.checkpoint_root_path, f'{self.model_name}_latest_checkpoint.pth')

        # Model and tokenizer (downloaded from the Hugging Face hub if absent).
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=num_classes
        ).to(self.device)

        # Report parameter counts.
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"模型总参数量: {total_params:,}")
        print(f"可训练参数量: {trainable_params:,}")

        # Datasets.
        print("\n加载数据集...")
        self.train_dataset = MyDataset(train_csv_file)
        # Per-sample weights = 1 / class frequency, so the sampler draws
        # each class with roughly equal probability (handles imbalance).
        labels = self.train_dataset.data['label'].values
        class_counts = np.bincount(labels)
        sample_weights = 1.0 / class_counts[labels]
        sampler = WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(sample_weights),
            replacement=True
        )
        if val_csv_file:
            self.val_dataset = MyDataset(val_csv_file)
        else:
            # No separate validation file: hold out 20% of the training data.
            self.train_dataset, self.val_dataset = random_split(self.train_dataset, [0.8, 0.2])
            # Rebuild the sampler from the weights of the training subset only.
            train_indices = self.train_dataset.indices
            train_sample_weights = sample_weights[train_indices]
            sampler = WeightedRandomSampler(
                weights=train_sample_weights,
                num_samples=len(train_sample_weights),
                replacement=True
            )
        # Data loaders (validation is never shuffled).
        self.train_loader = DataLoader(self.train_dataset,
                                       batch_size=self.batch_size,
                                       sampler=sampler,
                                       collate_fn=self.collate_func)
        self.val_loader = DataLoader(self.val_dataset,
                                     batch_size=self.batch_size,
                                     shuffle=False,
                                     collate_fn=self.collate_func)

        # Optimizer; the LR scheduler is created in train_and_validate()
        # once the total number of steps is known.
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=3e-5, weight_decay=0.01)
        self.scheduler = None
        self.warmup_ratio = 0.1  # fraction of total steps used for LR warmup

        # Early stopping on validation loss.
        self.early_stopping = EarlyStopping(patience=5, min_delta=0.001, mode='min')
        # Gradient clipping threshold.
        self.max_grad_norm = 1.0

    def collate_func(self, batch_data):
        """Tokenize a batch of (text, label) pairs into model-ready tensors."""
        texts, labels = [], []
        for item in batch_data:
            texts.append(item[0])
            labels.append(item[1])

        inputs = self.tokenizer(
            texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs['labels'] = torch.tensor(labels, dtype=torch.long)
        return inputs

    def train_step(self):
        """Run one training epoch; return (average loss, accuracy)."""
        self.model.train()
        total_loss = 0.0
        correct_predictions = 0.0
        total_samples = 0.0

        for batch_data in tqdm(self.train_loader, desc="Training"):
            batch_data = {k: v.to(self.device) for k, v in batch_data.items()}

            self.optimizer.zero_grad()
            outputs = self.model(**batch_data)
            loss = outputs.loss
            loss.backward()
            # Clip gradients to stabilize training.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

            self.optimizer.step()
            # Guard: the scheduler exists only after train_and_validate()
            # created it; without this, a standalone train_step() call
            # would crash on None.
            if self.scheduler is not None:
                self.scheduler.step()

            total_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += (predictions == batch_data['labels']).float().sum().item()
            total_samples += len(batch_data['labels'])

        avg_loss = total_loss / len(self.train_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy

    def val_step(self):
        """Evaluate on the validation set; return (average loss, accuracy)."""
        self.model.eval()
        total_loss = 0.0
        correct_predictions = 0.0
        total_samples = 0.0

        with torch.no_grad():
            for batch_data in tqdm(self.val_loader, desc="Validating"):
                batch_data = {k: v.to(self.device) for k, v in batch_data.items()}

                outputs = self.model(**batch_data)
                total_loss += outputs.loss.item()

                predictions = torch.argmax(outputs.logits, dim=-1)
                correct_predictions += (predictions == batch_data['labels']).float().sum().item()
                total_samples += len(batch_data['labels'])

        avg_loss = total_loss / len(self.val_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy

    def train_and_validate(self, num_epoch):
        """Train for up to *num_epoch* epochs with checkpointing and early stopping."""
        # Linear-decay LR schedule with warmup, sized by the total step count.
        total_steps = len(self.train_loader) * num_epoch
        warmup_steps = int(total_steps * self.warmup_ratio)

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        print("\n训练配置:")
        print(f" 总训练步数: {total_steps}")
        print(f" Warmup步数: {warmup_steps}")
        print(f" 初始学习率: {self.optimizer.param_groups[0]['lr']}")
        print(f" Batch size: {self.batch_size}")
        print(f" Max length: {self.max_length}")
        print(f" 梯度裁剪: {self.max_grad_norm}")

        print("开始训练...")
        for epoch in range(num_epoch):
            print(f"\n{'='*60}")
            print(f"Epoch {epoch+1}/{num_epoch}")
            print(f"{'='*60}")

            train_loss, train_acc = self.train_step()
            val_loss, val_acc = self.val_step()

            # Current learning rate (changes every step under the scheduler).
            current_lr = self.optimizer.param_groups[0]['lr']

            print(f'Epoch {epoch+1}/{num_epoch}:')
            print(f' Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}')
            print(f' Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}')
            print(f' Learning Rate: {current_lr:.2e}')

            # Checkpoint the best model by validation accuracy.
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                torch.save(self.model.state_dict(), self.best_acc_model_path)
                print(f" ✓ 保存了新的最佳准确率模型,验证准确率: {self.best_val_acc:.4f}")

            # Checkpoint the best model by validation loss.
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                torch.save(self.model.state_dict(), self.best_loss_model_path)
                print(f" ✓ 保存了新的最低损失模型,验证损失: {self.best_val_loss:.4f}")

            # Early stopping check on validation loss.
            if self.early_stopping(val_loss):
                print(f"\n⚠ Early stopping触发! 在epoch {epoch+1}停止训练")
                print(f" 验证损失已连续{self.early_stopping.patience}个epoch没有改善")
                break

        print(f"\n{'='*60}")
        print("训练完成!")
        print(f"{'='*60}")
        print(f"最佳验证准确率: {self.best_val_acc:.4f}")
        print(f"最低验证损失: {self.best_val_loss:.4f}")
        print("模型保存路径:")
        print(f" 最佳准确率模型: {self.best_acc_model_path}")
        print(f" 最低损失模型: {self.best_loss_model_path}")
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Train the English model first.
    english_trainer = SimpleTrainer(
        train_csv_file='data_en.csv',
        batch_size=32,
        is_english=True,
        max_length=256,
    )
    english_trainer.train_and_validate(num_epoch=20)

    # Release memory before the second run: drop the trainer, force a
    # garbage-collection pass, clear the CUDA cache, and pause briefly.
    del english_trainer
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(2)

    # Then train the Chinese model with the same hyperparameters.
    chinese_trainer = SimpleTrainer(
        train_csv_file='data_zh.csv',
        batch_size=32,
        is_english=False,
        max_length=256,
    )
    chinese_trainer.train_and_validate(num_epoch=20)
|