# Simplified version - uses the standard Hugging Face approach
import gc
import os
import time
from collections import Counter

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler, random_split
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)


class MyDataset(Dataset):
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = pd.read_csv(file_path)
        self.data = self.data.dropna()

        # Print basic dataset information
        print("\nDataset info:")
        print(f"  Total samples: {len(self.data)}")
        print(f"  Columns: {self.data.columns.tolist()}")

        # Text length distribution
        if 'record' in self.data.columns:
            text_lengths = self.data['record'].astype(str).apply(len)
            print("  Text length statistics:")
            print(f"    Min: {text_lengths.min()}")
            print(f"    Max: {text_lengths.max()}")
            print(f"    Mean: {text_lengths.mean():.1f}")
            print(f"    Median: {text_lengths.median():.1f}")
            print(f"    95th percentile: {text_lengths.quantile(0.95):.1f}")

        # Label distribution
        if 'label' in self.data.columns:
            label_counts = Counter(self.data['label'])
            print("  Label distribution:")
            for label, count in sorted(label_counts.items()):
                print(f"    Class {label}: {count} ({count / len(self.data) * 100:.1f}%)")

    def __getitem__(self, item):
        return self.data.iloc[item]['record'], self.data.iloc[item]['label']

    def __len__(self):
        return len(self.data)


class EarlyStopping:
    """Stop training when the monitored metric has not improved for `patience` epochs."""

    def __init__(self, patience: int = 3, min_delta: float = 0.0, mode: str = 'min'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score):
        if self.best_score is None:
            self.best_score = score
            return False
        if self.mode == 'min':
            if score < self.best_score - self.min_delta:
                self.best_score = score
                self.counter = 0
            else:
                self.counter += 1
        else:  # mode == 'max'
            if score > self.best_score + self.min_delta:
                self.best_score = score
                self.counter = 0
            else:
                self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True
            return True
        return False
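# A minimal, self-contained demo of the EarlyStopping semantics above. The loss
# values are invented for illustration, and this helper is not called anywhere
# in the training pipeline.
def _demo_early_stopping():
    es = EarlyStopping(patience=2, mode='min')
    for epoch, val_loss in enumerate([0.90, 0.85, 0.86, 0.87]):
        if es(val_loss):
            # Triggers at val_loss=0.87, the second consecutive epoch without improvement
            print(f"Early stop at epoch {epoch}")
            break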
class SimpleTrainer:
    def __init__(self, train_csv_file, batch_size: int = 32, val_csv_file: str = None,
                 is_english: bool = False, num_classes: int = 2, max_length: int = 256):
        # Use a standard BERT model
        if is_english:
            self.model_name = "bert-base-uncased"
        else:
            self.model_name = "bert-base-chinese"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        print(f"Using model: {self.model_name}")

        self.batch_size = batch_size
        self.max_length = max_length
        self.best_val_acc = 0.0
        self.best_val_loss = float('inf')

        # Create the checkpoint directory
        self.checkpoint_root_path = './checkpoints'
        os.makedirs(self.checkpoint_root_path, exist_ok=True)
        self.best_acc_model_path = os.path.join(self.checkpoint_root_path, f'{self.model_name}_best_acc.pth')
        self.best_loss_model_path = os.path.join(self.checkpoint_root_path, f'{self.model_name}_best_loss.pth')
        # Reserved for resuming interrupted training
        self.latest_checkpoint_path = os.path.join(self.checkpoint_root_path, f'{self.model_name}_latest_checkpoint.pth')

        # Create the model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=num_classes
        ).to(self.device)

        # Report parameter counts
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"Total parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params:,}")

        # Load the dataset
        print("\nLoading dataset...")
        self.train_dataset = MyDataset(train_csv_file)

        # Compute per-sample weights (to handle class imbalance):
        # each sample is weighted by the inverse frequency of its class
        labels = self.train_dataset.data['label'].values
        class_counts = np.bincount(labels)
        sample_weights = 1.0 / class_counts[labels]
        sampler = WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(sample_weights),
            replacement=True
        )

        if val_csv_file:
            self.val_dataset = MyDataset(val_csv_file)
        else:
            # No separate validation set: split off 20% of the training data
            # (fractional lengths require a recent PyTorch; otherwise pass absolute sizes)
            self.train_dataset, self.val_dataset = random_split(self.train_dataset, [0.8, 0.2])
            # Extract the weights that correspond to the training subset
            train_indices = self.train_dataset.indices
            train_sample_weights = sample_weights[train_indices]
            sampler = WeightedRandomSampler(
                weights=train_sample_weights,
                num_samples=len(train_sample_weights),
                replacement=True
            )

        # Create the data loaders
        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size,
                                       sampler=sampler, collate_fn=self.collate_func)
        self.val_loader = DataLoader(self.val_dataset, batch_size=self.batch_size,
                                     shuffle=False, collate_fn=self.collate_func)

        # Optimizer and (deferred) scheduler
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=3e-5, weight_decay=0.01)
        self.scheduler = None  # created in train_and_validate once the total step count is known
        self.warmup_ratio = 0.1  # fraction of total steps used for warmup

        # Early stopping
        self.early_stopping = EarlyStopping(patience=5, min_delta=0.001, mode='min')

        # Gradient clipping threshold
        self.max_grad_norm = 1.0

    def collate_func(self, batch_data):
        texts, labels = [], []
        for item in batch_data:
            texts.append(item[0])
            labels.append(item[1])
        inputs = self.tokenizer(
            texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs['labels'] = torch.tensor(labels, dtype=torch.long)
        return inputs

    def train_step(self):
        self.model.train()
        total_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        for batch_data in tqdm(self.train_loader, desc="Training"):
            batch_data = {k: v.to(self.device) for k, v in batch_data.items()}
            self.optimizer.zero_grad()
            outputs = self.model(**batch_data)
            loss = outputs.loss
            loss.backward()
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()  # update the learning rate every step

            total_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += (predictions == batch_data['labels']).sum().item()
            total_samples += len(batch_data['labels'])
        avg_loss = total_loss / len(self.train_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy

    def val_step(self):
        self.model.eval()
        total_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        with torch.no_grad():
            for batch_data in tqdm(self.val_loader, desc="Validating"):
                batch_data = {k: v.to(self.device) for k, v in batch_data.items()}
                outputs = self.model(**batch_data)
                total_loss += outputs.loss.item()
                predictions = torch.argmax(outputs.logits, dim=-1)
                correct_predictions += (predictions == batch_data['labels']).sum().item()
                total_samples += len(batch_data['labels'])
        avg_loss = total_loss / len(self.val_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy
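    # The constructor reserves `latest_checkpoint_path` for resuming interrupted
    # runs, but nothing in the pipeline writes to it yet. Below is one possible
    # sketch of such a checkpoint; the method name and the dictionary keys are
    # assumptions, and train_and_validate does not call this method.
    def save_latest_checkpoint(self, epoch: int):
        torch.save({
            'epoch': epoch,  # last completed epoch
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict() if self.scheduler else None,
            'best_val_acc': self.best_val_acc,
            'best_val_loss': self.best_val_loss,
        }, self.latest_checkpoint_path)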
    def train_and_validate(self, num_epoch):
        # Learning-rate scheduler: linear decay with warmup
        total_steps = len(self.train_loader) * num_epoch
        warmup_steps = int(total_steps * self.warmup_ratio)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        print("\nTraining configuration:")
        print(f"  Total training steps: {total_steps}")
        print(f"  Warmup steps: {warmup_steps}")
        print(f"  Initial learning rate: {self.optimizer.param_groups[0]['lr']}")
        print(f"  Batch size: {self.batch_size}")
        print(f"  Max length: {self.max_length}")
        print(f"  Gradient clipping: {self.max_grad_norm}")
        print("Starting training...")

        for epoch in range(num_epoch):
            print(f"\n{'='*60}")
            print(f"Epoch {epoch+1}/{num_epoch}")
            print(f"{'='*60}")

            train_loss, train_acc = self.train_step()
            val_loss, val_acc = self.val_step()

            # Current learning rate
            current_lr = self.optimizer.param_groups[0]['lr']

            print(f'Epoch {epoch+1}/{num_epoch}:')
            print(f'  Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}')
            print(f'  Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}')
            print(f'  Learning Rate: {current_lr:.2e}')

            # Save the best model (by validation accuracy)
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                torch.save(self.model.state_dict(), self.best_acc_model_path)
                print(f"  ✓ Saved new best-accuracy model, validation accuracy: {self.best_val_acc:.4f}")

            # Save the lowest-validation-loss model
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                torch.save(self.model.state_dict(), self.best_loss_model_path)
                print(f"  ✓ Saved new lowest-loss model, validation loss: {self.best_val_loss:.4f}")

            # Early-stopping check
            if self.early_stopping(val_loss):
                print(f"\n⚠ Early stopping triggered! Stopping at epoch {epoch+1}")
                print(f"  Validation loss has not improved for {self.early_stopping.patience} consecutive epochs")
                break

        print(f"\n{'='*60}")
        print("Training finished!")
        print(f"{'='*60}")
        print(f"Best validation accuracy: {self.best_val_acc:.4f}")
        print(f"Lowest validation loss: {self.best_val_loss:.4f}")
        print("Model save paths:")
        print(f"  Best-accuracy model: {self.best_acc_model_path}")
        print(f"  Lowest-loss model: {self.best_loss_model_path}")


if __name__ == '__main__':
    # Train the English model first
    trainer_en = SimpleTrainer(train_csv_file='data_en.csv', batch_size=32,
                               is_english=True, max_length=256)  # increased max_length
    trainer_en.train_and_validate(num_epoch=20)

    # Release GPU memory before starting the second run
    del trainer_en
    gc.collect()  # force Python garbage collection
    torch.cuda.empty_cache()  # release cached CUDA memory
    time.sleep(2)  # a brief pause is enough

    # Then train the Chinese model
    trainer = SimpleTrainer(train_csv_file='data_zh.csv', batch_size=32,
                            is_english=False, max_length=256)  # increased max_length
    trainer.train_and_validate(num_epoch=20)
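# Usage sketch for inference with the saved weights. This is illustrative only:
# the input text is a placeholder, and the path follows the checkpoint naming
# convention used above for the English model.
#
#   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#   model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
#   model.load_state_dict(torch.load('./checkpoints/bert-base-uncased_best_acc.pth', map_location='cpu'))
#   model.eval()
#   inputs = tokenizer("an example record", max_length=256, padding='max_length',
#                      truncation=True, return_tensors='pt')
#   with torch.no_grad():
#       pred = torch.argmax(model(**inputs).logits, dim=-1).item()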