| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320 |
- # Simplified version — uses the standard Hugging Face approach
- import torch
- import pandas as pd
- from torch.utils.data import random_split
- from torch.utils.data import Dataset
- from torch.utils.data import DataLoader
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
- from torch.optim.lr_scheduler import ReduceLROnPlateau
- from tqdm import tqdm
- import os
- from torch.utils.data import WeightedRandomSampler
- import numpy as np
- from collections import Counter
- import time
- import gc
class MyDataset(Dataset):
    """Text-classification dataset backed by a CSV file.

    Expects a 'record' column (text) and a 'label' column (integer class).
    Rows containing missing values are dropped on load, and summary
    statistics are printed for a quick sanity check.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.data = pd.read_csv(file_path).dropna()
        self._print_summary()

    def _print_summary(self):
        # One-time report: size, columns, text-length and label distributions.
        print(f"\n数据集信息:")
        print(f" 总样本数: {len(self.data)}")
        print(f" 列名: {self.data.columns.tolist()}")
        if 'record' in self.data.columns:
            text_lengths = self.data['record'].astype(str).apply(len)
            print(f" 文本长度统计:")
            print(f" 最小: {text_lengths.min()}")
            print(f" 最大: {text_lengths.max()}")
            print(f" 平均: {text_lengths.mean():.1f}")
            print(f" 中位数: {text_lengths.median():.1f}")
            print(f" 95%分位数: {text_lengths.quantile(0.95):.1f}")
        if 'label' in self.data.columns:
            label_counts = Counter(self.data['label'])
            print(f" 标签分布:")
            for label, count in sorted(label_counts.items()):
                print(f" 类别 {label}: {count} ({count / len(self.data) * 100:.1f}%)")

    def __getitem__(self, index):
        # Return the raw (text, label) pair; tokenization happens in the collate_fn.
        row = self.data.iloc[index]
        return row['record'], row['label']

    def __len__(self):
        return len(self.data)
class EarlyStopping:
    """Signal that training should stop when a monitored metric plateaus.

    Tracks the best score seen so far; once the score fails to improve by
    more than ``min_delta`` for ``patience`` consecutive calls, ``__call__``
    returns True and ``early_stop`` is latched.
    """

    def __init__(self, patience: int = 3, min_delta: float = 0, mode: str = 'min'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode          # 'min': lower is better; 'max': higher is better
        self.counter = 0          # consecutive non-improving calls
        self.best_score = None
        self.early_stop = False

    def __call__(self, score):
        # First observation just seeds the baseline.
        if self.best_score is None:
            self.best_score = score
            return False
        improved = (
            score < self.best_score - self.min_delta
            if self.mode == 'min'
            else score > self.best_score + self.min_delta
        )
        if improved:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
        # Patience exhausted: latch the flag and tell the caller to stop.
        if self.counter >= self.patience:
            self.early_stop = True
            return True
        return False
class SimpleTrainer:
    """Fine-tunes a pretrained BERT sequence classifier on a CSV dataset.

    Handles device placement, class-imbalance-aware weighted sampling,
    warmup + linear-decay LR scheduling, gradient clipping, best-model
    checkpointing, and early stopping on validation loss.
    """
    def __init__(self, train_csv_file,
                 batch_size:int=32,
                 val_csv_file:str=None,
                 is_english:bool=False,
                 num_classes:int=2,
                 max_length:int=256):
        """Build tokenizer, model, datasets, loaders, and optimizer.

        Args:
            train_csv_file: CSV with 'record' (text) and 'label' (int) columns.
            batch_size: mini-batch size for both train and val loaders.
            val_csv_file: optional separate validation CSV; when None the
                training data is split 80/20 instead.
            is_english: selects bert-base-uncased vs bert-base-chinese.
            num_classes: number of output labels.
            max_length: tokenizer truncation/padding length.
        """
        # Use a standard pretrained BERT checkpoint.
        if is_english:
            self.model_name = "bert-base-uncased"
        else:
            self.model_name = "bert-base-chinese"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"使用设备: {self.device}")
        print(f"使用模型: {self.model_name}")
        self.batch_size = batch_size
        self.max_length = max_length
        self.__global_step = 0  # NOTE(review): assigned but never incremented in this file
        self.best_val_acc = 0.0
        self.best_val_loss = float('inf')
        # Create the checkpoint directory if needed.
        self.checkpoint_root_path = './checkpoints'
        if not os.path.exists(self.checkpoint_root_path):
            os.makedirs(self.checkpoint_root_path)
        self.best_acc_model_path = os.path.join(self.checkpoint_root_path , f'{self.model_name}_best_acc.pth')
        self.best_loss_model_path = os.path.join(self.checkpoint_root_path , f'{self.model_name}_best_loss.pth')
        # Path reserved for resuming training.
        # NOTE(review): defined but never read or written elsewhere in this file.
        self.latest_checkpoint_path = os.path.join(self.checkpoint_root_path , f'{self.model_name}_latest_checkpoint.pth')
        # Build tokenizer and classification head on top of the pretrained encoder.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=num_classes
        ).to(self.device)
        # Report parameter counts.
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"模型总参数量: {total_params:,}")
        print(f"可训练参数量: {trainable_params:,}")
        # Load the dataset(s).
        print("\n加载数据集...")
        self.train_dataset = MyDataset(train_csv_file)
        # Per-sample weights to counter class imbalance (inverse class frequency).
        # NOTE(review): np.bincount assumes labels are non-negative integers — confirm.
        labels = self.train_dataset.data['label'].values
        class_counts = np.bincount(labels)
        sample_weights = 1.0 / class_counts[labels]  # defines a sampling weight for every individual sample
        sampler = WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(sample_weights),
            replacement=True
        )
        if val_csv_file:
            self.val_dataset = MyDataset(val_csv_file)
        else:
            # No separate validation file: split the training data 80/20.
            self.train_dataset, self.val_dataset = random_split(self.train_dataset, [0.8, 0.2])
            # Pull out the weights that correspond to the training subset.
            train_indices = self.train_dataset.indices
            train_sample_weights = sample_weights[train_indices]
            sampler = WeightedRandomSampler(
                weights=train_sample_weights,
                num_samples=len(train_sample_weights),
                replacement=True
            )
        # Data loaders: weighted sampling for train, sequential for validation.
        self.train_loader = DataLoader(self.train_dataset,
                                       batch_size=self.batch_size,
                                       sampler=sampler,
                                       collate_fn=self.collate_func)
        self.val_loader = DataLoader(self.val_dataset,
                                     batch_size=self.batch_size,
                                     shuffle=False,
                                     collate_fn=self.collate_func)
        # Optimizer; the LR scheduler is created in train_and_validate once
        # the total number of training steps is known.
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=3e-5, weight_decay=0.01)
        self.scheduler = None
        self.warmup_ratio = 0.1  # fraction of total steps used for warmup
        # Early stopping on validation loss.
        self.early_stopping = EarlyStopping(patience=5, min_delta=0.001, mode='min')
        # Gradient clipping threshold.
        self.max_grad_norm = 1.0
    def collate_func(self, batch_data):
        """Tokenize a batch of (text, label) pairs into model-ready tensors."""
        texts, labels = [], []
        for item in batch_data:
            texts.append(item[0])
            labels.append(item[1])

        inputs = self.tokenizer(
            texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs['labels'] = torch.tensor(labels, dtype=torch.long)
        return inputs
    def train_step(self):
        """Run one training epoch; returns (avg_loss, accuracy).

        Must be called only after train_and_validate has created self.scheduler.
        """
        self.model.train()
        total_loss = 0.0
        correct_predictions = 0.0
        total_samples = 0.0

        for batch_data in tqdm(self.train_loader, desc="Training"):
            batch_data = {k: v.to(self.device) for k, v in batch_data.items()}

            self.optimizer.zero_grad()
            outputs = self.model(**batch_data)
            loss = outputs.loss
            loss.backward()
            # Clip gradients to stabilize training.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()  # advance the learning-rate schedule

            total_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += (predictions == batch_data['labels']).float().sum().item()
            total_samples += len(batch_data['labels'])

        avg_loss = total_loss / len(self.train_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy
    def val_step(self):
        """Evaluate on the validation loader; returns (avg_loss, accuracy)."""
        self.model.eval()
        total_loss = 0.0
        correct_predictions = 0.0
        total_samples = 0.0

        with torch.no_grad():
            for batch_data in tqdm(self.val_loader, desc="Validating"):
                batch_data = {k: v.to(self.device) for k, v in batch_data.items()}

                outputs = self.model(**batch_data)
                total_loss += outputs.loss.item()

                predictions = torch.argmax(outputs.logits, dim=-1)
                correct_predictions += (predictions == batch_data['labels']).float().sum().item()
                total_samples += len(batch_data['labels'])

        avg_loss = total_loss / len(self.val_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy
    def train_and_validate(self, num_epoch):
        """Full training loop with per-epoch validation, checkpointing and early stopping.

        Args:
            num_epoch: maximum number of epochs to run (early stopping may end sooner).
        """
        # Linear-decay LR schedule with warmup, sized from the actual loader length.
        total_steps = len(self.train_loader) * num_epoch
        warmup_steps = int(total_steps * self.warmup_ratio)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )
        print(f"\n训练配置:")
        print(f" 总训练步数: {total_steps}")
        print(f" Warmup步数: {warmup_steps}")
        print(f" 初始学习率: {self.optimizer.param_groups[0]['lr']}")
        print(f" Batch size: {self.batch_size}")
        print(f" Max length: {self.max_length}")
        print(f" 梯度裁剪: {self.max_grad_norm}")
        print("开始训练...")
        for epoch in range(num_epoch):
            print(f"\n{'='*60}")
            print(f"Epoch {epoch+1}/{num_epoch}")
            print(f"{'='*60}")
            train_loss, train_acc = self.train_step()
            val_loss, val_acc = self.val_step()
            # Current learning rate, for logging only.
            current_lr = self.optimizer.param_groups[0]['lr']
            print(f'Epoch {epoch+1}/{num_epoch}:')
            print(f' Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}')
            print(f' Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}')
            print(f' Learning Rate: {current_lr:.2e}')
            # Checkpoint the best model by validation accuracy.
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                torch.save(self.model.state_dict(), self.best_acc_model_path)
                print(f" ✓ 保存了新的最佳准确率模型,验证准确率: {self.best_val_acc:.4f}")
            # Checkpoint the best model by validation loss.
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                torch.save(self.model.state_dict(), self.best_loss_model_path)
                print(f" ✓ 保存了新的最低损失模型,验证损失: {self.best_val_loss:.4f}")
            # Early-stopping check on validation loss.
            if self.early_stopping(val_loss):
                print(f"\n⚠ Early stopping触发! 在epoch {epoch+1}停止训练")
                print(f" 验证损失已连续{self.early_stopping.patience}个epoch没有改善")
                break
        print(f"\n{'='*60}")
        print("训练完成!")
        print(f"{'='*60}")
        print(f"最佳验证准确率: {self.best_val_acc:.4f}")
        print(f"最低验证损失: {self.best_val_loss:.4f}")
        print(f"模型保存路径:")
        print(f" 最佳准确率模型: {self.best_acc_model_path}")
        print(f" 最低损失模型: {self.best_loss_model_path}")
if __name__ == '__main__':
    # Train the English model first.
    english_trainer = SimpleTrainer(
        train_csv_file='data_en.csv',
        batch_size=32,
        is_english=True,
        max_length=256,
    )
    english_trainer.train_and_validate(num_epoch=20)

    # Release GPU memory held by the first run before starting the second.
    del english_trainer
    gc.collect()               # force Python garbage collection
    torch.cuda.empty_cache()   # drop cached CUDA allocations
    time.sleep(2)              # brief pause is enough

    # Then train the Chinese model with the same settings.
    chinese_trainer = SimpleTrainer(
        train_csv_file='data_zh.csv',
        batch_size=32,
        is_english=False,
        max_length=256,
    )
    chinese_trainer.train_and_validate(num_epoch=20)
|