""" 通用数据加载器 """ import pandas as pd import numpy as np from pathlib import Path from typing import Union, Dict, Any, Optional import logging logger = logging.getLogger(__name__) class DataLoader: """通用数据加载器""" def __init__(self, data_path: Union[str, Path]): """初始化数据加载器""" self.data_path = Path(data_path) def load_csv(self, **kwargs) -> pd.DataFrame: """加载CSV文件""" try: df = pd.read_csv(self.data_path, **kwargs) logger.info(f"成功加载CSV文件: {self.data_path}") return df except Exception as e: logger.error(f"加载CSV文件失败: {e}") raise def load_json(self, **kwargs) -> pd.DataFrame: """加载JSON文件""" try: df = pd.read_json(self.data_path, **kwargs) logger.info(f"成功加载JSON文件: {self.data_path}") return df except Exception as e: logger.error(f"加载JSON文件失败: {e}") raise def load_parquet(self, **kwargs) -> pd.DataFrame: """加载Parquet文件""" try: df = pd.read_parquet(self.data_path, **kwargs) logger.info(f"成功加载Parquet文件: {self.data_path}") return df except Exception as e: logger.error(f"加载Parquet文件失败: {e}") raise def save_csv(self, df: pd.DataFrame, **kwargs) -> None: """保存为CSV文件""" try: df.to_csv(self.data_path, **kwargs) logger.info(f"成功保存CSV文件: {self.data_path}") except Exception as e: logger.error(f"保存CSV文件失败: {e}") raise def save_json(self, df: pd.DataFrame, **kwargs) -> None: """保存为JSON文件""" try: df.to_json(self.data_path, **kwargs) logger.info(f"成功保存JSON文件: {self.data_path}") except Exception as e: logger.error(f"保存JSON文件失败: {e}") raise def save_parquet(self, df: pd.DataFrame, **kwargs) -> None: """保存为Parquet文件""" try: df.to_parquet(self.data_path, **kwargs) logger.info(f"成功保存Parquet文件: {self.data_path}") except Exception as e: logger.error(f"保存Parquet文件失败: {e}") raise class ImageDataLoader: """图像数据加载器""" def __init__(self, data_dir: Union[str, Path]): """初始化图像数据加载器""" self.data_dir = Path(data_dir) def load_images(self, extensions: list = ['.jpg', '.jpeg', '.png', '.bmp']) -> list: """加载图像文件路径""" image_paths = [] for ext in extensions: image_paths.extend(self.data_dir.glob(f'**/*{ext}')) image_paths.extend(self.data_dir.glob(f'**/*{ext.upper()}')) logger.info(f"找到 {len(image_paths)} 个图像文件") return sorted(image_paths) def create_dataset_info(self, image_paths: list, label_func: callable = None) -> pd.DataFrame: """创建数据集信息DataFrame""" data = [] for img_path in image_paths: if label_func: label = label_func(img_path) else: # 默认从文件夹名获取标签 label = img_path.parent.name data.append({ 'image_path': str(img_path), 'label': label, 'filename': img_path.name }) return pd.DataFrame(data) class TextDataLoader: """文本数据加载器""" def __init__(self, data_path: Union[str, Path]): """初始化文本数据加载器""" self.data_path = Path(data_path) def load_text(self, encoding: str = 'utf-8') -> str: """加载文本文件""" try: with open(self.data_path, 'r', encoding=encoding) as f: text = f.read() logger.info(f"成功加载文本文件: {self.data_path}") return text except Exception as e: logger.error(f"加载文本文件失败: {e}") raise def load_lines(self, encoding: str = 'utf-8') -> list: """按行加载文本文件""" try: with open(self.data_path, 'r', encoding=encoding) as f: lines = f.readlines() logger.info(f"成功加载文本文件,共 {len(lines)} 行: {self.data_path}") return lines except Exception as e: logger.error(f"加载文本文件失败: {e}") raise