| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142 |
- """
- 通用数据加载器
- """
- import pandas as pd
- import numpy as np
- from pathlib import Path
- from typing import Union, Dict, Any, Optional
- import logging
- logger = logging.getLogger(__name__)
class DataLoader:
    """Read and write tabular data at one fixed path via pandas.

    The path is chosen at construction time.  Every ``load_*`` / ``save_*``
    method operates on that path and forwards its keyword arguments to the
    corresponding pandas call.  Any exception is logged and then re-raised
    unchanged, so callers still see the original error.
    """

    def __init__(self, data_path: Union[str, Path]):
        """Remember ``data_path`` (string or ``Path``) as a ``Path``."""
        self.data_path = Path(data_path)

    def load_csv(self, **kwargs) -> pd.DataFrame:
        """Read the file as CSV.

        Args:
            **kwargs: Passed through to ``pandas.read_csv``.

        Returns:
            The DataFrame produced by ``pandas.read_csv``.

        Raises:
            Exception: Whatever the read raised, after it has been logged.
        """
        try:
            frame = pd.read_csv(self.data_path, **kwargs)
            logger.info(f"成功加载CSV文件: {self.data_path}")
        except Exception as e:
            logger.error(f"加载CSV文件失败: {e}")
            raise
        return frame

    def load_json(self, **kwargs) -> pd.DataFrame:
        """Read the file as JSON.

        Args:
            **kwargs: Passed through to ``pandas.read_json``.

        Returns:
            The DataFrame produced by ``pandas.read_json``.

        Raises:
            Exception: Whatever the read raised, after it has been logged.
        """
        try:
            frame = pd.read_json(self.data_path, **kwargs)
            logger.info(f"成功加载JSON文件: {self.data_path}")
        except Exception as e:
            logger.error(f"加载JSON文件失败: {e}")
            raise
        return frame

    def load_parquet(self, **kwargs) -> pd.DataFrame:
        """Read the file as Parquet.

        Args:
            **kwargs: Passed through to ``pandas.read_parquet``.

        Returns:
            The DataFrame produced by ``pandas.read_parquet``.

        Raises:
            Exception: Whatever the read raised, after it has been logged.
        """
        try:
            frame = pd.read_parquet(self.data_path, **kwargs)
            logger.info(f"成功加载Parquet文件: {self.data_path}")
        except Exception as e:
            logger.error(f"加载Parquet文件失败: {e}")
            raise
        return frame

    def save_csv(self, df: pd.DataFrame, **kwargs) -> None:
        """Write ``df`` to the file as CSV.

        Args:
            df: Frame to write.
            **kwargs: Passed through to ``DataFrame.to_csv``.

        Raises:
            Exception: Whatever the write raised, after it has been logged.
        """
        target = self.data_path
        try:
            df.to_csv(target, **kwargs)
            logger.info(f"成功保存CSV文件: {self.data_path}")
        except Exception as e:
            logger.error(f"保存CSV文件失败: {e}")
            raise

    def save_json(self, df: pd.DataFrame, **kwargs) -> None:
        """Write ``df`` to the file as JSON.

        Args:
            df: Frame to write.
            **kwargs: Passed through to ``DataFrame.to_json``.

        Raises:
            Exception: Whatever the write raised, after it has been logged.
        """
        target = self.data_path
        try:
            df.to_json(target, **kwargs)
            logger.info(f"成功保存JSON文件: {self.data_path}")
        except Exception as e:
            logger.error(f"保存JSON文件失败: {e}")
            raise

    def save_parquet(self, df: pd.DataFrame, **kwargs) -> None:
        """Write ``df`` to the file as Parquet.

        Args:
            df: Frame to write.
            **kwargs: Passed through to ``DataFrame.to_parquet``.

        Raises:
            Exception: Whatever the write raised, after it has been logged.
        """
        target = self.data_path
        try:
            df.to_parquet(target, **kwargs)
            logger.info(f"成功保存Parquet文件: {self.data_path}")
        except Exception as e:
            logger.error(f"保存Parquet文件失败: {e}")
            raise
class ImageDataLoader:
    """Discover image files under a directory and tabulate them.

    ``load_images`` walks ``data_dir`` recursively and returns the matching
    file paths; ``create_dataset_info`` turns a list of paths into a
    DataFrame with one row per image.
    """

    # Extensions searched when the caller does not supply any.  Kept as an
    # immutable tuple so it can never be modified through a default argument.
    _DEFAULT_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, data_dir: Union[str, Path]):
        """Remember ``data_dir`` (string or ``Path``) as a ``Path``."""
        self.data_dir = Path(data_dir)

    def load_images(self, extensions: Optional[list] = None) -> list:
        """Return the sorted paths of all image files below ``data_dir``.

        Args:
            extensions: File-name endings to accept, e.g. ``['.jpg', '.png']``.
                Matching is literal and case-insensitive.  ``None`` (the
                default) means ``.jpg``, ``.jpeg``, ``.png`` and ``.bmp``.

        Returns:
            A sorted list of ``Path`` objects, each appearing exactly once.
            Directories whose names end in an accepted extension are skipped.
        """
        if extensions is None:
            extensions = self._DEFAULT_EXTENSIONS
        accepted = tuple(ext.lower() for ext in extensions)

        # One traversal with a case-insensitive suffix test.  Globbing the
        # lower- and upper-case pattern separately returned every file twice
        # on case-insensitive filesystems (and whenever the caller passed an
        # already upper-case extension), and missed mixed case like ".Jpg".
        image_paths = sorted(
            candidate
            for candidate in self.data_dir.glob('**/*')
            if candidate.name.lower().endswith(accepted) and candidate.is_file()
        )

        logger.info(f"找到 {len(image_paths)} 个图像文件")
        return image_paths

    def create_dataset_info(self, image_paths: list, label_func: callable = None) -> pd.DataFrame:
        """Build a DataFrame describing ``image_paths``.

        Args:
            image_paths: Paths to describe; ``Path`` objects or strings.
            label_func: Optional callable receiving each item of
                ``image_paths`` exactly as given and returning its label.
                When omitted, the name of the containing folder is used.

        Returns:
            A DataFrame with the columns ``image_path``, ``label`` and
            ``filename``.  The columns are present even when ``image_paths``
            is empty.
        """
        columns = ['image_path', 'label', 'filename']
        records = []
        for item in image_paths:
            # Normalise so that plain strings work as well as Path objects.
            img_path = Path(item)
            if label_func is not None:
                label = label_func(item)
            else:
                # Default: take the label from the parent folder's name.
                label = img_path.parent.name
            records.append({
                'image_path': str(img_path),
                'label': label,
                'filename': img_path.name,
            })

        # Explicit columns keep the schema stable for an empty input.
        return pd.DataFrame(records, columns=columns)
class TextDataLoader:
    """Read a text file at one fixed path, whole or line by line.

    Any exception raised while reading is logged and then re-raised
    unchanged.
    """

    def __init__(self, data_path: Union[str, Path]):
        """Remember ``data_path`` (string or ``Path``) as a ``Path``."""
        self.data_path = Path(data_path)

    def load_text(self, encoding: str = 'utf-8') -> str:
        """Return the entire file content as one string.

        Args:
            encoding: Text encoding used to decode the file.

        Raises:
            Exception: Whatever the read raised, after it has been logged.
        """
        try:
            with self.data_path.open('r', encoding=encoding) as handle:
                content = handle.read()
            logger.info(f"成功加载文本文件: {self.data_path}")
        except Exception as e:
            logger.error(f"加载文本文件失败: {e}")
            raise
        return content

    def load_lines(self, encoding: str = 'utf-8') -> list:
        """Return the file content as a list of lines.

        Line terminators are kept on each element.

        Args:
            encoding: Text encoding used to decode the file.

        Raises:
            Exception: Whatever the read raised, after it has been logged.
        """
        try:
            with self.data_path.open('r', encoding=encoding) as handle:
                # Iterating a text file yields the same items as readlines().
                lines = list(handle)
            logger.info(f"成功加载文本文件,共 {len(lines)} 行: {self.data_path}")
        except Exception as e:
            logger.error(f"加载文本文件失败: {e}")
            raise
        return lines
|