| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- from abc import ABC, abstractmethod
- from typing import Iterator
- from typing import Any
- import os
- import re
class RAGBase(ABC):
    """Abstract base class declaring read/write/process/reset/text hooks.

    Subclasses must implement every abstract method below. The static
    helpers (`is_path_exist`, `is_title`, `is_page_number`) are concrete
    and can be used without instantiating a subclass.
    """

    # Heading shapes recognised by `is_title`. Every pattern is anchored at
    # the start of the text; compiled once at class creation.
    _TITLE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            # Dotted numeric headings: "1.1 Overview", "2.3.4概述"
            r'^\d+(?:\.\d+)+\s*[\u4e00-\u9fa5a-zA-Z]',
            # Single number, whitespace, then text: "1 Overview"
            r'^\d+\s+[\u4e00-\u9fa5a-zA-Z]',
            # Chinese numeral followed by an enumeration comma: "一、概述"
            r'^[一二三四五六七八九十]+、\s*[\u4e00-\u9fa5]',
            # Chinese numeral wrapped in literal brackets, ASCII "(一)" or
            # full-width (U+FF08 / U+FF09). The brackets must be matched as
            # characters; a bare "(...)" would only be a regex group.
            r'^[(\uff08][一二三四五六七八九十]+[)\uff09]\s*[\u4e00-\u9fa5]',
            # Chapter / section / article headings: "第一章 总则", "第3条 规定"
            r'^第[一二三四五六七八九十零百千\d]+[章节条]\s*[\u4e00-\u9fa5]',
            # Upper-case letter enumerations: "A. 概述"
            r'^[A-Z]\.\s*[\u4e00-\u9fa5]',
            # Half-bracket enumerations: "1)", "a)", also full-width U+FF09
            r'^\w+[)\uff09]',
            # Table-of-contents heading: "目录" / "目 录"
            r'^目\s*录',
        )
    )

    # Page-marker shapes recognised by `is_page_number`.
    _PAGE_NUMBER_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            # "第…页" marker followed by whitespace or the end of the text
            r'^第.+页(?!\S)',
            # Digits only
            r'^\d+$',
            # Upper-case Roman numerals only. Every component of the numeral
            # grammar is optional, so the lookahead is what rejects "".
            r'^(?=[MDCLXVI])M{0,3}(?:CM|CD|D?C{0,3})'
            r'(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})$',
        )
    )

    @abstractmethod
    def read(self, read_path: str):
        """Read a file from the given location.

        Args:
            read_path: Path of the file to read.
        """

    @abstractmethod
    def write(self, *args, **kwargs):
        """Write a file to a location; arguments are subclass-defined."""

    @abstractmethod
    def process(self, *args: Any, **kwargs: Any) -> Any:
        """Run the subclass-defined processing step.

        Arguments and return value are left to the implementation.
        """

    @abstractmethod
    def reset(self):
        """Reset the object's state."""

    @abstractmethod
    def text_generator(self, *args: Any, **kwargs: Any) -> Iterator[str]:
        """Produce text pieces one at a time.

        Args:
            *args: Subclass-defined positional arguments.
            **kwargs: Subclass-defined keyword arguments.

        Returns:
            An iterator over text strings.
        """

    @staticmethod
    def is_path_exist(path: str) -> None:
        """Raise if ``path`` does not exist; return ``None`` otherwise.

        Args:
            path: File system path to check.

        Raises:
            FileNotFoundError: If ``os.path.exists(path)`` is false. The
                exception carries ``errno.ENOENT`` as ``errno``, the message
                as ``strerror`` and ``path`` as ``filename``.
        """
        import errno

        if not os.path.exists(path):
            # OSError reads a two-argument call as (errno, strerror), which
            # would store the message as errno and the path as strerror.
            # The three-argument form fills in all fields correctly.
            raise FileNotFoundError(errno.ENOENT, '文件不存在!', path)

    @staticmethod
    def is_title(text: str) -> bool:
        """Return True when ``text`` begins with a recognised heading format.

        Only the start of the text is inspected. Recognised formats are
        numeric ("1.1 ..."), Chinese-numeral ("一、...", "(一)..."),
        chapter-style ("第一章 ..."), letter ("A. ..."), half-bracket
        ("1)", "a)") and the table-of-contents heading "目录".

        Args:
            text: Text to classify.

        Returns:
            True if any heading pattern matches at the start of ``text``.
        """
        return any(pat.search(text) for pat in RAGBase._TITLE_PATTERNS)

    @staticmethod
    def is_page_number(text: str) -> bool:
        """Return True when ``text`` looks like a page-number marker.

        Recognised forms are a "第…页" marker at the start of the text, a
        string consisting only of digits, or a string consisting only of an
        upper-case Roman numeral. The empty string is not a page number.

        Args:
            text: Text to classify.

        Returns:
            True if any page-number pattern matches.
        """
        return any(pat.search(text) for pat in RAGBase._PAGE_NUMBER_PATTERNS)
|