# rag_base.py (2.8 KB)
import errno
import os
import re
from abc import ABC, abstractmethod
from typing import Any
from typing import Iterator
  6. class RAGBase(ABC):
  7. @abstractmethod
  8. def read(self, read_path:str):
  9. """
  10. 从地址读入文件
  11. Args:
  12. read_path: 需要读取的文件路径
  13. Returns:
  14. """
  15. pass
  16. @abstractmethod
  17. def write(self, *args, **kwargs):
  18. """
  19. 向地址写入文件
  20. Returns:
  21. """
  22. pass
  23. @abstractmethod
  24. def process(self, *args: Any, **kwargs: Any) -> Any:
  25. pass
  26. @abstractmethod
  27. def reset(self):
  28. """
  29. 重置状态
  30. Returns:
  31. """
  32. pass
  33. @abstractmethod
  34. def text_generator(self, *args: Any, **kwargs: Any) -> Iterator[str]:
  35. """
  36. 文本生成器
  37. Args:
  38. *args:
  39. **kwargs:
  40. Returns:返回文本
  41. """
  42. pass
  43. @staticmethod
  44. def is_path_exist(path:str):
  45. """
  46. 判断路径是否存在,如果不存在就抛出异常
  47. Args:
  48. path: 文件路径
  49. Returns:
  50. """
  51. if not os.path.exists(path):
  52. raise FileNotFoundError('文件不存在!', path)
  53. @staticmethod
  54. def is_title(text: str) -> bool:
  55. """
  56. 判断文本中是否存在各种格式的标题, 检查开头
  57. """
  58. title_patterns = [
  59. # 数字编号标题
  60. r'^\d+(\.\d+)+\s*[\u4e00-\u9fa5a-zA-Z].*',
  61. r'^\d+\s+[\u4e00-\u9fa5a-zA-Z]',
  62. # 中文数字标题
  63. r'^[一二三四五六七八九十]+、\s*[\u4e00-\u9fa5].*',
  64. # 带括号的中文数字
  65. r'^([一二三四五六七八九十]+)\s*[\u4e00-\u9fa5].*',
  66. # 章节标题
  67. r'^第[一二三四五六七八九十零百千\d]+[章节条]\s*[\u4e00-\u9fa5].*',
  68. # 字母编号标题
  69. r'^[A-Z]\.\s*[\u4e00-\u9fa5].*',
  70. # 半个小括号组合, 1) a)
  71. r'^[\d\w]+\)',
  72. # 目录
  73. r'^目\s*录',
  74. ]
  75. for pat in title_patterns:
  76. if re.search(pat, text):
  77. return True
  78. return False
  79. @staticmethod
  80. def is_page_number(text: str) -> bool:
  81. """
  82. 判断文本中是否存在各种格式的标题, 检查开头
  83. """
  84. patterns = [
  85. # 纯页码
  86. r'^第.+页(?!\S)',
  87. # 仅包含数字
  88. r'^\d+$',
  89. # 仅包含罗马数字
  90. '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
  91. ]
  92. for pat in patterns:
  93. if re.search(pat, text):
  94. return True
  95. return False