# text.py
"""
Text preprocessing utilities.
"""
import logging
import re
import string
import unicodedata
from typing import List, Optional
  8. logger = logging.getLogger(__name__)
  9. class TextPreprocessor:
  10. """文本预处理器"""
  11. def __init__(self):
  12. """初始化文本预处理器"""
  13. self.stop_words = set() # 可以加载停用词表
  14. def clean_text(self, text: str,
  15. remove_punctuation: bool = True,
  16. remove_numbers: bool = False,
  17. remove_extra_spaces: bool = True,
  18. to_lower: bool = True) -> str:
  19. """清理文本"""
  20. if not isinstance(text, str):
  21. return ""
  22. # 转换为小写
  23. if to_lower:
  24. text = text.lower()
  25. # 移除标点符号
  26. if remove_punctuation:
  27. text = text.translate(str.maketrans('', '', string.punctuation))
  28. # 移除数字
  29. if remove_numbers:
  30. text = re.sub(r'\d+', '', text)
  31. # 移除多余空格
  32. if remove_extra_spaces:
  33. text = re.sub(r'\s+', ' ', text).strip()
  34. return text
  35. def tokenize(self, text: str, method: str = 'split') -> List[str]:
  36. """分词"""
  37. if method == 'split':
  38. return text.split()
  39. elif method == 'regex':
  40. # 使用正则表达式分词
  41. return re.findall(r'\b\w+\b', text)
  42. else:
  43. raise ValueError(f"不支持的分词方法: {method}")
  44. def remove_stopwords(self, tokens: List[str], custom_stopwords: Optional[List[str]] = None) -> List[str]:
  45. """移除停用词"""
  46. if custom_stopwords:
  47. stop_words = set(custom_stopwords)
  48. else:
  49. stop_words = self.stop_words
  50. return [token for token in tokens if token not in stop_words]
  51. def preprocess(self, text: str,
  52. clean: bool = True,
  53. tokenize: bool = True,
  54. remove_stopwords: bool = False,
  55. **kwargs) -> List[str]:
  56. """完整的文本预处理流程"""
  57. if clean:
  58. text = self.clean_text(text, **kwargs)
  59. if tokenize:
  60. tokens = self.tokenize(text)
  61. else:
  62. tokens = [text]
  63. if remove_stopwords:
  64. tokens = self.remove_stopwords(tokens)
  65. return tokens
  66. class TextNormalizer:
  67. """文本标准化器"""
  68. @staticmethod
  69. def normalize_whitespace(text: str) -> str:
  70. """标准化空白字符"""
  71. return re.sub(r'\s+', ' ', text).strip()
  72. @staticmethod
  73. def normalize_unicode(text: str) -> str:
  74. """标准化Unicode字符"""
  75. import unicodedata
  76. return unicodedata.normalize('NFKD', text)
  77. @staticmethod
  78. def remove_html_tags(text: str) -> str:
  79. """移除HTML标签"""
  80. import re
  81. clean = re.compile('<.*?>')
  82. return re.sub(clean, '', text)
  83. @staticmethod
  84. def remove_urls(text: str) -> str:
  85. """移除URL"""
  86. import re
  87. return re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
  88. @staticmethod
  89. def remove_emails(text: str) -> str:
  90. """移除邮箱地址"""
  91. import re
  92. return re.sub(r'\S+@\S+', '', text)
  93. class TextAugmenter:
  94. """文本增强器"""
  95. @staticmethod
  96. def synonym_replacement(text: str, replacement_ratio: float = 0.1) -> str:
  97. """同义词替换"""
  98. # 这里可以实现同义词替换逻辑
  99. # 需要同义词词典或使用NLP库
  100. return text
  101. @staticmethod
  102. def random_insertion(text: str, insertion_ratio: float = 0.1) -> str:
  103. """随机插入"""
  104. # 实现随机插入逻辑
  105. return text
  106. @staticmethod
  107. def random_swap(text: str, swap_ratio: float = 0.1) -> str:
  108. """随机交换"""
  109. # 实现随机交换逻辑
  110. return text
  111. @staticmethod
  112. def random_deletion(text: str, deletion_ratio: float = 0.1) -> str:
  113. """随机删除"""
  114. # 实现随机删除逻辑
  115. return text