"""Text preprocessing utilities.

Provides three helper classes:

* ``TextPreprocessor`` -- cleaning, tokenizing and stop-word filtering.
* ``TextNormalizer``   -- stateless normalizers (whitespace, Unicode, HTML,
  URLs, e-mail addresses).
* ``TextAugmenter``    -- placeholder augmentation hooks that currently return
  their input unchanged.
"""

import logging
import re
import string
import unicodedata
from typing import List, Optional

logger = logging.getLogger(__name__)

# Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_WORD_RE = re.compile(r'\b\w+\b')
# ``[^>]*`` (rather than ``.*?``) also matches tags that contain newlines.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
# Consume everything up to whitespace, an angle bracket or a double quote, so
# fragments (``#...``) and ``~`` paths go with the URL while adjacent markup
# and quoted-attribute delimiters are left alone.
_URL_RE = re.compile(r'https?://[^\s<>"]+')
_EMAIL_RE = re.compile(r'\S+@\S+')


class TextPreprocessor:
    """Clean, tokenize and stop-word-filter raw text."""

    def __init__(self):
        """Create a preprocessor with an empty stop-word set."""
        # Starts empty; callers may populate it with a stop-word list.
        self.stop_words = set()

    def clean_text(self,
                   text: str,
                   remove_punctuation: bool = True,
                   remove_numbers: bool = False,
                   remove_extra_spaces: bool = True,
                   to_lower: bool = True) -> str:
        """Return a cleaned copy of ``text``.

        Steps run in a fixed order: lower-casing, punctuation removal, digit
        removal, whitespace collapsing.

        Args:
            text: Input string. Any non-``str`` value yields ``""``.
            remove_punctuation: Strip every character in
                ``string.punctuation`` (ASCII punctuation only).
            remove_numbers: Strip runs of digits.
            remove_extra_spaces: Collapse whitespace runs to a single space
                and trim both ends.
            to_lower: Lower-case the text.

        Returns:
            The cleaned string, or ``""`` if ``text`` is not a string.
        """
        if not isinstance(text, str):
            return ""

        if to_lower:
            text = text.lower()

        if remove_punctuation:
            text = text.translate(_PUNCTUATION_TABLE)

        if remove_numbers:
            text = _DIGITS_RE.sub('', text)

        if remove_extra_spaces:
            text = _WHITESPACE_RE.sub(' ', text).strip()

        return text

    def tokenize(self, text: str, method: str = 'split') -> List[str]:
        """Split ``text`` into tokens.

        Args:
            text: String to tokenize.
            method: ``'split'`` splits on whitespace; ``'regex'`` extracts
                runs of word characters.

        Returns:
            The list of tokens.

        Raises:
            ValueError: If ``method`` is neither ``'split'`` nor ``'regex'``.
        """
        if method == 'split':
            return text.split()
        if method == 'regex':
            return _WORD_RE.findall(text)
        raise ValueError(f"不支持的分词方法: {method}")

    def remove_stopwords(self,
                         tokens: List[str],
                         custom_stopwords: Optional[List[str]] = None) -> List[str]:
        """Drop stop words from ``tokens``.

        Args:
            tokens: Tokens to filter.
            custom_stopwords: Stop words to use for this call. When ``None``
                or empty, ``self.stop_words`` is used instead.

        Returns:
            A new list holding the tokens that are not stop words, in their
            original order.
        """
        if custom_stopwords:
            stop_words = set(custom_stopwords)
        else:
            stop_words = self.stop_words

        return [token for token in tokens if token not in stop_words]

    def preprocess(self,
                   text: str,
                   clean: bool = True,
                   tokenize: bool = True,
                   remove_stopwords: bool = False,
                   **kwargs) -> List[str]:
        """Run the full pipeline: clean, tokenize, filter stop words.

        Args:
            text: Raw input text.
            clean: Run ``clean_text`` first.
            tokenize: Tokenize with the default method; when false the
                (possibly cleaned) text is returned as a one-element list.
            remove_stopwords: Filter the tokens against ``self.stop_words``.
            **kwargs: Forwarded to ``clean_text``; ignored when ``clean`` is
                false.

        Returns:
            The resulting list of tokens.
        """
        if clean:
            text = self.clean_text(text, **kwargs)

        if tokenize:
            tokens = self.tokenize(text)
        else:
            tokens = [text]

        if remove_stopwords:
            tokens = self.remove_stopwords(tokens)

        return tokens


class TextNormalizer:
    """Stateless text normalization helpers."""

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Collapse whitespace runs to one space and trim both ends."""
        return _WHITESPACE_RE.sub(' ', text).strip()

    @staticmethod
    def normalize_unicode(text: str) -> str:
        """Return ``text`` in Unicode NFKD (compatibility decomposed) form."""
        return unicodedata.normalize('NFKD', text)

    @staticmethod
    def remove_html_tags(text: str) -> str:
        """Remove every ``<...>`` tag, including tags that span lines."""
        return _HTML_TAG_RE.sub('', text)

    @staticmethod
    def remove_urls(text: str) -> str:
        """Remove ``http://`` and ``https://`` URLs.

        A URL is taken to extend up to the next whitespace, ``<``, ``>`` or
        ``"``. Surrounding whitespace is left untouched.
        """
        return _URL_RE.sub('', text)

    @staticmethod
    def remove_emails(text: str) -> str:
        """Remove every whitespace-delimited token that contains ``@``."""
        return _EMAIL_RE.sub('', text)


class TextAugmenter:
    """Text augmentation hooks.

    Every method is currently a placeholder: it returns ``text`` unchanged
    and does not use its ratio argument.
    """

    @staticmethod
    def synonym_replacement(text: str, replacement_ratio: float = 0.1) -> str:
        """Synonym replacement (not implemented; returns ``text`` as is)."""
        # TODO: needs a synonym dictionary or an NLP library.
        return text

    @staticmethod
    def random_insertion(text: str, insertion_ratio: float = 0.1) -> str:
        """Random insertion (not implemented; returns ``text`` as is)."""
        # TODO: implement random insertion.
        return text

    @staticmethod
    def random_swap(text: str, swap_ratio: float = 0.1) -> str:
        """Random swap (not implemented; returns ``text`` as is)."""
        # TODO: implement random swap.
        return text

    @staticmethod
    def random_deletion(text: str, deletion_ratio: float = 0.1) -> str:
        """Random deletion (not implemented; returns ``text`` as is)."""
        # TODO: implement random deletion.
        return text