# text.py
"""
Text preprocessing utilities.
"""
import logging
import re
import string
import unicodedata
from typing import List, Optional
  8. logger = logging.getLogger(__name__)
  9. class TextPreprocessor:
  10. """文本预处理器"""
  11. def __init__(self):
  12. """初始化文本预处理器"""
  13. self.stop_words = set() # 可以加载停用词表
  14. def clean_text(self, text: str,
  15. remove_punctuation: bool = True,
  16. remove_numbers: bool = False,
  17. remove_extra_spaces: bool = True,
  18. to_lower: bool = True) -> str:
  19. """清理文本"""
  20. if not isinstance(text, str):
  21. return ""
  22. # 转换为小写
  23. if to_lower:
  24. text = text.lower()
  25. # 移除标点符号
  26. if remove_punctuation:
  27. text = text.translate(str.maketrans('', '', string.punctuation))
  28. # 移除数字
  29. if remove_numbers:
  30. text = re.sub(r'\d+', '', text)
  31. # 移除多余空格
  32. if remove_extra_spaces:
  33. text = re.sub(r'\s+', ' ', text).strip()
  34. return text
  35. def tokenize(self, text: str, method: str = 'split') -> List[str]:
  36. """分词"""
  37. if method == 'split':
  38. return text.split()
  39. elif method == 'regex':
  40. # 使用正则表达式分词
  41. return re.findall(r'\b\w+\b', text)
  42. else:
  43. raise ValueError(f"不支持的分词方法: {method}")
  44. def remove_stopwords(self, tokens: List[str], custom_stopwords: Optional[List[str]] = None) -> List[str]:
  45. """移除停用词"""
  46. if custom_stopwords:
  47. stop_words = set(custom_stopwords)
  48. else:
  49. stop_words = self.stop_words
  50. return [token for token in tokens if token not in stop_words]
  51. def preprocess(self, text: str,
  52. clean: bool = True,
  53. tokenize: bool = True,
  54. remove_stopwords: bool = False,
  55. **kwargs) -> List[str]:
  56. """完整的文本预处理流程"""
  57. if clean:
  58. text = self.clean_text(text, **kwargs)
  59. if tokenize:
  60. tokens = self.tokenize(text)
  61. else:
  62. tokens = [text]
  63. if remove_stopwords:
  64. tokens = self.remove_stopwords(tokens)
  65. return tokens
  66. class TextNormalizer:
  67. """文本标准化器"""
  68. @staticmethod
  69. def normalize_whitespace(text: str) -> str:
  70. """标准化空白字符"""
  71. return re.sub(r'\s+', ' ', text).strip()
  72. @staticmethod
  73. def normalize_unicode(text: str) -> str:
  74. """标准化Unicode字符"""
  75. import unicodedata
  76. return unicodedata.normalize('NFKD', text)
  77. @staticmethod
  78. def remove_html_tags(text: str) -> str:
  79. """移除HTML标签"""
  80. import re
  81. clean = re.compile('<.*?>')
  82. return re.sub(clean, '', text)
  83. @staticmethod
  84. def remove_urls(text: str) -> str:
  85. """移除URL"""
  86. import re
  87. return re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
  88. @staticmethod
  89. def remove_emails(text: str) -> str:
  90. """移除邮箱地址"""
  91. import re
  92. return re.sub(r'\S+@\S+', '', text)
  93. class TextAugmenter:
  94. """文本增强器"""
  95. @staticmethod
  96. def synonym_replacement(text: str, replacement_ratio: float = 0.1) -> str:
  97. """同义词替换"""
  98. # 这里可以实现同义词替换逻辑
  99. # 需要同义词词典或使用NLP库
  100. return text
  101. @staticmethod
  102. def random_insertion(text: str, insertion_ratio: float = 0.1) -> str:
  103. """随机插入"""
  104. # 实现随机插入逻辑
  105. return text
  106. @staticmethod
  107. def random_swap(text: str, swap_ratio: float = 0.1) -> str:
  108. """随机交换"""
  109. # 实现随机交换逻辑
  110. return text
  111. @staticmethod
  112. def random_deletion(text: str, deletion_ratio: float = 0.1) -> str:
  113. """随机删除"""
  114. # 实现随机删除逻辑
  115. return text