| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- """
- 文本预处理工具
- """
- import re
- import string
- from typing import List, Optional
- import logging
- logger = logging.getLogger(__name__)
class TextPreprocessor:
    """Text preprocessor: cleaning, tokenization, and stop-word removal."""

    def __init__(self):
        """Create a preprocessor with an empty stop-word set (a word list can be loaded later)."""
        self.stop_words = set()

    def clean_text(self, text: str,
                   remove_punctuation: bool = True,
                   remove_numbers: bool = False,
                   remove_extra_spaces: bool = True,
                   to_lower: bool = True) -> str:
        """Clean *text* according to the enabled options.

        Non-string input yields "". Steps (each optional) run in order:
        lowercase, strip ASCII punctuation, drop digit runs, collapse
        whitespace runs to single spaces and trim the ends.
        """
        if not isinstance(text, str):
            return ""

        cleaned = text.lower() if to_lower else text
        if remove_punctuation:
            # One translate() pass removes every char in string.punctuation.
            cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
        if remove_numbers:
            cleaned = re.sub(r'\d+', '', cleaned)
        if remove_extra_spaces:
            cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return cleaned

    def tokenize(self, text: str, method: str = 'split') -> List[str]:
        """Split *text* into tokens.

        method='split' uses whitespace splitting; method='regex' extracts
        \\b\\w+\\b word runs. Any other method raises ValueError.
        """
        if method == 'regex':
            return re.findall(r'\b\w+\b', text)
        if method == 'split':
            return text.split()
        raise ValueError(f"不支持的分词方法: {method}")

    def remove_stopwords(self, tokens: List[str], custom_stopwords: Optional[List[str]] = None) -> List[str]:
        """Drop stop words from *tokens*.

        A non-empty *custom_stopwords* list overrides the instance set.
        """
        blocked = set(custom_stopwords) if custom_stopwords else self.stop_words
        return [tok for tok in tokens if tok not in blocked]

    def preprocess(self, text: str,
                   clean: bool = True,
                   tokenize: bool = True,
                   remove_stopwords: bool = False,
                   **kwargs) -> List[str]:
        """Run the full pipeline: optional clean -> tokenize -> stop-word filter.

        Extra keyword arguments are forwarded to clean_text(); when
        tokenize is False the (possibly cleaned) text is returned as a
        single-element list.
        """
        stage = self.clean_text(text, **kwargs) if clean else text
        tokens = self.tokenize(stage) if tokenize else [stage]
        if remove_stopwords:
            tokens = self.remove_stopwords(tokens)
        return tokens
class TextNormalizer:
    """Static helpers that normalize raw text.

    Covers whitespace collapsing, Unicode NFKD normalization, and removal
    of HTML tags, URLs, and e-mail addresses.
    """

    # Patterns are compiled once at class-definition time. The original
    # re-imported `re` (already imported at module top) and recompiled the
    # pattern on every call; the patterns themselves are unchanged.
    _WS_RE = re.compile(r'\s+')
    _TAG_RE = re.compile('<.*?>')
    _URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    _EMAIL_RE = re.compile(r'\S+@\S+')

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Collapse whitespace runs to single spaces and strip both ends."""
        return TextNormalizer._WS_RE.sub(' ', text).strip()

    @staticmethod
    def normalize_unicode(text: str) -> str:
        """Return the NFKD (compatibility decomposition) form of *text*."""
        return unicodedata.normalize('NFKD', text)

    @staticmethod
    def remove_html_tags(text: str) -> str:
        """Strip anything of the form <...> (non-greedy match) from *text*."""
        return TextNormalizer._TAG_RE.sub('', text)

    @staticmethod
    def remove_urls(text: str) -> str:
        """Delete http/https URLs from *text*."""
        return TextNormalizer._URL_RE.sub('', text)

    @staticmethod
    def remove_emails(text: str) -> str:
        """Delete e-mail-like tokens (non-space@non-space) from *text*."""
        return TextNormalizer._EMAIL_RE.sub('', text)
class TextAugmenter:
    """Text augmenter.

    All four strategies below are currently stubs: they accept the usual
    EDA-style ratio parameters but return the input text unchanged.
    """

    @staticmethod
    def synonym_replacement(text: str, replacement_ratio: float = 0.1) -> str:
        """Synonym replacement (stub — returns *text* unchanged).

        Intended to replace roughly *replacement_ratio* of the tokens with
        synonyms; requires a synonym dictionary or an NLP library, so the
        logic is not implemented yet.
        """
        # Synonym-replacement logic would go here
        # (needs a synonym dictionary or an NLP library).
        return text

    @staticmethod
    def random_insertion(text: str, insertion_ratio: float = 0.1) -> str:
        """Random insertion (stub — returns *text* unchanged)."""
        # Random-insertion logic would go here.
        return text

    @staticmethod
    def random_swap(text: str, swap_ratio: float = 0.1) -> str:
        """Random token swapping (stub — returns *text* unchanged)."""
        # Random-swap logic would go here.
        return text

    @staticmethod
    def random_deletion(text: str, deletion_ratio: float = 0.1) -> str:
        """Random token deletion (stub — returns *text* unchanged)."""
        # Random-deletion logic would go here.
        return text
|