| 123456789101112131415161718192021222324252627282930313233343536373839404142 |
# Flatten every matching file under `source_dir` into a single line and write
# the result to the same relative location under `target_dir`.
import os

source_dir = r"D:\code\repository\RAG资料库—开发"
target_dir = r"D:\code\repository\RAG资料库—上线"
file_extension = 'md'
paragraph_delimiter = '\n'  # paragraph separator that is about to be replaced
replacement = '。。'  # string written in place of the separator


def find_files(root_dir, extension):
    """Return the paths of all files below *root_dir* named ``*.<extension>``.

    The comparison is case-sensitive and requires the dot, so a file whose
    whole name is just the extension (e.g. ``md``) is not selected.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.
        extension: Extension to look for, without the leading dot.

    Returns:
        A list of paths, each built by joining the walked directory and the
        file name. Empty when *root_dir* does not exist or nothing matches.
    """
    suffix = '.' + extension
    matches = []
    for root, _dirs, names in os.walk(root_dir):
        matches.extend(
            os.path.join(root, name) for name in names if name.endswith(suffix)
        )
    return matches


def join_paragraphs(lines, delimiter, separator):
    """Drop delimiter-only lines and join the remaining ones with *separator*.

    Args:
        lines: Iterable of strings, typically the result of ``readlines()``.
        delimiter: Trailing marker removed from each line; a line equal to
            the delimiter itself is discarded entirely.
        separator: String inserted between the surviving lines.

    Returns:
        The joined text; an empty string when no line survives.
    """
    paragraphs = []
    for line in lines:
        if line == delimiter:
            # A line holding nothing but the delimiter carries no text.
            continue
        # The truthiness guard matters: every string "ends with" '' and
        # slicing by -0 would wipe the whole line.
        if delimiter and line.endswith(delimiter):
            line = line[:-len(delimiter)]
        paragraphs.append(line)
    return separator.join(paragraphs)


def convert_file(src_path, dst_path, delimiter, separator):
    """Read *src_path* as UTF-8, flatten it and write it to *dst_path*.

    The directory of *dst_path* is created when it is missing.

    Args:
        src_path: File to read.
        dst_path: File to write; overwritten if it already exists.
        delimiter: Passed through to ``join_paragraphs``.
        separator: Passed through to ``join_paragraphs``.
    """
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(dst_dir, exist_ok=True)
    with open(src_path, "r", encoding="utf-8") as src:
        lines = src.readlines()
    flattened = join_paragraphs(lines, delimiter, separator)
    with open(dst_path, "w", encoding="utf-8") as dst:
        dst.write(flattened)


def convert_tree(src_root, dst_root, extension, delimiter, separator):
    """Convert every matching file under *src_root* into *dst_root*.

    Each output keeps its path relative to *src_root*.

    Args:
        src_root: Directory searched for input files.
        dst_root: Directory that receives the converted files.
        extension: Extension of the files to convert, without the dot.
        delimiter: Passed through to ``join_paragraphs``.
        separator: Passed through to ``join_paragraphs``.

    Returns:
        The list of written paths, in the order they were processed.
    """
    written = []
    for src_path in find_files(src_root, extension):
        # relpath + join instead of str.replace: a substring replace breaks
        # when src_root ends with a separator or recurs later in the path.
        relative = os.path.relpath(src_path, src_root)
        dst_path = os.path.join(dst_root, relative)
        convert_file(src_path, dst_path, delimiter, separator)
        written.append(dst_path)
    return written


def main():
    """Run the conversion with the module-level settings."""
    convert_tree(
        source_dir,
        target_dir,
        file_extension,
        paragraph_delimiter,
        replacement,
    )


if __name__ == "__main__":
    main()
|