# line_break_replacement.py (1.3 KB)

  # Usage: python line_break_replacement.py   (edit the settings below first)
"""Flatten Markdown files from one directory tree into a mirrored tree.

Every file under ``source_dir`` whose name ends in ``.<file_extension>`` is
read as UTF-8, its blank lines are dropped, and the remaining lines are joined
with ``replacement`` so that each output file holds a single line.  The result
is written to the same relative location under ``target_dir``.
"""
import os

source_dir = r"D:\code\repository\RAG资料库—开发"
target_dir = r"D:\code\repository\RAG资料库—上线"
file_extension = 'md'
paragraph_delimiter = '\n'  # paragraph separator that is about to be replaced
replacement = '。。'  # text written in place of each separator


def _find_files(root_dir, extension):
    """Return the paths of all files under *root_dir* named ``*.<extension>``."""
    # Require the dot: a bare name such as "md" has no extension and is skipped.
    suffix = '.' + extension
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith(suffix)
    ]


def _target_path(source_path, source_root, target_root):
    """Map *source_path* to the same relative location under *target_root*."""
    # relpath only strips the leading root, so a root name that reappears
    # deeper in the path (or a trailing separator on the root) is handled.
    return os.path.join(target_root, os.path.relpath(source_path, source_root))


def _join_paragraphs(lines, delimiter, joiner):
    """Drop delimiter-only lines, strip one trailing *delimiter*, join with *joiner*."""
    kept = []
    for line in lines:
        if line == delimiter:
            # A line holding nothing but the delimiter is an empty paragraph.
            continue
        if delimiter and line.endswith(delimiter):
            line = line[:-len(delimiter)]
        kept.append(line)
    return joiner.join(kept)


def convert_tree(src_root=source_dir, dst_root=target_dir,
                 extension=file_extension, delimiter=paragraph_delimiter,
                 joiner=replacement):
    """Flatten every matching file under *src_root* into *dst_root*.

    Args:
        src_root: Directory that is walked recursively for input files.
        dst_root: Directory that receives the output, mirroring the relative
            layout found under *src_root*; missing directories are created.
        extension: File extension to select, without the leading dot.
        delimiter: Line terminator to remove.  Lines come from
            ``readlines()``, so only a newline is meaningful here.
        joiner: Text inserted between the surviving lines.

    Returns:
        The list of output paths that were written, in processing order.

    I/O and decoding errors raised while reading or writing are not caught.
    """
    written = []
    # The file list is fully built before anything is written, so files
    # created during this run are never picked up as inputs.
    for source_path in _find_files(src_root, extension):
        save_path = _target_path(source_path, src_root, dst_root)

        # Make sure the destination directory exists.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        # Read the file.
        with open(source_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Save the flattened text.
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(_join_paragraphs(lines, delimiter, joiner))
        written.append(save_path)
    return written


if __name__ == "__main__":
    convert_tree()