Ver Fonte

初始化提交

jiyuhang há 3 meses atrás
commit
95f871f0e6
48 ficheiros alterados com 1344 adições e 0 exclusões
  1. 3 0
      .gitignore
  2. 464 0
      create_pcl_match_dictionary.py
  3. 30 0
      item_del_tool.py
  4. 0 0
      plc_dictionary/1181_plc_dictionary/1181_dict_level_1.json
  5. 0 0
      plc_dictionary/1181_plc_dictionary/1181_dict_level_2.json
  6. 0 0
      plc_dictionary/1181_plc_dictionary/1181_dict_name_2_code.json
  7. BIN
      plc_dictionary/1181_plc_dictionary/1181_knowledge.faiss
  8. BIN
      plc_dictionary/1181_plc_dictionary/1181_点位.xlsx
  9. 0 0
      plc_dictionary/1202_plc_dictionary/1202_dict_level_1.json
  10. 0 0
      plc_dictionary/1202_plc_dictionary/1202_dict_level_2.json
  11. 0 0
      plc_dictionary/1202_plc_dictionary/1202_dict_name_2_code.json
  12. BIN
      plc_dictionary/1202_plc_dictionary/1202_knowledge.faiss
  13. BIN
      plc_dictionary/1202_plc_dictionary/1202_点位.xlsx
  14. 0 0
      plc_dictionary/1450_plc_dictionary/1450_dict_level_1.json
  15. 0 0
      plc_dictionary/1450_plc_dictionary/1450_dict_level_2.json
  16. 0 0
      plc_dictionary/1450_plc_dictionary/1450_dict_name_2_code.json
  17. BIN
      plc_dictionary/1450_plc_dictionary/1450_knowledge.faiss
  18. BIN
      plc_dictionary/1450_plc_dictionary/1450_点位.xlsx
  19. 0 0
      plc_dictionary/92_plc_dictionary/92_dict_level_1.json
  20. 0 0
      plc_dictionary/92_plc_dictionary/92_dict_level_2.json
  21. 0 0
      plc_dictionary/92_plc_dictionary/92_dict_name_2_code.json
  22. BIN
      plc_dictionary/92_plc_dictionary/92_knowledge.faiss
  23. BIN
      plc_dictionary/92_plc_dictionary/92_点位.xlsx
  24. 335 0
      plclib.py
  25. BIN
      plc点位原始文件/1181_点位_原始.xlsx
  26. BIN
      plc点位原始文件/1202_点位_原始.xlsx
  27. BIN
      plc点位原始文件/1450_点位_原始.xlsx
  28. BIN
      plc点位原始文件/92_点位_原始.xlsx
  29. 10 0
      readme
  30. 132 0
      remote_model.py
  31. 8 0
      url_config.json
  32. 1 0
      user_maintain_dictionary/equivalent_words/dict_equivalent_wordmap.json
  33. 9 0
      user_maintain_dictionary/equivalent_words/equivalent_wordmap.txt
  34. 40 0
      user_maintain_dictionary/jieba_words/A综合.txt
  35. 9 0
      user_maintain_dictionary/jieba_words/B产水率.txt
  36. 40 0
      user_maintain_dictionary/jieba_words/B回收率.txt
  37. 15 0
      user_maintain_dictionary/jieba_words/B电导.txt
  38. 5 0
      user_maintain_dictionary/jieba_words/B脱盐率.txt
  39. 18 0
      user_maintain_dictionary/jieba_words/C膜渗透率.txt
  40. 53 0
      user_maintain_dictionary/jieba_words/C膜通量.txt
  41. 9 0
      user_maintain_dictionary/jieba_words/C跨膜压差.txt
  42. 21 0
      user_maintain_dictionary/jieba_words/产水压力.txt
  43. 30 0
      user_maintain_dictionary/jieba_words/产水流量.txt
  44. 27 0
      user_maintain_dictionary/jieba_words/段压差.txt
  45. 18 0
      user_maintain_dictionary/jieba_words/浓水压力.txt
  46. 6 0
      user_maintain_dictionary/jieba_words/浓水流量.txt
  47. 37 0
      user_maintain_dictionary/jieba_words/进水压力.txt
  48. 24 0
      user_maintain_dictionary/jieba_words/进水流量.txt

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+__pycache__/
+*.pyc
+.idea/

+ 464 - 0
create_pcl_match_dictionary.py

@@ -0,0 +1,464 @@
+import os
+
+from sympy.solvers.diophantine.diophantine import equivalent
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
+import sys
+sys.path.append(script_dir)
+import pandas as pd
+import jieba
+import jieba.posseg as pseg
+import re
+import numpy as np
+import json
+import textdistance
+import faiss
+from remote_model import RemoteBGEModel
+
+
+class PLCMatch:
+    """通过关键词+语义相似度的方式,从用户输入中匹配PLC点位"""
    def __init__(self, project_id: int):
        """Build all lookup structures for one plant: the name->code map, the
        jieba user vocabulary, the level-1/level-2 dictionaries, the
        equivalent-word (synonym) map and the FAISS knowledge base.

        NOTE(review): constructing this class touches the filesystem and the
        remote BGE model service; it is not a cheap object to create.
        """
        # Plant (water-works) id, kept as a string for path construction.
        self.project_id = str(project_id)
        # Absolute directory of this script.
        self.script_dir = os.path.dirname(os.path.abspath(__file__)) # script's absolute path
        # Root directory holding this plant's dictionary artefacts.
        self.plc_dict_root_dir = os.path.join(self.script_dir, f'plc_dictionary/{self.project_id}_plc_dictionary')
        # Read the PLC point file and build the name->code mapping.
        self.name_2_code_dict = self.__read_pcl()

        # Load the user-maintained word lists into the jieba vocabulary.
        user_dictionary_dir = os.path.join(self.script_dir, 'user_maintain_dictionary', 'jieba_words')
        user_dict_list = [os.path.join(user_dictionary_dir, _) for _ in os.listdir(user_dictionary_dir) if _.split('.')[-1] == 'txt']  # user dictionaries
        self.user_dict_list = user_dict_list
        self.__load_user_dict()

        # Level-2 dictionary: Chinese-only key -> list of full field names.
        self.dict_level_2 = self.__make_level_two_dictionary()

        # Level-1 dictionary: shared nz word -> list of level-2 keys.
        self.dict_level_1 = self.__make_level_one_dictionary()

        # Equivalent-word (synonym) mapping table.
        self.equivalent_wordmap_txt = os.path.join(self.script_dir,'user_maintain_dictionary','equivalent_words', 'equivalent_wordmap.txt')
        self.dict_equivalent_wordmap = self.__construct_equivalent_wordmap()

        # Knowledge base built from the Chinese field names of the PLC database.
        # Loads the remote bge-m3 / bge-reranker models.
        self.plc_database_name_template_list = list(self.name_2_code_dict.keys())
        self.model = RemoteBGEModel('dev')
        self.knowledge = self.__load_faiss_database()
+
+
+    def __load_faiss_database(self):
+        """从本地加载向量数据库"""
+        # 水厂的数据库字段知识库
+        faiss_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_knowledge.faiss')
+        # 尝试从本地加载
+        if os.path.exists(faiss_path):
+            print('PLC点位查询功能从本地加载点位字段向量知识库...')
+            return faiss.read_index(faiss_path)
+
+        # 如果不存在就尝试重新创建
+        # 首先,我们需要拿到数据库的点位名称,可以直接从name-code映射字典当中获取
+        plc_database_name_template_list = self.plc_database_name_template_list
+        # 调用远程embedding模型,one by one 地处理,远程模型通过配置参数进行归一化
+        embeddings = [self.model.encode([temp], normalize=True)[0] for temp in plc_database_name_template_list]
+        for _ in embeddings:
+            if _ is None:
+                raise RuntimeError('为plc数据库中文字段构建向量数据库时发生异常,embeddings不能存在None')
+        # 要求embeddings是一个二维矩阵,类型为float32
+        embeddings = np.array(embeddings, dtype=np.float32)
+        # 创建 FAISS 索引
+        dimension = embeddings[0].shape[0]
+        local_faiss = faiss.IndexFlatIP(dimension)  # 建立内积索引
+        local_faiss.add(embeddings)  # 添加索引
+        # 保存未来使用
+        faiss.write_index(local_faiss, faiss_path)
+        return local_faiss
+
+
    def __read_pcl(self):
        """
        Read the PLC point spreadsheet and build the name->code dictionary,
        caching the result as JSON inside the plant directory.
        :return: dict mapping normalised Chinese field name -> PLC code.
        """
        # Path of the cached name->code mapping.
        dict_name2code_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_dict_name_2_code.json')
        # Prefer the cached mapping on disk.
        if os.path.exists(dict_name2code_path):
            with open(dict_name2code_path, 'r', encoding='utf-8') as f:
                dict_name2code = json.load(f)
            return dict_name2code

        # No cache: rebuild from the point spreadsheet.
        # The spreadsheet must exist for this plant.
        pcl_file_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_点位.xlsx') # point file path
        if not os.path.exists(pcl_file_path):
            raise FileNotFoundError(f'{pcl_file_path} does not exist')
        # Load the points.
        points = pd.read_excel(pcl_file_path)
        # Column order assumed to be name | code -- TODO confirm for every plant's file.
        column_label_alias, column_label_code = points.columns.tolist()
        # Chinese name / English code columns.
        names = points.loc[:, column_label_alias].to_numpy()
        codes = points.loc[:, column_label_code].to_numpy()
        # Align naming style to the Xishan/Zhonghe convention: 超滤->UF,
        # 反渗透->RO, then rewrite 1#UF / 1#RO as UF1 / RO1.
        names = [s.replace('超滤','UF').replace('反渗透','RO') for s in names]
        names = [self.field_align(s) for s in names]
        # Name -> code dictionary.
        dict_name2code = dict(zip(names, codes))
        # Cache the mapping for later runs.
        with open(dict_name2code_path, 'w', encoding='utf-8') as f:
            json.dump(dict_name2code, f, ensure_ascii=False)
        return dict_name2code
+
+    def __load_user_dict(self):
+        """加载用户词典,添加到jieba词库"""
+        # 删除
+        jieba.del_word('反渗透')
+        jieba.del_word('超滤')
+        for user_dict_txt in self.user_dict_list:
+            # 检查文件是否存在
+            if not os.path.exists(user_dict_txt):
+                raise FileNotFoundError(f'{user_dict_txt} does not exist')
+            # 检查文件后缀名是否合法
+            if os.path.splitext(user_dict_txt)[1] != '.txt':
+                continue
+            # 分词库加载用户字典
+            jieba.load_userdict(user_dict_txt)
+
+    def __construct_equivalent_wordmap(self):
+        """构建等价词汇映射表,等价词汇的使用方式是将备查词的所有等效说法都纳入备查序列,从而保证了搜索的高召回率"""
+        # 检查文件是否存在
+        equivalent_wordmap_path = os.path.join(self.script_dir, 'user_maintain_dictionary','equivalent_words', 'dict_equivalent_wordmap.json')
+        if os.path.exists(equivalent_wordmap_path):
+            with open(equivalent_wordmap_path, 'r', encoding='utf-8') as f:
+                equivalent_wordmap = json.load(f)
+            return equivalent_wordmap
+        # 如果本地不存在等价词典json文件,那么就尝试创建
+        if not os.path.exists(self.equivalent_wordmap_txt):
+            raise FileNotFoundError(f'{self.equivalent_wordmap_txt} does not exist')
+
+        with open(self.equivalent_wordmap_txt, 'r', encoding='utf-8') as f:
+            all_lines = [_.strip() for _ in f.readlines()]
+        # 创建等价词汇映射表
+        dict_equi_wordmap = {}
+        for line in all_lines:
+            split_list = line.split('=')
+            for i in range(len(split_list)):
+                dict_equi_wordmap[split_list[i]] = split_list
+        with open(equivalent_wordmap_path, 'w', encoding='utf-8') as f:
+            json.dump(dict_equi_wordmap,f,ensure_ascii=False)
+        return dict_equi_wordmap
+
+    def __make_level_two_dictionary(self):
+        """创建二级字典,对点位所有字段进行正则匹配中文,将中文一样的字段聚合为同一个字典键值对,键为正则提取的中文字符"""
+        group_dict = {}
+        # 尝试从本地加载二级字典
+        dict_level2_dict_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_dict_level_2.json')
+        if os.path.exists(dict_level2_dict_path):
+            with open(dict_level2_dict_path, 'r', encoding='utf-8') as f:
+                group_dict = json.load(f)
+            return group_dict
+
+        if self.name_2_code_dict is None:
+            raise ValueError(f'name_2_code_dict is None', self.name_2_code_dict)
+        data = self.name_2_code_dict.keys()
+
+        # 创建二级字典
+        for item in data:
+            k = re.sub(r'[^\u4e00-\u9fa5]', '', item)
+            # 处理没有汉字的字段
+            if k == '':
+                k = "无"
+            if k not in group_dict.keys():
+                group_dict[k] = [item]
+            else:
+                group_dict[k].append(item)
+
+        # 保存二级字典到本地
+        with open(dict_level2_dict_path, 'w', encoding='utf-8') as f:
+            json.dump(group_dict, f, ensure_ascii=False)
+        return group_dict
+
+    @staticmethod
+    def cut_compair(arr_a: str, arr_b: str, condition='nz') -> str:
+        """
+        :param condition: 词性
+        :param arr_a:
+        :param arr_b:
+        :return: 第一个相同nz词
+        """
+        # a: w1,f1  w2,f2  w3, f3
+        # b: w1,f1  w2,f2  w3, f3
+
+        cut_arr_a = [list(_) for _ in pseg.lcut(arr_a)]
+        cut_arr_b = [list(_) for _ in pseg.lcut(arr_b)]
+        for i in range(len(cut_arr_a)):
+            for j in range(i, len(cut_arr_b)):
+                # 只比较nz词性
+                if cut_arr_a[i][1] != condition or cut_arr_b[j][1] != condition:
+                    continue
+                if cut_arr_a[i][0] == cut_arr_b[j][0] and cut_arr_a[i][1] == cut_arr_b[j][1]:
+                    return cut_arr_a[i][0]
+        return ''
+
+    def __make_level_one_dictionary(self):
+        """创建一级字典"""
+        group_dict = {}  # 存放二次分组的结果
+        # 尝试从本地加载一级字典
+        dict_level_1_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_dict_level_1.json')
+        if os.path.exists(dict_level_1_path):
+            with open(dict_level_1_path, 'r', encoding='utf-8') as f:
+                group_dict = json.load(f)
+            return group_dict
+
+        if self.dict_level_2.keys() is None:
+            raise ValueError(f'dict_lev2 is None', self.dict_level_2)
+        # 提取二级字典的所有key
+        data = self.dict_level_2.keys()
+
+        # 如果不存在就重新生成一级字典
+        # 根据用户词典进行分词,筛选出所有带nz词的字段
+        no_nz_list = []  # 没有nz词的字段
+        nz_list = []  # 有nz词的字段
+        for item in data:
+            # 判断是否存在nz名词
+            is_exist_n = False
+            for w, f in pseg.lcut(item):
+                if f == 'nz':  # 查看词性
+                    is_exist_n = True
+                    break
+            if is_exist_n:  # 存在词
+                nz_list.append(item)
+            else:  # 不存在nz词
+                no_nz_list.append(item)
+
+        # 聚合具有相同nz名词的字段
+        while len(nz_list) > 0:
+            pos = [1 for _ in range(len(nz_list))]  # 0表示不被取,1表示需要被取,默认都要被取,用来更新nz_list给下次判断使用
+            pos[0] = 0  # 标记第一个单词为不需要处理
+            for i in range(len(nz_list)):
+                # 查看是否存在相同的nz词
+                same_nz_word = self.cut_compair(nz_list[0], nz_list[i])
+                if same_nz_word:
+                    # 执行聚合
+                    if same_nz_word not in group_dict.keys():
+                        # 首次聚合,与自身比较,创建自身类别
+                        group_dict[same_nz_word] = [nz_list[i]]
+                    else:
+                        group_dict[same_nz_word].append(nz_list[i])
+
+                    pos[i] = 0
+            # 处理完一趟就要变更nz_list
+            nz_list = np.array(nz_list)[np.array(pos, dtype=np.bool)].tolist()
+
+        # 聚合不包含nz的名词, 单独占一个类别
+        for item in no_nz_list:
+            group_dict[item] = [item]
+
+        with open(dict_level_1_path, 'w', encoding='utf-8') as f:
+            json.dump(group_dict, f, ensure_ascii=False)
+
+        return group_dict
+
+    @staticmethod
+    def field_align(input_str:str)->str:
+        """按照锡山中荷命名规范对齐字段,1#UF替换为UF1,1#RO替换为RO1,保持统一"""
+        sources_uf = re.findall(r'\d+#UF', input_str, re.IGNORECASE)  # 匹配1#UF
+        sources_ro = re.findall(r'\d+#RO', input_str, re.IGNORECASE)  # 匹配1#RO
+        sources = sources_uf + sources_ro
+        for sou in sources:
+            number_, flag_ = sou.split('#')
+            input_str = input_str.replace(sou, flag_.upper() + number_) # 统一转为大写
+        return input_str
+
+    @ staticmethod
+    def quicksort_up_part(arr:list, start:int, end:int)-> int:
+        """升序排序"""
+        # 双指针
+        low = start
+        high = end
+        pivot = arr[start][1] # 基准值
+        # 大数放在基准值右边,小数放在基准值左边
+        while low < high:
+            # 先从右向左找比基准值小的
+            while low< high and arr[high][1] >= pivot:
+                high -= 1
+            # 此时high指向值小于基准值,交换
+            if low < high:
+                arr[low], arr[high] = arr[high], arr[low]
+                low +=1
+            # 现在开始从左向右找,比基准值大的数
+            while low < high and arr[low][1] <= pivot:
+                low += 1
+            # 此时low指向值大于基准值,交换
+            if low < high:
+                arr[high], arr[low] = arr[low], arr[high]
+                high -= 1
+        return low
+
+
    def quicksort_up(self, arr:list, start:int, end:int):
        """In-place ascending quicksort of arr[start:end+1], keyed on each
        tuple's second element.

        NOTE(review): first-element pivot gives O(n) recursion depth on
        already-sorted input -- acceptable for small candidate lists, but
        confirm list sizes stay modest.
        """
        if start >= end:
            return
        # Partition once to place the pivot.
        mid = self.quicksort_up_part(arr, start, end)
        # Sort the left part.
        self.quicksort_up(arr, start, mid - 1)
        # Sort the right part.
        self.quicksort_up(arr, mid + 1, end)
+
+    def words_similarity_score_sorted(self, query:str, candidates:list)->list:
+        """计算输入语句与候选词的相似度并按照相似度分值进行排序"""
+        # 选择算法(示例使用Levenshtein,归一化到0-1)
+        candidates = candidates.copy()
+        jarowinkler = textdistance.JaroWinkler()
+        key_score_list = [(candidate, jarowinkler.normalized_similarity(query, candidate)) for candidate in candidates]
+        self.quicksort_up(key_score_list, 0, len(key_score_list) - 1)  # 升序排序
+        key_sorted_list = [tuple_element[0] for tuple_element in key_score_list]  # 取出key
+        key_sorted_list = key_sorted_list[::-1]  # 反转,变为降序
+        return key_sorted_list
+
    def words_similarity_score_sorted_v2(self, query:str, candidates:list)->list:
        """Rank `candidates` by descending relevance to `query` using the
        remote reranker model."""
        # The remote reranker scores every (query, candidate) pair.
        n = len(candidates)  # number of candidates
        group_query = [(query, i) for i in candidates]
        score = self.model.compute_score(group_query)
        key_score_list = [(candidates[i], score[i]) for i in range(n)]
        self.quicksort_up(key_score_list, 0, len(key_score_list) - 1)  # ascending sort
        key_sorted_list = [tuple_element[0] for tuple_element in key_score_list]  # keep the words only
        key_sorted_list = key_sorted_list[::-1]  # reverse to descending
        return key_sorted_list
+
+    def match_v2_on(self, promt: str,is_agent:bool=False):
+        """
+        模糊匹配v2
+        :param is_agent:
+        :param promt:
+        :return:
+        """
+        print("=" * 50)
+        # 命名风格转换
+        print("原始查询:", promt)
+        promt = promt.replace('超滤', 'UF').replace('反渗透', 'RO').replace('号', '#').replace('组', '#')
+        promt = self.field_align(promt)
+        print("转换查询:", promt)
+        # 输入分词
+        nz_words = []
+        for w, f in pseg.lcut(promt):
+            print(f'{w}({f})', end="")
+            if f == 'nz':
+                nz_words.append(w)
+        print('\n备查nz词:', nz_words)
+
+        # 处理专有名词的等价词,为了保证高召回率,我们将备查词的所有等价说法都放入备查序列
+        equivalent_words = []
+        for nz_idx, nz in enumerate(nz_words):
+            # 首先判断nz词是否在等价词汇表中,如果不在根本无法替换
+            if nz in self.dict_equivalent_wordmap.keys():
+                # 然后把等价的说法都添加进去就好了
+                equivalent_words = self.dict_equivalent_wordmap.get(nz, [])
+        if equivalent_words:
+            nz_words += equivalent_words
+            nz_words = list(set(nz_words))
+        print('等价备查nz词:', nz_words)
+        del equivalent_words
+
+        # 进行一级查询,根据nz词是否包含于词典
+        query_level_one = []
+        for i in range(len(nz_words)):  # 为第i个nz词进行初次匹配
+            result = []
+            # 如果nz词包含在一级词典中就算匹配成功
+            for dict_level_1_key in self.dict_level_1.keys():
+                if nz_words[i] in dict_level_1_key:  # 如果nz词包含在一级词典内
+                    result+= self.dict_level_1.get(dict_level_1_key)
+            query_level_one.append(result)  # 放入一级查询结果中
+
+        # 进行二级查询
+        query_level_two = []
+        for idx_nz, i_nz_query_result in enumerate(query_level_one):  # 遍历每个nz词的查询结果
+            result = []  # 为第i个nz词进行二次匹配
+            # 如果第i个nz词一级查询不为空
+            if i_nz_query_result: # 第i个nz词的查询结果list
+                for res_word_level_one in i_nz_query_result:
+                    if res_word_level_one in self.dict_level_2.keys():
+                        result += self.dict_level_2.get(res_word_level_one)  # self.dict_level_2的value本身就是字典,所以用+=拼接
+            # 虽然一级查询失败,但是并不意味着映射词典里没有,因为一级词典忽略英文。
+            else:  # 如果一级查询失败,就直接在name2code字典中查询
+                if nz_words[idx_nz] in self.name_2_code_dict.keys():# 如果第i个nz词在2级词典,就直接添加到结果中
+                    result.append(nz_words[idx_nz])
+            # 如果第i个nz词的一级查询结果为空,则添加空列表占位
+            query_level_two.append(result)
+
+        # 常规精确匹配结束,如果匹配成功,结构为二维列表,否则为空列表
+        matched_keys = query_level_two  # 获取已匹配的字段
+        # 备查词合并,我们约定所有备查词进行统一的查询,后面怎么用这些结果取决于外部的应用,对于agent模式,将会输出许多结果,对月非agent只会输出概率最高的结果
+        tem_matched_keys = []
+        for item in matched_keys:
+            tem_matched_keys += item
+        matched_keys = [list(set(tem_matched_keys))]
+        del tem_matched_keys
+
+        # 如果精确匹配失败,没有匹配到任何结果则按照语义进行模糊匹配,返回满足条件的置信度最高的结果
+        # if not nz_words or ([] in matched_keys):
+        # 比起手动维护词典,我们更相信语义相似度
+        top_k = 5
+        confi = 0.2 # 置信度阈值
+        print(f'进入模糊匹配,召回Top:{top_k} 置信度阈值:{confi}...')
+        # 调用远程bge-m3模型进行embedding
+        query_embedding = np.array(self.model.encode([promt], normalize=True), dtype=np.float32) # 要求query_embedding是一个二维矩阵,形状为(1, 1024)
+        distances, indices = self.knowledge.search(query_embedding, top_k)
+        group_query = [(promt, self.plc_database_name_template_list[indices[0][i]]) for i in range(top_k)]
+        # 我们更愿意相信bge,因此把词典关键词匹配的结果一并放进去重排序
+        group_query_manuel = [(promt, k) for keys in matched_keys for k in keys]
+        group_query += group_query_manuel
+        del group_query_manuel
+        group_query = list(set(group_query))  # 去重
+        # 调用远程bge-reranker模型
+        score = self.model.compute_score(group_query)
+        rerank_result = sorted([(group_query[i][1], score[i]) for i in range(len(group_query))], key=lambda x: x[1], reverse=True)
+        print(F'打印前top{top_k}候选词结果:', rerank_result[:top_k])
+        print(f'首元素模糊匹配到{rerank_result[0][0]}, 置信度为{rerank_result[0][1]}')
+        # matched_keys 为最终结果,保持形状为二维列表
+        matched_keys = [[i[0] for i in rerank_result]]
+        # 每个匹配结果的置信度
+        matched_keys_score = [[i[1] for i in rerank_result]]
+
+        # 为结果创建映射字典
+        result_list = []
+        for i_nz_keys in matched_keys:
+            result_list.append([{key: self.name_2_code_dict.get(key)} for key in i_nz_keys])
+        print(f"查询到{len([_ for _ in result_list if _])}个结果:")
+
+        if not is_agent:
+            # 非agent模式每个匹配结果只取第一个元素的英文
+            tem_list = []
+            for res in result_list:
+                if res:
+                    for k, v in res[0].items():  # 每个nz词的查询结果都是一个list,每个list可能包含多个字典
+                        tem_list.append(f'{k}:{v}')
+            result_list = tem_list
+            print('以非agent模式返回:', result_list)
+            return result_list
+
+        print('以agent模式返回:', result_list)
+        print('='*50)
+        return result_list, matched_keys_score
+
if __name__ == '__main__':
    # Smoke test: build the matcher for one plant and run a sample query.
    # NOTE(review): requires the plant's dictionary files and access to the
    # remote BGE model service.
    pj = 92  # plant id whose PLC points are queried
    pcl_helper = PLCMatch(project_id=pj)
    # Example user input.
    my_promt = "我想要查询锡山中荷进水电导率"
    # query_res = pcl_helper.match_v2_on(my_promt, is_agent=True)
    query_res = pcl_helper.match_v2_on(my_promt, is_agent=False)

    pass
+
+
+

+ 30 - 0
item_del_tool.py

@@ -0,0 +1,30 @@
import pandas as pd

# One-off cleaning tool: drop control/alarm-style points from a plant's raw
# PLC point spreadsheet and write the cleaned copy next to it.
project_id = 1450
xlsx_file = f'./plc点位原始文件/{project_id}_点位_原始.xlsx'
df_xlsx = pd.read_excel(xlsx_file)
# Any row whose item_alias contains one of these keywords is discarded.
del_list = [
    '相电压', '相电流', '启动操作', '停止操作',
    '备用', '中间值,不用读', '报警', '联动标志', '排水显示',
    '校准开关', '功率因数', '开关', '设定', '是否', '起泵', '停泵',
    '高限设置', '低限设置', '手自动', '关操作', '关到位', '开操作', '开到位', '打开/关闭', '远程',
    '运行电流', '不确定', '未知', '复位', '需要',
    '准备好', '信息清除', '提醒字', '控制字', '设置频率',
    '循环', '启停', '故障字', '投入/切除', '定频控制', '控制模式', '选择', '开机/停止', '杀菌步序',
    '加药阀', '搅拌器', '定频率', '设置', '手动/自动', '按钮',
    '跳转', '紧急', '启动停止', '申请', '应答', '允许', '排队', '启动/停止', '打开', '关闭',
    '手动开', '手动关', '自动关闭', '手动启动', '自动启动', '手动停止', '泵启动',
    '入栈', '出栈', '按键', '信号类型', 'flag', 'Flag', 'FLAG',
]
# Keep only the rows whose alias contains none of the keywords.
reserve_idx = [
    index
    for index, column in df_xlsx.iterrows()
    if not any(key_word in column.loc['item_alias'] for key_word in del_list)
]
new_df = df_xlsx.iloc[reserve_idx, :].reset_index(drop=True)
# Write the cleaned spreadsheet (same name without the _原始 suffix).
new_df.to_excel(xlsx_file.replace('_原始.', '.'), index=False)

Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1181_plc_dictionary/1181_dict_level_1.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1181_plc_dictionary/1181_dict_level_2.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1181_plc_dictionary/1181_dict_name_2_code.json


BIN
plc_dictionary/1181_plc_dictionary/1181_knowledge.faiss


BIN
plc_dictionary/1181_plc_dictionary/1181_点位.xlsx


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1202_plc_dictionary/1202_dict_level_1.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1202_plc_dictionary/1202_dict_level_2.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1202_plc_dictionary/1202_dict_name_2_code.json


BIN
plc_dictionary/1202_plc_dictionary/1202_knowledge.faiss


BIN
plc_dictionary/1202_plc_dictionary/1202_点位.xlsx


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1450_plc_dictionary/1450_dict_level_1.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1450_plc_dictionary/1450_dict_level_2.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/1450_plc_dictionary/1450_dict_name_2_code.json


BIN
plc_dictionary/1450_plc_dictionary/1450_knowledge.faiss


BIN
plc_dictionary/1450_plc_dictionary/1450_点位.xlsx


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/92_plc_dictionary/92_dict_level_1.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/92_plc_dictionary/92_dict_level_2.json


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
plc_dictionary/92_plc_dictionary/92_dict_name_2_code.json


BIN
plc_dictionary/92_plc_dictionary/92_knowledge.faiss


BIN
plc_dictionary/92_plc_dictionary/92_点位.xlsx


+ 335 - 0
plclib.py

@@ -0,0 +1,335 @@
+import os
+script_dir = os.path.dirname(os.path.abspath(__file__))
+import sys
+sys.path.append(script_dir)
+import jieba
+import jieba.posseg as pseg
+import re
+import os
+import json
+import textdistance
+import warnings
+import numpy as np
+import faiss
+from remote_model import RemoteBGEModel
+
+
+class PLCLib:
+
    def __init__(self):
        """Runtime matcher that only loads pre-built dictionary artefacts
        (PLCMatch in create_pcl_match_dictionary.py is the builder).
        Original note: per-plant caching is still to be implemented.
        """
        # Per-plant state; populated by load().
        self.project_id = None
        self.plc_dict_root_dir = None
        self.name_2_code_dict = None
        self.plc_database_name_template_list = None
        self.dict_equivalent_wordmap = None
        self.dict_level_2 = None
        self.dict_level_1 = None
        self.user_dict_list = None
        self.knowledge = None

        # Remote bge-m3 / bge-reranker models.
        self.model = RemoteBGEModel('dev')

        # Load the user dictionaries into jieba; independent of the plant id.
        self.script_dir = os.path.dirname(os.path.abspath(__file__)) # script's absolute path
        user_dictionary_dir = os.path.join(self.script_dir, 'user_maintain_dictionary', 'jieba_words')
        if not os.path.exists(user_dictionary_dir):
            warnings.warn(f'用户分词词典不存在,严重影响匹配成功率,请检查路径{user_dictionary_dir}是否存在!', UserWarning)
        else:
            self.user_dict_list = [os.path.join(user_dictionary_dir, _) for _ in os.listdir(user_dictionary_dir) if _.split('.')[-1] == 'txt']  # user dictionaries
            self.__load_user_dict()
+
+
    def load(self, project_id):
        """Load every pre-built dictionary and the FAISS index for one plant.

        Must be called before querying; each loader raises FileNotFoundError
        when its cached artefact is missing (they are produced by the
        builder script, not here).
        """
        self.project_id = project_id
        self.plc_dict_root_dir = os.path.join(self.script_dir, 'plc_dictionary',f'{self.project_id}_plc_dictionary')
        # name -> PLC code mapping.
        self.name_2_code_dict = self.__read_pcl()
        self.plc_database_name_template_list = list(self.name_2_code_dict.keys())
        # Equivalent-word (synonym) table.
        self.dict_equivalent_wordmap = self.__construct_equivalent_wordmap()
        # Level-2 dictionary.
        self.dict_level_2 =self.__make_level_two_dictionary()
        # Level-1 dictionary.
        self.dict_level_1 = self.__make_level_one_dictionary()
        # Local FAISS knowledge base.
        self.knowledge = self.__load_faiss_database()
+
+    def __load_faiss_database(self):
+        """从本地加载向量数据库"""
+        # 水厂的数据库字段知识库
+        faiss_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_knowledge.faiss')
+        # 尝试从本地加载
+        if os.path.exists(faiss_path):
+            print('PLC点位查询功能从本地加载点位字段向量知识库...')
+            local_faiss = faiss.read_index(faiss_path)
+        else:
+            raise FileNotFoundError('file not found!', faiss_path)
+        return local_faiss
+
+
+    @staticmethod
+    def field_align(input_str:str)->str:
+        """按照锡山中荷命名规范对齐字段,1#UF替换为UF1,1#RO替换为RO1,保持统一"""
+        sources_uf = re.findall(r'\d+#UF', input_str, re.IGNORECASE)  # 匹配1#UF
+        sources_ro = re.findall(r'\d+#RO', input_str, re.IGNORECASE)  # 匹配1#RO
+        sources = sources_uf + sources_ro
+        for sou in sources:
+            number_, flag_ = sou.split('#')
+            input_str = input_str.replace(sou, flag_.upper() + number_) # 统一转为大写
+        return input_str
+
+    def __construct_equivalent_wordmap(self):
+        """构建等价词汇映射表"""
+        # 检查文件是否存在
+        equivalent_wordmap_path = os.path.join(self.script_dir, 'user_maintain_dictionary','equivalent_words', 'dict_equivalent_wordmap.json')
+        if os.path.exists(equivalent_wordmap_path):
+            with open(equivalent_wordmap_path, 'r', encoding='utf-8') as f:
+                equivalent_wordmap = json.load(f)
+        else:
+            raise FileNotFoundError('file not found!', equivalent_wordmap_path)
+        return equivalent_wordmap
+
+    def __make_level_one_dictionary(self):
+        """创建一级字典"""
+        # 尝试从本地加载一级字典
+        dict_level_1_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_dict_level_1.json')
+        if os.path.exists(dict_level_1_path):
+            with open(dict_level_1_path, 'r', encoding='utf-8') as f:
+                group_dict = json.load(f)
+        else:
+            raise FileNotFoundError('file not found!', dict_level_1_path)
+        return group_dict
+
+    def __make_level_two_dictionary(self):
+        """创建二级字典,对点位所有字段进行正则匹配中文,将中文一样的字段聚合为同一个字典键值对,键为正则提取的中文字符"""
+        # 尝试从本地加载二级字典
+        dict_level2_dict_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_dict_level_2.json')
+        if os.path.exists(dict_level2_dict_path):
+            with open(dict_level2_dict_path, 'r', encoding='utf-8') as f:
+                group_dict = json.load(f)
+        else:
+            raise FileNotFoundError('file not found!', dict_level2_dict_path)
+        return group_dict
+
+    def __read_pcl(self):
+        """
+        读取pcl文件,生成name2code词典
+        :return:
+        """
+        # 尝试从本地加载name-code映射字典
+        dict_name2code_path = os.path.join(self.plc_dict_root_dir, f'{self.project_id}_dict_name_2_code.json')
+        if os.path.exists(dict_name2code_path):
+            with open(dict_name2code_path, 'r', encoding='utf-8') as f:
+                dict_name2code = json.load(f)
+        else:
+            raise FileNotFoundError('file not found!', dict_name2code_path)
+        return dict_name2code
+
+    def __load_user_dict(self):
+        """加载用户词典,添加到jieba词库"""
+        # 删除
+        jieba.del_word('反渗透')
+        jieba.del_word('超滤')
+        for user_dict_txt in self.user_dict_list:
+            # # 检查文件是否存在
+            # if not os.path.exists(user_dict_txt):
+            #     raise FileNotFoundError(f'{user_dict_txt} does not exist')
+            # # 检查文件后缀名是否合法
+            # if os.path.splitext(user_dict_txt)[1] != '.txt':
+            #     continue
+            # 分词库加载用户字典
+            jieba.load_userdict(user_dict_txt)
+    @ staticmethod
+    def quicksort_up_part(arr:list, start:int, end:int)-> int:
+        """升序排序"""
+        # 双指针
+        low = start
+        high = end
+        pivot = arr[start][1] # 基准值
+        # 大数放在基准值右边,小数放在基准值左边
+        while low < high:
+            # 先从右向左找比基准值小的
+            while low< high and arr[high][1] >= pivot:
+                high -= 1
+            # 此时high指向值小于基准值,交换
+            if low < high:
+                arr[low], arr[high] = arr[high], arr[low]
+                low +=1
+            # 现在开始从左向右找,比基准值大的数
+            while low < high and arr[low][1] <= pivot:
+                low += 1
+            # 此时low指向值大于基准值,交换
+            if low < high:
+                arr[high], arr[low] = arr[low], arr[high]
+                high -= 1
+        return low
    def quicksort_up(self, arr:list, start:int, end:int):
        """In-place ascending quicksort of arr[start:end+1], keyed on each
        tuple's second element."""
        if start >= end:
            return
        # Partition once to place the pivot.
        mid = self.quicksort_up_part(arr, start, end)
        # Sort the left part.
        self.quicksort_up(arr, start, mid - 1)
        # Sort the right part.
        self.quicksort_up(arr, mid + 1, end)
+    def words_similarity_score_sorted(self, query:str, candidates:list)->list:
+        """计算输入语句与候选词的相似度并按照相似度分值进行排序"""
+        # 选择算法(示例使用Levenshtein,归一化到0-1)
+        candidates = candidates.copy()
+        jarowinkler = textdistance.JaroWinkler()
+        key_score_list = [(candidate, jarowinkler.normalized_similarity(query, candidate)) for candidate in candidates]
+        self.quicksort_up(key_score_list, 0, len(key_score_list) - 1)  # 升序排序
+        key_sorted_list = [tuple_element[0] for tuple_element in key_score_list]  # 取出key
+        key_sorted_list = key_sorted_list[::-1]  # 反转,变为降序
+        return key_sorted_list
+
+    def words_similarity_score_sorted_v2(self, query:str, candidates:list)->list:
+        """通过rerank的方式为候选词进行相似度排序"""
+        # 调用远程reranker模型
+        n = len(candidates)  # 候选词数量
+        group_query = [(query, i) for i in candidates]
+        score = self.model.compute_score(group_query)
+        key_score_list = [(candidates[i], score[i]) for i in range(n)]
+        self.quicksort_up(key_score_list, 0, len(key_score_list) - 1)  # 升序排序
+        key_sorted_list = [tuple_element[0] for tuple_element in key_score_list]  # 取出key
+        key_sorted_list = key_sorted_list[::-1]  # 反转,变为降序
+        return key_sorted_list
+
    def query(self, promt, is_agent:bool=False):
        """Match a natural-language query against PLC point names (copied from PLCMatch_match_v2_on)."""
        """
        Fuzzy matching, v2.
        :param is_agent: when True return all candidates plus confidences; otherwise only top codes
        :param promt: the user's natural-language query (note: parameter name is a typo of "prompt")
        :return: see the bottom of this function for the two return shapes
        """
        print("=" * 50)
        # Normalize naming conventions (Chinese terms -> PLC-style abbreviations).
        print("原始查询:", promt)
        promt = promt.replace('超滤', 'UF').replace('反渗透', 'RO').replace('号', '#').replace('组', '#')
        promt = self.field_align(promt)
        print("转换查询:", promt)
        # Segment the query; keep only the 'nz' (proper-noun) tokens as lookup words.
        nz_words = []
        for w, f in pseg.lcut(promt):
            print(f'{w}({f})', end="")
            if f == 'nz':
                nz_words.append(w)
        print('\n备查nz词:', nz_words)

        # Expand proper nouns with their equivalent phrasings: to keep recall high,
        # every equivalent wording of a lookup word joins the lookup list.
        equivalent_words = []
        for nz_idx, nz in enumerate(nz_words):
            # Only words present in the equivalence map can be expanded at all.
            if nz in self.dict_equivalent_wordmap.keys():
                # NOTE(review): plain assignment overwrites on every hit, so only the
                # LAST matching nz word's equivalents survive — confirm whether
                # `+=` was intended here.
                equivalent_words = self.dict_equivalent_wordmap.get(nz, [])
        if equivalent_words:
            nz_words += equivalent_words
            nz_words = list(set(nz_words))
        print('等价备查nz词:', nz_words)
        del equivalent_words

        # Level-1 lookup: an nz word matches if it is a substring of a level-1 key.
        query_level_one = []
        for i in range(len(nz_words)):  # first-pass match for the i-th nz word
            result = []
            # Containment in a level-1 dictionary key counts as a match.
            for dict_level_1_key in self.dict_level_1.keys():
                if nz_words[i] in dict_level_1_key:  # nz word contained in a level-1 key
                    result+= self.dict_level_1.get(dict_level_1_key)
            query_level_one.append(result)  # collect the level-1 result

        # Level-2 lookup.
        query_level_two = []
        for idx_nz, i_nz_query_result in enumerate(query_level_one):  # per-nz-word level-1 results
            result = []  # second-pass match for the i-th nz word
            # Level-1 lookup succeeded for this nz word.
            if i_nz_query_result: # list of level-1 hits for this nz word
                for res_word_level_one in i_nz_query_result:
                    if res_word_level_one in self.dict_level_2.keys():
                        result += self.dict_level_2.get(res_word_level_one)  # dict_level_2 values are collections, concatenated with +=
            # A level-1 miss does not mean the mapping dictionary lacks the word,
            # because the level-1 dictionary ignores English.
            else:  # fall back to a direct lookup in the name2code dictionary
                if nz_words[idx_nz] in self.name_2_code_dict.keys():# the nz word is itself a level-2 key
                    result.append(nz_words[idx_nz])
            # Append (possibly empty) result as a placeholder for this nz word.
            query_level_two.append(result)

        # Exact matching done: a 2-D list on success, an empty list otherwise.
        matched_keys = query_level_two  # fields matched so far
        # Merge all candidate words into one joint lookup; how the results are used is
        # up to the caller: agent mode returns many results, non-agent mode only the
        # highest-confidence one.
        tem_matched_keys = []
        for item in matched_keys:
            tem_matched_keys += item
        matched_keys = [list(set(tem_matched_keys))]
        del tem_matched_keys

        # If exact matching found nothing, fall back to semantic fuzzy matching and
        # return the highest-confidence results.
        # if not nz_words or ([] in matched_keys):
        # Semantic similarity is trusted over the hand-maintained dictionary.
        top_k = 5
        confi = 0.2 # confidence threshold
        print(f'进入模糊匹配,召回Top:{top_k} 置信度阈值:{confi}...')
        # Embed the query with the remote bge-m3 model.
        query_embedding = np.array(self.model.encode([promt], normalize=True), dtype=np.float32) # query_embedding must be a 2-D matrix of shape (1, 1024)
        distances, indices = self.knowledge.search(query_embedding, top_k)
        group_query = [(promt, self.plc_database_name_template_list[indices[0][i]]) for i in range(top_k)]
        # bge is trusted more, so the dictionary keyword matches are thrown into the
        # rerank pool alongside the vector-recall candidates.
        group_query_manuel = [(promt, k) for keys in matched_keys for k in keys]
        group_query += group_query_manuel
        del group_query_manuel
        group_query = list(set(group_query))  # de-duplicate
        # Score all candidates with the remote bge-reranker model.
        score = self.model.compute_score(group_query)
        rerank_result = sorted([(group_query[i][1], score[i]) for i in range(len(group_query))], key=lambda x: x[1], reverse=True)
        print(F'打印前top{top_k}候选词结果:', rerank_result[:top_k])
        print(f'首元素模糊匹配到{rerank_result[0][0]}, 置信度为{rerank_result[0][1]}')
        # matched_keys is the final result, kept as a 2-D list.
        matched_keys = [[i[0] for i in rerank_result]]
        # Confidence score for every matched key, same shape as matched_keys.
        matched_keys_score = [[i[1] for i in rerank_result]]

        # Build the {name: code} mapping for each result.
        result_list = []
        for i_nz_keys in matched_keys:
            result_list.append([{key: self.name_2_code_dict.get(key)} for key in i_nz_keys])
        print(f"查询到{len([_ for _ in result_list if _])}个结果:")

        if not is_agent:
            # Non-agent mode keeps only the first element's name:code string per match.
            tem_list = []
            for res in result_list:
                if res:
                    for k, v in res[0].items():  # each result is a list that may hold several single-entry dicts
                        tem_list.append(f'{k}:{v}')
            result_list = tem_list
            print('以非agent模式返回:', result_list)
            return result_list

        print('以agent模式返回:', result_list)
        print('='*50)
        return result_list, matched_keys_score
+
# Step 1: instantiate once at import time (module-level singleton).
helper = PLCLib()
+
if __name__ == '__main__':
    # demo
    # Step 2: load the database for a given water plant (plant id 92 here).
    helper.load(92)
    # Step 3: match a query against the loaded plant's points.
    # helper.query("查询RO1回收率、RO2回收率、...")
    helper.query("查询中荷水厂产水电导率", is_agent=False)
    # Agent mode
    # Output format: list, [RO1 recovery result, RO2 recovery result, ...]
    # RO1 recovery result: list, [{'RO1回收率': 'RO1HSL'}]
    # RO2 recovery result: list, [{'RO2回收率': 'RO2HSL'}]
    # ...
    # Full result format: [[{'RO1回收率': 'RO1HSL'}], [{'RO2回收率': 'RO2HSL'}]]
    # Non-agent mode takes the first element of each result and returns the English code directly.
    #

BIN
plc点位原始文件/1181_点位_原始.xlsx


BIN
plc点位原始文件/1202_点位_原始.xlsx


BIN
plc点位原始文件/1450_点位_原始.xlsx


BIN
plc点位原始文件/92_点位_原始.xlsx


+ 10 - 0
readme

@@ -0,0 +1,10 @@
+create_level_query_dict.py 创建二级查询字典
+item_del_tool.py 字段剔除工具
+user_dictionary.txt 用户分词词典
+user_level_1.json 1级词典,一轮查询
+user_level_2.json 2级词典,二轮查询
+
+注意:如果数据库字段中包含用户词典中的词,那么……(TODO:此句原文未写完,待补充说明)
+
+
+version:2025年11月7日15点12分

+ 132 - 0
remote_model.py

@@ -0,0 +1,132 @@
+# version: 2025.12.04
+import requests
+from typing import List, Tuple, Optional
+import os
+import json
+import time
+import numpy as np
+from FlagEmbedding import FlagAutoModel, FlagReranker
+script_dir = os.path.dirname(os.path.abspath(__file__))
+
class RemoteBGEModel:
    """Thin HTTP client for the remote BGE embedding (bge-m3) and reranker services.

    Service URLs are read from url_config.json next to this file and selected
    by the deployment branch ('dev'/'test', 'main'/'master', or 'local').
    """

    def __init__(self, branch: str = 'dev', timeout: int = 3, max_retries: int = 3):
        """
        :param branch: deployment branch selecting which URLs to use
        :param timeout: per-request timeout in seconds
        :param max_retries: number of attempts before giving up on a request
        :raises ValueError: on an unknown branch name
        """
        self.branch = branch.strip().lower()
        if self.branch not in ('dev', 'test', 'master', 'main', 'local'):
            # BUG FIX: the old message omitted 'local', which is accepted.
            raise ValueError("Param 'branch' must be dev, test, master, main or local", branch)

        self.url_file = os.path.join(script_dir, 'url_config.json')
        self.embedding_url, self.reranker_url = self.load_url()
        self.timeout = timeout
        self.max_retries = max_retries
        # Shared request headers for both services.
        self.headers = {"Content-Type": "application/json"}

    def load_url(self):
        """Read url_config.json and return (embed_url, rerank_url) for the branch.

        :raises FileNotFoundError: when the config file is missing
        """
        if not os.path.exists(self.url_file):
            raise FileNotFoundError("File not exist", self.url_file)
        with open(self.url_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        # Map the branch to the config-key prefix.
        if self.branch in ('dev', 'test'):
            prefix = 'dev'
        elif self.branch in ('main', 'master'):
            prefix = 'master'
        else:
            prefix = 'local'
        embed_url = json_data[prefix + '_embed_url'] + '/embed'
        rerank_url = json_data[prefix + '_reranker_url'] + '/rerank'
        return embed_url, rerank_url

    def _access_remote_model(self, url: str, data: dict):
        """POST *data* to *url* with retries; return the JSON payload as ndarray, or None.

        Retries up to self.max_retries times on network errors and non-200
        responses, sleeping 1 second between attempts.
        """
        time.sleep(0.08)  # throttle to avoid hammering the remote service
        for attempt in range(self.max_retries):
            try:
                # BUG FIX: self.timeout was stored but never passed to requests.post,
                # so requests could hang indefinitely.
                response = requests.post(url=url, headers=self.headers, json=data,
                                         timeout=self.timeout)
                if response.status_code == 200:
                    return np.array(response.json())
            except Exception as e:
                # BUG FIX: the original returned None inside this handler, so
                # max_retries never had any effect after the first exception.
                print('请求embedding模型失败', e)
            if attempt + 1 < self.max_retries:
                time.sleep(1)  # back off before the next attempt
        return None

    def encode(self, texts: List[str], normalize: bool = True):
        """Embed *texts* with the remote bge-m3 model.

        :param texts: a string or a non-empty list of strings
        :param normalize: whether the service should L2-normalize embeddings
        :return: ndarray of embeddings, or None when the request failed
        :raises TypeError: texts is neither a list nor a string
        :raises ValueError: texts is an empty list or contains non-strings
        """
        if not isinstance(texts, (list, str)):
            raise TypeError("Text must be list or string", texts)
        if isinstance(texts, list):
            if not texts:
                raise ValueError("Text must not be empty", texts)
            for i, content in enumerate(texts):
                if not isinstance(content, str):
                    # BUG FIX: the old message said "must not be empty" for a type error.
                    raise ValueError(f"Text element must be a string, pos:{i}, content:{content}")
        data = {"inputs": texts, "normalize": normalize}

        return self._access_remote_model(
            url=self.embedding_url,
            data=data
        )

    def compute_score(self, pairs: List[Tuple[str, str]]):
        """Score (query, text) pairs with the remote bge-reranker, preserving input order.

        All pairs must share the same query string.
        :return: list of float scores aligned with *pairs*
        :raises TypeError: pairs is not a list, or a pair element is not a str
        :raises ValueError: pairs is empty, malformed, or queries differ
        """
        if not isinstance(pairs, list):
            raise TypeError("Pairs must be list", pairs)
        if not pairs:
            raise ValueError("Pairs must not be empty", pairs)
        if len(pairs[0]) != 2:
            # BUG FIX: the original reused the "must not be empty" message here.
            raise ValueError("Every pair must have exactly two elements", pairs)
        for i, (q, t) in enumerate(pairs):
            if not isinstance(q, str) or not isinstance(t, str):
                # BUG FIX: the old message was inverted ("must not be str").
                raise TypeError(f"Elements of every pair must be str, pos:{i}, ({q}, {t})")
        # BUG FIX: the original index arithmetic only compared the first three
        # queries; verify that every pair carries the same query instead.
        query = pairs[0][0]
        if any(q != query for q, _ in pairs):
            raise ValueError("Pairs must have the same query", pairs)
        texts = [t for _, t in pairs]
        data = {
            "query": query,  # the bge-reranker accepts an empty query field
            "texts": texts
        }

        # Fetch the rerank result.
        res = self._access_remote_model(
            url=self.reranker_url,
            data=data
        )
        # Restore the original input order before extracting the scores.
        score = [item["score"] for item in sorted(res, key=lambda x: x["index"])]
        return score
+
+
+if __name__ == "__main__":
+    timeout = 3
+    max_retries = 3
+    bge_model = RemoteBGEModel('dev', timeout, max_retries)
+    t = bge_model.encode(["hello"], normalize=True)
+    tt = bge_model.compute_score([("你好呀我的名字叫做汤姆","今天世界杯中国得了冠军"),
+                                  ("你好呀我的名字叫做汤姆","你好呀我的名字叫做山姆"),
+                                  ("你好呀我的名字叫做汤姆","你好呀我的名字叫做汤姆?"),
+                                  ("你好呀我的名字叫做汤姆","我今天非常的开心,你呢?")])
+    # reranker = FlagReranker(os.path.join(script_dir, 'bge-reranker-v2-m3'), use_fp16=True, local_files_only=True,
+    #                              devices=["cuda:0"])
+    # ttt = reranker.compute_score([("你好呀我的名字叫做汤姆","今天世界杯中国得了冠军"),
+    #                               ("你好呀我的名字叫做汤姆","你好呀我的名字叫做山姆"),
+    #                               ("你好呀我的名字叫做汤姆","你好呀我的名字叫做汤姆?"),
+    #                               ("你好呀我的名字叫做汤姆","我今天非常的开心,你呢?")], normalize=True)
+    pass

+ 8 - 0
url_config.json

@@ -0,0 +1,8 @@
+{
+  "dev_embed_url": "http://101.200.76.30:8002",
+  "dev_reranker_url": "http://101.200.76.30:8003",
+  "master_embed_url": "http://101.200.76.30:8002",
+  "master_reranker_url": "http://101.200.76.30:8003",
+  "local_embed_url": "http://101.200.76.30:8002",
+  "local_reranker_url": "http://101.200.76.30:8003"
+}

+ 1 - 0
user_maintain_dictionary/equivalent_words/dict_equivalent_wordmap.json

@@ -0,0 +1 @@
+{"总回收率": ["总回收率", "回收率"], "回收率": ["总回收率", "回收率"], "总进水量": ["总进水量", "进水量"], "进水量": ["总进水量", "进水量"], "总产水电导": ["总产水电导", "产水电导"], "产水电导": ["总产水电导", "产水电导"], "总进水电导": ["总进水电导", "进水电导"], "进水电导": ["总进水电导", "进水电导"], "总产水压力": ["总产水压力", "产水压力"], "产水压力": ["总产水压力", "产水压力"], "总产水流量": ["总产水流量", "产水流量"], "产水流量": ["总产水流量", "产水流量"], "总进水流量": ["总进水流量", "进水流量"], "进水流量": ["总进水流量", "进水流量"], "电导": ["电导", "电导率"], "电导率": ["电导", "电导率"], "": [""]}

+ 9 - 0
user_maintain_dictionary/equivalent_words/equivalent_wordmap.txt

@@ -0,0 +1,9 @@
+总回收率=回收率
+总进水量=进水量
+总产水电导=产水电导
+总进水电导=进水电导
+总产水压力=产水压力
+总产水流量=产水流量
+总进水流量=进水流量
+电导=电导率
+

+ 40 - 0
user_maintain_dictionary/jieba_words/A综合.txt

@@ -0,0 +1,40 @@
+ph 1000 nz
+CIP 1000 nz
+液位 1000 nz
+温度 1000 nz
+UF进水浊度 1000 nz
+UF产水浊度 1000 nz
+
+供水泵 1000 nz
+UF泵 1000 nz
+反洗泵 1000 nz
+UF反洗水泵 1000 nz
+清水外供泵 1000 nz
+清洗水泵 1000 nz
+高压泵 1000 nz
+加药泵 1000 nz
+段间泵 1000 nz
+卸料泵 1000 nz
+
+反洗膜通量 1000 nz
+清洗膜通量 1000 nz
+
+自清洗过滤器 1000 nz
+脱碳风机 1000 nz
+
+还原剂 1000 nz
+阻垢剂 1000 nz
+絮凝剂 1000 nz
+盐酸 1000 nz
+
+清水池 1000 nz
+中和池 1000 nz
+
+水温校正因子 1000 nz
+
+外供水 1000 nz
+UF总产水 1000 nz
+进水池 1000 nz
+反洗水池 1000 nz
+中荷废水 1000 nz
+

+ 9 - 0
user_maintain_dictionary/jieba_words/B产水率.txt

@@ -0,0 +1,9 @@
+产水率 1000 nz
+RO1产水率 1000 nz
+RO2产水率 1000 nz
+RO3产水率 1000 nz
+RO4产水率 1000 nz
+RO5产水率 1000 nz
+RO6产水率 1000 nz
+RO7产水率 1000 nz
+RO8产水率 1000 nz

+ 40 - 0
user_maintain_dictionary/jieba_words/B回收率.txt

@@ -0,0 +1,40 @@
+回收率 1000 nz
+产水率 1000 nz
+运行回收率 1000 nz
+RO回收率 1000 nz
+RO总回收率 1000 nz
+UF回收率 1000 nz
+UF总回收率 1000 nz
+RO1回收率 1000 nz
+RO2回收率 1000 nz
+RO3回收率 1000 nz
+RO4回收率 1000 nz
+RO5回收率 1000 nz
+RO6回收率 1000 nz
+RO7回收率 1000 nz
+RO8回收率 1000 nz
+RO1运行回收率 1000 nz
+RO2运行回收率 1000 nz
+RO3运行回收率 1000 nz
+RO4运行回收率 1000 nz
+RO5运行回收率 1000 nz
+RO6运行回收率 1000 nz
+RO7运行回收率 1000 nz
+RO8运行回收率 1000 nz
+UF1回收率 1000 nz
+UF2回收率 1000 nz
+UF3回收率 1000 nz
+UF4回收率 1000 nz
+UF5回收率 1000 nz
+UF6回收率 1000 nz
+UF7回收率 1000 nz
+UF8回收率 1000 nz
+UF1运行回收率 1000 nz
+UF2运行回收率 1000 nz
+UF3运行回收率 1000 nz
+UF4运行回收率 1000 nz
+UF5运行回收率 1000 nz
+UF6运行回收率 1000 nz
+UF7运行回收率 1000 nz
+UF8运行回收率 1000 nz
+

+ 15 - 0
user_maintain_dictionary/jieba_words/B电导.txt

@@ -0,0 +1,15 @@
+电导 1000 nz
+电导率 1000 nz
+产水电导 1000 nz
+RO1产水电导 1000 nz
+RO2产水电导 1000 nz
+RO3产水电导 1000 nz
+RO4产水电导 1000 nz
+RO5产水电导 1000 nz
+RO6产水电导 1000 nz
+RO7产水电导 1000 nz
+RO8产水电导 1000 nz
+RO总产水电导 1000 nz
+RO产水电导 1000 nz
+RO总进水电导 1000 nz
+RO进水电导 1000 nz

+ 5 - 0
user_maintain_dictionary/jieba_words/B脱盐率.txt

@@ -0,0 +1,5 @@
+脱盐率 1000 nz
+RO1脱盐率 1000 nz
+RO2脱盐率 1000 nz
+RO3脱盐率 1000 nz
+RO4脱盐率 1000 nz

+ 18 - 0
user_maintain_dictionary/jieba_words/C膜渗透率.txt

@@ -0,0 +1,18 @@
+膜渗透率 1000 nz
+渗透率 1000 nz
+UF1膜渗透率 1000 nz
+UF2膜渗透率 1000 nz
+UF3膜渗透率 1000 nz
+UF4膜渗透率 1000 nz
+UF5膜渗透率 1000 nz
+UF6膜渗透率 1000 nz
+UF7膜渗透率 1000 nz
+UF8膜渗透率 1000 nz
+UF1渗透率 1000 nz
+UF2渗透率 1000 nz
+UF3渗透率 1000 nz
+UF4渗透率 1000 nz
+UF5渗透率 1000 nz
+UF6渗透率 1000 nz
+UF7渗透率 1000 nz
+UF8渗透率 1000 nz

+ 53 - 0
user_maintain_dictionary/jieba_words/C膜通量.txt

@@ -0,0 +1,53 @@
+膜通量 1000 nz
+膜运行通量 1000 nz
+UF1膜运行通量 1000 nz
+UF2膜运行通量 1000 nz
+UF3膜运行通量 1000 nz
+UF4膜运行通量 1000 nz
+UF5膜运行通量 1000 nz
+UF6膜运行通量 1000 nz
+UF7膜运行通量 1000 nz
+UF8膜运行通量 1000 nz
+RO1膜运行通量 1000 nz
+RO2膜运行通量 1000 nz
+RO3膜运行通量 1000 nz
+RO4膜运行通量 1000 nz
+RO5膜运行通量 1000 nz
+RO6膜运行通量 1000 nz
+RO7膜运行通量 1000 nz
+RO8膜运行通量 1000 nz
+通量 1000 nz
+UF1通量 1000 nz
+UF2通量 1000 nz
+UF3通量 1000 nz
+UF4通量 1000 nz
+UF5通量 1000 nz
+UF6通量 1000 nz
+UF7通量 1000 nz
+UF8通量 1000 nz
+RO1通量 1000 nz
+RO2通量 1000 nz
+RO3通量 1000 nz
+RO4通量 1000 nz
+RO5通量 1000 nz
+RO6通量 1000 nz
+RO7通量 1000 nz
+RO8通量 1000 nz
+UF1膜通量 1000 nz
+UF2膜通量 1000 nz
+UF3膜通量 1000 nz
+UF4膜通量 1000 nz
+UF5膜通量 1000 nz
+UF6膜通量 1000 nz
+UF7膜通量 1000 nz
+UF8膜通量 1000 nz
+RO1膜通量 1000 nz
+RO2膜通量 1000 nz
+RO3膜通量 1000 nz
+RO4膜通量 1000 nz
+RO5膜通量 1000 nz
+RO6膜通量 1000 nz
+RO7膜通量 1000 nz
+RO8膜通量 1000 nz
+
+

+ 9 - 0
user_maintain_dictionary/jieba_words/C跨膜压差.txt

@@ -0,0 +1,9 @@
+跨膜压差 1000 nz
+UF1跨膜压差 1000 nz
+UF2跨膜压差 1000 nz
+UF3跨膜压差 1000 nz
+UF4跨膜压差 1000 nz
+UF5跨膜压差 1000 nz
+UF6跨膜压差 1000 nz
+UF7跨膜压差 1000 nz
+UF8跨膜压差 1000 nz

+ 21 - 0
user_maintain_dictionary/jieba_words/产水压力.txt

@@ -0,0 +1,21 @@
+产水压力 1000
+RO总产水压力 1000 nz
+RO产水压力 1000 nz
+RO1产水压力 1000 nz
+RO2产水压力 1000 nz
+RO3产水压力 1000 nz
+RO4产水压力 1000 nz
+RO5产水压力 1000 nz
+RO6产水压力 1000 nz
+RO7产水压力 1000 nz
+RO8产水压力 1000 nz
+UF总产水压力 1000 nz
+UF产水压力 1000 nz
+UF1产水压力 1000 nz
+UF2产水压力 1000 nz
+UF3产水压力 1000 nz
+UF4产水压力 1000 nz
+UF5产水压力 1000 nz
+UF6产水压力 1000 nz
+UF7产水压力 1000 nz
+UF8产水压力 1000 nz

+ 30 - 0
user_maintain_dictionary/jieba_words/产水流量.txt

@@ -0,0 +1,30 @@
+产水流量 1000 nz
+UF产水流量 1000 nz
+UF总产水流量 1000 nz
+RO产水流量 1000 nz
+RO总产水流量 1000 nz
+RO4一二段产水流量比值 1000 nz
+RO3一二段产水流量比值 1000 nz
+RO2一二段产水流量比值 1000 nz
+RO1一二段产水流量比值 1000 nz
+RO5一二段产水流量比值 1000 nz
+RO6一二段产水流量比值 1000 nz
+RO7一二段产水流量比值 1000 nz
+RO8一二段产水流量比值 1000 nz
+RO1产水流量 1000 nz
+RO2产水流量 1000 nz
+RO3产水流量 1000 nz
+RO4产水流量 1000 nz
+RO5产水流量 1000 nz
+RO6产水流量 1000 nz
+RO7产水流量 1000 nz
+RO8产水流量 1000 nz
+RO1反渗透二段产水流量 1000 nz
+RO2反渗透二段产水流量 1000 nz
+RO3反渗透二段产水流量 1000 nz
+RO4反渗透二段产水流量 1000 nz
+RO5反渗透二段产水流量 1000 nz
+RO6反渗透二段产水流量 1000 nz
+RO7反渗透二段产水流量 1000 nz
+RO8反渗透二段产水流量 1000 nz
+

+ 27 - 0
user_maintain_dictionary/jieba_words/段压差.txt

@@ -0,0 +1,27 @@
+一段压差 1000 nz
+二段压差 1000 nz
+三段压差 1000 nz
+RO1一段压差 1000 nz
+RO1二段压差 1000 nz
+RO1三段压差 1000 nz
+RO2一段压差 1000 nz
+RO2二段压差 1000 nz
+RO2三段压差 1000 nz
+RO3一段压差 1000 nz
+RO3二段压差 1000 nz
+RO3三段压差 1000 nz
+RO4一段压差 1000 nz
+RO4二段压差 1000 nz
+RO4三段压差 1000 nz
+RO5一段压差 1000 nz
+RO5二段压差 1000 nz
+RO5三段压差 1000 nz
+RO6一段压差 1000 nz
+RO6二段压差 1000 nz
+RO6三段压差 1000 nz
+RO7一段压差 1000 nz
+RO7二段压差 1000 nz
+RO7三段压差 1000 nz
+RO8一段压差 1000 nz
+RO8二段压差 1000 nz
+RO8三段压差 1000 nz

+ 18 - 0
user_maintain_dictionary/jieba_words/浓水压力.txt

@@ -0,0 +1,18 @@
+浓水压力 1000 nz
+RO1一段浓水压力 1000 nz
+RO1二段浓水压力 1000 nz
+RO2一段浓水压力 1000 nz
+RO2二段浓水压力 1000 nz
+RO3一段浓水压力 1000 nz
+RO3二段浓水压力 1000 nz
+RO4一段浓水压力 1000 nz
+RO4二段浓水压力 1000 nz
+RO5一段浓水压力 1000 nz
+RO5二段浓水压力 1000 nz
+RO6一段浓水压力 1000 nz
+RO6二段浓水压力 1000 nz
+RO7一段浓水压力 1000 nz
+RO7二段浓水压力 1000 nz
+RO8一段浓水压力 1000 nz
+RO8二段浓水压力 1000 nz
+

+ 6 - 0
user_maintain_dictionary/jieba_words/浓水流量.txt

@@ -0,0 +1,6 @@
+浓水流量 1000 nz
+RO浓水流量 1000 nz
+RO1反渗透浓水流量 1000 nz
+RO2反渗透浓水流量 1000 nz
+RO3反渗透浓水流量 1000 nz
+RO4反渗透浓水流量 1000 nz

+ 37 - 0
user_maintain_dictionary/jieba_words/进水压力.txt

@@ -0,0 +1,37 @@
+进水压力 1000 nz
+一段进水压力 1000 nz
+二段进水压力 1000 nz
+三段进水压力 1000 nz
+RO1一段进水压力 1000 nz
+RO2一段进水压力 1000 nz
+RO3一段进水压力 1000 nz
+RO4一段进水压力 1000 nz
+RO5一段进水压力 1000 nz
+RO6一段进水压力 1000 nz
+RO7一段进水压力 1000 nz
+RO8一段进水压力 1000 nz
+RO1二段进水压力 1000 nz
+RO2二段进水压力 1000 nz
+RO3二段进水压力 1000 nz
+RO4二段进水压力 1000 nz
+RO5二段进水压力 1000 nz
+RO6二段进水压力 1000 nz
+RO7二段进水压力 1000 nz
+RO8二段进水压力 1000 nz
+RO1三段进水压力 1000 nz
+RO2三段进水压力 1000 nz
+RO3三段进水压力 1000 nz
+RO4三段进水压力 1000 nz
+RO5三段进水压力 1000 nz
+RO6三段进水压力 1000 nz
+RO7三段进水压力 1000 nz
+RO8三段进水压力 1000 nz
+
+UF1进水压力 1000 nz
+UF2进水压力 1000 nz
+UF3进水压力 1000 nz
+UF4进水压力 1000 nz
+UF5进水压力 1000 nz
+UF6进水压力 1000 nz
+UF7进水压力 1000 nz
+UF8进水压力 1000 nz

+ 24 - 0
user_maintain_dictionary/jieba_words/进水流量.txt

@@ -0,0 +1,24 @@
+UF总进水量 1000 nz
+UF进水量 1000 nz
+进水流量 1000 nz
+RO总进水流量 1000 nz
+RO进水流量 1000 nz
+总进水流量 1000 nz
+UF总进水流量 1000 nz
+UF进水流量 1000 nz
+RO1反渗透进水流量 1000 nz
+RO2反渗透进水流量 1000 nz
+RO3反渗透进水流量 1000 nz
+RO4反渗透进水流量 1000 nz
+RO5反渗透进水流量 1000 nz
+RO6反渗透进水流量 1000 nz
+RO7反渗透进水流量 1000 nz
+RO8反渗透进水流量 1000 nz
+UF1进水流量 1000 nz
+UF2进水流量 1000 nz
+UF3进水流量 1000 nz
+UF4进水流量 1000 nz
+UF5进水流量 1000 nz
+UF6进水流量 1000 nz
+UF7进水流量 1000 nz
+UF8进水流量 1000 nz

Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff