Browse Source

通用模板暂存

zfrr 2 months ago
parent
commit
f96cab19b8

BIN
final/ByRules/__pycache__/commonUtil.cpython-39.pyc


BIN
final/ByRules/__pycache__/similarity_answer_json.cpython-39.pyc


BIN
final/ByRules/__pycache__/similarity_answer_json_copy.cpython-39.pyc


BIN
final/ByRules/__pycache__/util.cpython-39.pyc


+ 20 - 3
final/ByRules/app.py

@@ -3,7 +3,8 @@ import sys
 from flask import Flask, request, jsonify
 
 from commonUtil import fill_template, fill_template_auto
-from similarity_answer_json import *
+from similarity_answer_json_copy import *
+# from similarity_answer_json import *
 from util import *
 import os
 
@@ -27,7 +28,8 @@ app = Flask(__name__)
 
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-TEMPLATE_FOLDER = os.path.join(BASE_DIR, "templatesJson")
+TEMPLATE_FOLDER = os.path.join(BASE_DIR, "templatesJson_copy")
+# TEMPLATE_FOLDER = os.path.join(BASE_DIR, "templatesJson")
 DATA_FOLDER = os.path.join(BASE_DIR, "..", "Json", "json_data")
 
 MAPPING_FOLDER = os.path.join(BASE_DIR, "..", "Json", "sjgxys")
@@ -43,7 +45,22 @@ def process_query_route():
         return jsonify({"error": "Query cannot be empty"}), 400
 
     try:
-        result = process_query(query, template_dict, TEMPLATE_FOLDER)
+        # result = process_query(query, template_dict, TEMPLATE_FOLDER)
+        result = process_query(query, template_dict, TEMPLATE_FOLDER,item_lexicon)
+
+        print("用户问句:", query)
+        print("条件为:",result['conditions'])
+        print("匹配的模板 key:", result['matched_key'])
+        print("最相似的模板:", result['matched_template'])
+        print("相似度分数:", result['similarity_score'])
+        print("类型:", result["type"])
+        print("关键词:", result["keywords"])
+        print("查询字段:", result["target"])
+        print("模型名字", result["name"])
+        print("条件", result["conditions"])
+        print("返回的内容是:", result["content"])
+        print("问句是:", result["query"])
+        print("动作是:", result["play"])
 
         #  如果没有该问题模板
         if result['play'] == '疑问':

+ 101 - 99
final/ByRules/similarity_answer_json_copy.py

@@ -20,8 +20,8 @@ template_dict = {
     "17": ["那个省{item}最高?是多少?"],
     "18": ["那个省{item}最低?是多少?"],
     "19": ["省间交易{item}有多少?"],
-    "24": ["某年{item}前{x}名是谁?"],
-    "26": ["某年{item}}第{x}名是谁?"],
+    "24": ["某年{item}前名是谁?"],
+    "26": ["某年{item}}第名是谁?"],
 }
 
 # 用户问句中可能出现的实际实体词,用于识别并替换成占位符进行匹配
@@ -58,18 +58,23 @@ from typing import Tuple, List, Dict
 from datetime import datetime
 import re
 
-def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
+import re
+from typing import Tuple, List, Dict, Union
+from datetime import datetime
+
+def extract_time_location(question: str) -> Tuple[List[Dict], List[str], int]:
     current_date = datetime.now()
     current_year = current_date.year
     current_month = current_date.month
 
-    # 匹配绝对时间
+    # 绝对时间正则
     absolute_patterns = [
-        r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',
-        r'(?P<year>\d{4})年(?P<month>\d{1,2})月',
-        r'(?P<year>\d{4})年'
+        r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',  # 年月日
+        r'(?P<year>\d{4})年(?P<month>\d{1,2})月',                 # 年月
+        r'(?P<year>\d{4})年'                                     # 年
     ]
 
+    # 相对年份映射
     relative_year_mapping = {
         '明年': current_year + 1,
         '今年': current_year,
@@ -77,6 +82,7 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
         '前年': current_year - 2
     }
 
+    # 季度/半年映射
     season_mapping = {
         '一季度': (1, 3),
         '二季度': (4, 6),
@@ -96,7 +102,8 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
     time_results = []
     used_keywords = set()
 
-    # 🆕 处理“起止时间段”,格式:2023年1月到2024年2月、去年1月到今年2月、2023年1月到1月等
+    # =================== 处理起止时间段 ===================
+    # 1. 2023年1月到2024年2月
     range_pattern = r'(?P<start>(\d{4}|今|去|前|明)年(\d{1,2})?月?)到(?P<end>(\d{4}|今|去|前|明)年(\d{1,2})?月?)'
     for match in re.finditer(range_pattern, question):
         start_raw, end_raw = match.group('start'), match.group('end')
@@ -141,10 +148,9 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
         })
         used_keywords.add(match.group())
 
-    # 🆕 新增匹配“2024年1月到2月”,结束时间没有写年份,默认与开始时间同年
+    # 2. 2023年1月到2月(简化格式)
     partial_range_pattern = r'(?P<year>\d{4})年(?P<start_month>\d{1,2})月到(?P<end_month>\d{1,2})月'
     for match in re.finditer(partial_range_pattern, question):
-        # 避免重复匹配已经被上面时间段匹配使用过的字符串
         if match.group() in used_keywords:
             continue
         year = int(match.group('year'))
@@ -160,18 +166,18 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
         })
         used_keywords.add(match.group())
 
-    # 相对+具体月份
-    relative_absolute_pattern = r'(?P<relative>今|去|前)年(?P<month>\d{1,2})月'
+    # 3. 相对+具体月份(如“去年2月”)
+    relative_absolute_pattern = r'(?P<relative>今|去|前|明)年(?P<month>\d{1,2})月'
     for match in re.finditer(relative_absolute_pattern, question):
         if match.group() in used_keywords:
             continue
         rel = match.group('relative')
         month = int(match.group('month'))
-        year = {'今': current_year, '去': current_year - 1, '前': current_year - 2}.get(rel, current_year)
+        year = {'今': current_year, '去': current_year - 1, '前': current_year - 2, '明': current_year + 1}[rel]
         time_results.append({'year': year, 'month': month, 'raw': match.group()})
         used_keywords.add(match.group())
 
-    # 绝对时间
+    # 4. 绝对时间(例如“2023年2月”)
     for pattern in absolute_patterns:
         for match in re.finditer(pattern, question):
             if match.group() in used_keywords:
@@ -187,13 +193,13 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
             time_results.append(time_info)
             used_keywords.add(match.group())
 
-    # 单独的相对年份关键词
+    # 5. 相对年份词(今年、去年等)
     for term, year in relative_year_mapping.items():
         if term in question and term not in used_keywords:
             time_results.append({'year': year, 'label': term, 'raw': term})
             used_keywords.add(term)
 
-    # 当前/上个月
+    # 6. 当前、上个月
     if '当前' in question and '当前' not in used_keywords:
         time_results.append({'year': current_year, 'month': current_month, 'label': '当前', 'raw': '当前'})
         used_keywords.add('当前')
@@ -202,17 +208,16 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
         prev_month = current_month - 1 if current_month > 1 else 12
         time_results.append({'year': prev_year, 'month': prev_month, 'label': '上个月', 'raw': '上个月'})
         used_keywords.add('上个月')
-    # ✅ 添加:当年
+
+    # 7. 当年、当月
     if '当年' in question and '当年' not in used_keywords:
         time_results.append({'year': current_year, 'label': '当年', 'raw': '当年'})
         used_keywords.add('当年')
-
-    # ✅ 添加:当月
     if '当月' in question and '当月' not in used_keywords:
         time_results.append({'year': current_year, 'month': current_month, 'label': '当月', 'raw': '当月'})
         used_keywords.add('当月')
 
-    # 季度和半年
+    # 8. 季度、半年
     for term, (start_month, end_month) in season_mapping.items():
         if term in question and term not in used_keywords:
             time_results.append({
@@ -224,80 +229,65 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
             })
             used_keywords.add(term)
 
-    # 地点识别
+    # =================== 地点提取 ===================
     locations = [p for p in provinces if p in question]
 
-    # 提取“前N名”或“Top N”格式
-    # 中文数字转阿拉伯数字映射(可扩展)
+    # =================== 排名提取 ===================
     chinese_digit_map = {
         '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
-        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
-        '两':2
+        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, '两': 2
     }
-
-    # question = "2022年受入电量前五名是谁"
-
-    # 匹配“前五”或“top 5”等形式
     rank_match = re.search(r'(前|top\s*)(\d+|[一二两三四五六七八九十])', question, re.IGNORECASE)
-
-    # rank_match = re.search(
-    #     r'(前|top\s*|第\s*)(\d+|[一二三四五六七八九十])\s*(名)?',
-    #     question,
-    #     re.IGNORECASE
-    # )
-
     if rank_match:
         rank_str = rank_match.group(2)
         if rank_str.isdigit():
             rank = int(rank_str)
         else:
             rank = chinese_digit_map.get(rank_str, None)
-
-        # print(f"匹配到的排名为:{rank}")
     else:
         rank = None
-        print("未匹配到排名")
 
     return time_results, locations, rank
 
+
 # 先用 extract_time_location 判断问句包含哪类时间信息,然后只对结构匹配的模板子集做余弦匹配。
 # def classify_by_time_type(query, time_info):
 #     if any('start_year' in t and 'end_year' in t for t in time_info):
 #         return ['3']  # 时间段
 #     return list(template_dict.keys())  # fallback 所有模板
 #888
-def classify_by_time_type(query, time_info):
-    if not time_info:
-        # 无时间信息时,返回指定模板 19-23
-        return ['19', '20', '21', '22', '23', '17.1', '17.2', '17.3', '17.4', '18.1', '18.2', '18.3', '18.4','24','25']
-
-    time = time_info[0]
-
-    # 情况 1:起始时间和结束时间都有,判断为时间段
-    if 'start_year' in time and 'end_year' in time:
-        return ['3']  # 某年某月到某月累计交易电量
-
-    # 情况 2:有 year 和 month,精确到月
-    if 'year' in time and 'month' in time:
-        return ['2','16.1','16.2','20']  # 某年某月交易电量
-
-    # 情况 3:仅 year,全年
-    if 'year' in time and 'month' not in time:
-        return ['1','8.1','8.2','8.3','8.4','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','9.10','9.11','9.12','9.13','9.14','9.15','9.16','9.17','16.1','16.2','21','22','23','24','25']  # 某年全年累计交易电量
-def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
-    """
-    先基于时间信息筛选候选模板,再进行TF-IDF匹配。
-    """
-    # 提取时间
-    time_info, _, _ = extract_time_location_func(query)
-    print(time_info)
-    # 通过时间判断候选模板 key
-    candidate_keys = classify_by_time_type(query, time_info)
-    print(candidate_keys)
-    # 构造候选子模板字典
-    filtered_template_dict = {k: template_dict[k] for k in candidate_keys}
-    # 使用你原来的 TF-IDF 匹配函数
-    return match_template(query, filtered_template_dict, tokenizer)
+# def classify_by_time_type(query, time_info):
+#     if not time_info:
+#         # 无时间信息时,返回指定模板 19-23
+#         return ['19', '20', '21', '22', '23', '17.1', '17.2', '17.3', '17.4', '18.1', '18.2', '18.3', '18.4','24','25']
+#
+#     time = time_info[0]
+#
+#     # 情况 1:起始时间和结束时间都有,判断为时间段
+#     if 'start_year' in time and 'end_year' in time:
+#         return ['3']  # 某年某月到某月累计交易电量
+#
+#     # 情况 2:有 year 和 month,精确到月
+#     if 'year' in time and 'month' in time:
+#         return ['2','16.1','16.2','20']  # 某年某月交易电量
+#
+#     # 情况 3:仅 year,全年
+#     if 'year' in time and 'month' not in time:
+#         return ['1','8.1','8.2','8.3','8.4','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','9.10','9.11','9.12','9.13','9.14','9.15','9.16','9.17','16.1','16.2','21','22','23','24','25']  # 某年全年累计交易电量
+# def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
+#     """
+#     先基于时间信息筛选候选模板,再进行TF-IDF匹配。
+#     """
+#     # 提取时间
+#     time_info, _, _ = extract_time_location_func(query)
+#     print(time_info)
+#     # 通过时间判断候选模板 key
+#     candidate_keys = classify_by_time_type(query, time_info)
+#     print(candidate_keys)
+#     # 构造候选子模板字典
+#     filtered_template_dict = {k: template_dict[k] for k in candidate_keys}
+#     # 使用你原来的 TF-IDF 匹配函数
+#     return match_template(query, filtered_template_dict, tokenizer)
 # 根据模板去对应的json文件中找数据
 def load_template_info(matched_key, json_folder):
     """
@@ -379,11 +369,20 @@ def extract_item_from_query(template, query):
         return query[start:end]
     return None
 
+# Normalize explicit years/months in a query to the "某年" / "某月" placeholders.
+def replace_time_in_query(query):
+    """
+    Replace concrete year/month mentions in ``query`` with generic placeholders.
+
+    Four digits followed by "年" become "某年", and one or two digits followed
+    by "月" become "某月", so that specific dates do not interfere with
+    template matching.
+    Example: '2023年5月电量' => '某年某月电量'
+
+    Args:
+        query: The question text to normalize.
+
+    Returns:
+        The query with every matching year/month span replaced. Text that
+        matches neither pattern (for example relative terms such as "去年",
+        or a day part such as "3日") is returned unchanged.
+    """
+    query = re.sub(r'\d{4}年', '某年', query)  # replace the year first
+    query = re.sub(r'\d{1,2}月', '某月', query)  # then replace the month
+    return query
 
 # ==========================
 # 匹配模板主函数
 # ==========================
-def match_template(query, template_dict, json_folder,item_lexicon):
+def process_query(query, template_dict, json_folder, item_lexicon=None):
     """
     主函数:将用户 query 与模板进行匹配,并返回最相似模板及提取信息
     参数:
@@ -393,6 +392,8 @@ def match_template(query, template_dict, json_folder,item_lexicon):
     返回:
         dict: 包含匹配信息(key、模板、相似度、提取的实体)
     """
+    if item_lexicon is None:
+        item_lexicon = item_lexicon
     # 提取条件
     time_info, location_info, rank_info = extract_time_location(query)
     conditions = {}
@@ -407,7 +408,8 @@ def match_template(query, template_dict, json_folder,item_lexicon):
     # found_item: 找到的实体词(如 "送出电量"),如果没找到则为 None
     # found_item_key实体来源key
     query_for_match, found_item ,found_item_key = replace_item_in_query(query, item_lexicon)
-
+    # 对 query_for_match 中的时间(年/月)进行归一化,变为“某年某月”避免时间影响匹配
+    query_for_match = replace_time_in_query(query_for_match)
     # 计算 TF-IDF 相似度
     vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
     tfidf_matrix = vectorizer.fit_transform([query_for_match] + templates)
@@ -463,7 +465,7 @@ def match_template(query, template_dict, json_folder,item_lexicon):
     # 从原始 query 中提取 item 实体值
     extracted_item = extract_item_from_query(best_match_template, query)
 
-
+    print(found_item)
     match_json = found_item_key + "_" + found_item[0]
     template_info = load_template_info(match_json, json_folder)
 
@@ -485,6 +487,8 @@ def match_template(query, template_dict, json_folder,item_lexicon):
     content = template_info.get("content", "")
     # 动作类型
     play = template_info.get("play", "")
+    # 问题序号
+    qcode = template_info.get("qcode", "")
     # 返回最终结果
     result = {
         "conditions": conditions,
@@ -503,36 +507,34 @@ def match_template(query, template_dict, json_folder,item_lexicon):
         "play": play,
         "find_max": find_max,
         "value_key": value_key,
-        "name_key": name_key
+        "name_key": name_key,
+        "qcode": qcode
     }
+
     return result
 
 
 # ==========================
 # 测试部分(main 入口)
 # ==========================
-if __name__ == "__main__":
-    # query = "2024年省间交易电量按交易周期划分的电量是多少?"
-    # query = "2024年省间交易电量月度交易电量是多少?"
-    # query = "2024年全年累计省间交易电量是多少?"
-    query = "那个省送出电量最高?是多少?"
-
-    json_folder = "templatesJson_copy"
-    result = match_template(query, template_dict, json_folder, item_lexicon)
-
-    print("用户问句:", query)
-    print("条件为:",result['conditions'])
-    print("匹配的模板 key:", result['matched_key'])
-    print("最相似的模板:", result['matched_template'])
-    print("相似度分数:", result['similarity_score'])
-    print("提取出的 item:", result['extracted_item'])
-    print("提取出的template_info:", result['template_info'])
-    print("类型:", result["type"])
-    print("关键词:", result["keywords"])
-    print("查询字段:", result["target"])
-    print("模型名字", result["name"])
-    print("条件", result["conditions"])
-    print("返回的内容是:", result["content"])
-    print("问句是:", result["query"])
-    print("动作是:", result["play"])
+# if __name__ == "__main__":
+#
+#     query = "2023年2月交易电量是多少?"
+#
+#     json_folder = "templatesJson_copy"
+#     result = process_query(query, template_dict, json_folder, item_lexicon)
+#     print(result)
+#     print("用户问句:", query)
+#     print("条件为:",result['conditions'])
+#     print("匹配的模板 key:", result['matched_key'])
+#     print("最相似的模板:", result['matched_template'])
+#     print("相似度分数:", result['similarity_score'])
+#     print("类型:", result["type"])
+#     print("关键词:", result["keywords"])
+#     print("查询字段:", result["target"])
+#     print("模型名字", result["name"])
+#     print("条件", result["conditions"])
+#     print("返回的内容是:", result["content"])
+#     print("问句是:", result["query"])
+#     print("动作是:", result["play"])
 

+ 2 - 2
final/ByRules/templatesJson_copy/26.json → final/ByRules/templatesJson_copy/26_受入电量.json

@@ -1,9 +1,9 @@
 {
   "dataJsonName": "sjjy1_B05_output",
-  "type": "topN",
+  "type": "rank",
   "value_key": "受入电量",
   "name_key": "单位",
-  "top_n": 5,
+  "rank": 1,
   "content": "受入电量排名分别为:&",
   "play": "讲述文本",
   "name": "省间交易",

+ 11 - 0
final/ByRules/templatesJson_copy/26_送出电量.json

@@ -0,0 +1,11 @@
+{
+  "dataJsonName": "sjjy1_B05_output",
+  "type": "rank",
+  "value_key": "送出电量",
+  "name_key": "单位",
+  "rank": 1,
+  "content": "送出电量排名分别为:&",
+  "play": "讲述文本",
+  "name": "省间交易",
+  "qcode": "25"
+}