|
@@ -52,7 +52,8 @@ template_dict = {
|
|
|
"23": ["省间交易当年参与交易的家次有多少?"],
|
|
|
"24": ["某年送出电量前五名是谁?"],
|
|
|
"25": ["某年受入电量前五名是谁?"],
|
|
|
- "26": ["某年送出电量第五名是谁?"],
|
|
|
+ "26": ["某年受入电量第五名是谁?"],
|
|
|
+ "27": ["某年送出电量第五名是谁?"],
|
|
|
}
|
|
|
# 将地点映射成相应的代码
|
|
|
def map_location_to_unit(location: str) -> str:
|
|
@@ -269,9 +270,32 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
# print(f"匹配到的排名为:{rank}")
|
|
|
else:
|
|
|
rank = None
|
|
|
- print("未匹配到排名")
|
|
|
+ # print("未匹配到排名")
|
|
|
|
|
|
- return time_results, locations, rank
|
|
|
+
|
|
|
+
|
|
|
+ # 仅匹配“第N名”形式(如“第五名”“第 5 名”);不匹配“前五”或“top 5”,中文数字只支持单个字符
|
|
|
+ rank_match2 = re.search(r'(第\s*)(\d+|[一二两三四五六七八九十])\s*(名)', question, re.IGNORECASE)
|
|
|
+
|
|
|
+ # rank_match = re.search(
|
|
|
+ # r'(前|top\s*|第\s*)(\d+|[一二三四五六七八九十])\s*(名)?',
|
|
|
+ # question,
|
|
|
+ # re.IGNORECASE
|
|
|
+ # )
|
|
|
+
|
|
|
+ if rank_match2:
|
|
|
+ rank_str = rank_match2.group(2)
|
|
|
+ if rank_str.isdigit():
|
|
|
+ rank2 = int(rank_str)
|
|
|
+ else:
|
|
|
+ rank2 = chinese_digit_map.get(rank_str, None)
|
|
|
+
|
|
|
+ # print(f"匹配到的排名为:{rank}")
|
|
|
+ else:
|
|
|
+ rank2 = None
|
|
|
+ # print("未匹配到排名")
|
|
|
+
|
|
|
+ return time_results, locations, rank, rank2
|
|
|
|
|
|
# 先用 extract_time_location 判断问句包含哪类时间信息,然后只对结构匹配的模板子集做余弦匹配。
|
|
|
# def classify_by_time_type(query, time_info):
|
|
@@ -282,7 +306,7 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
def classify_by_time_type(query, time_info):
|
|
|
if not time_info:
|
|
|
# 无时间信息时,返回以下候选模板:17.1–17.4、18.1–18.4 以及 19–27
|
|
|
- return ['19', '20', '21', '22', '23', '17.1', '17.2', '17.3', '17.4', '18.1', '18.2', '18.3', '18.4','24','25']
|
|
|
+ return ['19', '20', '21', '22', '23', '17.1', '17.2', '17.3', '17.4', '18.1', '18.2', '18.3', '18.4','24','25','26','27']
|
|
|
|
|
|
time = time_info[0]
|
|
|
|
|
@@ -296,17 +320,17 @@ def classify_by_time_type(query, time_info):
|
|
|
|
|
|
# 情况 3:仅 year,全年
|
|
|
if 'year' in time and 'month' not in time:
|
|
|
- return ['1','8.1','8.2','8.3','8.4','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','9.10','9.11','9.12','9.13','9.14','9.15','9.16','9.17','16.1','16.2','21','22','23','24','25'] # 某年全年累计交易电量
|
|
|
+ return ['1','8.1','8.2','8.3','8.4','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','9.10','9.11','9.12','9.13','9.14','9.15','9.16','9.17','16.1','16.2','21','22','23','24','25','26','27'] # 某年全年累计交易电量
|
|
|
def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
|
|
|
"""
|
|
|
先基于时间信息筛选候选模板,再进行TF-IDF匹配。
|
|
|
"""
|
|
|
# 提取时间
|
|
|
- time_info, _, _ = extract_time_location_func(query)
|
|
|
- print(time_info)
|
|
|
+ time_info, _, _, _ = extract_time_location_func(query)
|
|
|
+ # print(time_info)
|
|
|
# 通过时间判断候选模板 key
|
|
|
candidate_keys = classify_by_time_type(query, time_info)
|
|
|
- print(candidate_keys)
|
|
|
+ # print(candidate_keys)
|
|
|
# 构造候选子模板字典
|
|
|
filtered_template_dict = {k: template_dict[k] for k in candidate_keys}
|
|
|
# 使用已有的 TF-IDF 匹配函数
|
|
@@ -377,7 +401,7 @@ def load_template_info(matched_key, json_folder):
|
|
|
return data
|
|
|
def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
# 提取条件
|
|
|
- time_info, location_info, rank_info = extract_time_location(query)
|
|
|
+ time_info, location_info, rank_info, rank_info2 = extract_time_location(query)
|
|
|
|
|
|
conditions = {}
|
|
|
# 匹配模板
|
|
@@ -427,6 +451,10 @@ def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
|
|
|
if rank_info:
|
|
|
conditions['rank'] = rank_info
|
|
|
+
|
|
|
+ if rank_info2:
|
|
|
+ conditions['rank2'] = rank_info2
|
|
|
+
|
|
|
# 查询模板json
|
|
|
template_info = load_template_info(matched_key, json_folder)
|
|
|
# 模板的关键词
|
|
@@ -447,6 +475,8 @@ def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
content = template_info.get("content", "")
|
|
|
# 动作类型
|
|
|
play = template_info.get("play", "")
|
|
|
+ # 问题序号
|
|
|
+ qcode = template_info.get("qcode", "")
|
|
|
return {
|
|
|
"matched_key": matched_key,
|
|
|
"matched_template": best_sentence,
|
|
@@ -462,8 +492,8 @@ def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
"play": play,
|
|
|
"find_max": find_max,
|
|
|
"value_key": value_key,
|
|
|
- "name_key": name_key
|
|
|
-
|
|
|
+ "name_key": name_key,
|
|
|
+ "qcode": qcode
|
|
|
}
|
|
|
# 查询类
|
|
|
def smart_find_value(folder_path, file_name, conditions: dict, target_key: str):
|
|
@@ -528,26 +558,28 @@ def find_key_recursively(data, target_key):
|
|
|
|
|
|
# query = "2023年省间交易电量新能源交易电量是多少??"
|
|
|
# query = "今年1月到2023年2月累计交易电量是多少?"
|
|
|
+# query = "2024年送出电量第二名是谁?"
|
|
|
+query = "2024年7月、8月、12月交易电量的平均值是多少?"
|
|
|
+
|
|
|
+json_folder = "templatesJson"
|
|
|
+
|
|
|
#
|
|
|
-# json_folder = "templatesJson"
|
|
|
#
|
|
|
-# #
|
|
|
-# #
|
|
|
-# result = process_query(query, template_dict, json_folder)
|
|
|
+result = process_query(query, template_dict, json_folder)
|
|
|
#
|
|
|
-# # print(result)
|
|
|
-# print(result['content'])
|
|
|
-# print("匹配的模板 key:", result["matched_key"])
|
|
|
-# print("最相似的模板句:", result["matched_template"])
|
|
|
-# print("相似度分数:", result["similarity_score"])
|
|
|
-# print("类型:", result["type"])
|
|
|
-# print("关键词:", result["keywords"])
|
|
|
-# print("查询字段:", result["target"])
|
|
|
-# print("模型名字", result["name"])
|
|
|
-# print("条件", result["conditions"])
|
|
|
-# print("返回的内容是:", result["content"])
|
|
|
-# print("问句是:", result["query"])
|
|
|
-# print("动作是:", result["play"])
|
|
|
+print(result)
|
|
|
+print(result['content'])
|
|
|
+print("匹配的模板 key:", result["matched_key"])
|
|
|
+print("最相似的模板句:", result["matched_template"])
|
|
|
+print("相似度分数:", result["similarity_score"])
|
|
|
+print("类型:", result["type"])
|
|
|
+print("关键词:", result["keywords"])
|
|
|
+print("查询字段:", result["target"])
|
|
|
+print("模型名字", result["name"])
|
|
|
+print("条件", result["conditions"])
|
|
|
+print("返回的内容是:", result["content"])
|
|
|
+print("问句是:", result["query"])
|
|
|
+print("动作是:", result["play"])
|
|
|
|
|
|
# query = "当月送出均价最高的是哪个省??"
|
|
|
# query = ("2025年送出电量前五名是谁??")
|