|
@@ -20,8 +20,8 @@ template_dict = {
|
|
|
"17": ["那个省{item}最高?是多少?"],
|
|
|
"18": ["那个省{item}最低?是多少?"],
|
|
|
"19": ["省间交易{item}有多少?"],
|
|
|
- "24": ["某年{item}前{x}名是谁?"],
|
|
|
- "26": ["某年{item}}第{x}名是谁?"],
|
|
|
+ "24": ["某年{item}前几名是谁?"],
|
|
|
+ "26": ["某年{item}}第几名是谁?"],
|
|
|
}
|
|
|
|
|
|
# 用户问句中可能出现的实际实体词,用于识别并替换成占位符进行匹配
|
|
@@ -58,18 +58,23 @@ from typing import Tuple, List, Dict
|
|
|
from datetime import datetime
|
|
|
import re
|
|
|
|
|
|
-def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
+import re
|
|
|
+from typing import Tuple, List, Dict, Union
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
+def extract_time_location(question: str) -> Tuple[List[Dict], List[str], int]:
|
|
|
current_date = datetime.now()
|
|
|
current_year = current_date.year
|
|
|
current_month = current_date.month
|
|
|
|
|
|
- # 匹配绝对时间
|
|
|
+ # 绝对时间正则
|
|
|
absolute_patterns = [
|
|
|
- r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',
|
|
|
- r'(?P<year>\d{4})年(?P<month>\d{1,2})月',
|
|
|
- r'(?P<year>\d{4})年'
|
|
|
+ r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日', # 年月日
|
|
|
+ r'(?P<year>\d{4})年(?P<month>\d{1,2})月', # 年月
|
|
|
+ r'(?P<year>\d{4})年' # 年
|
|
|
]
|
|
|
|
|
|
+ # 相对年份映射
|
|
|
relative_year_mapping = {
|
|
|
'明年': current_year + 1,
|
|
|
'今年': current_year,
|
|
@@ -77,6 +82,7 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
'前年': current_year - 2
|
|
|
}
|
|
|
|
|
|
+ # 季度/半年映射
|
|
|
season_mapping = {
|
|
|
'一季度': (1, 3),
|
|
|
'二季度': (4, 6),
|
|
@@ -96,7 +102,8 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
time_results = []
|
|
|
used_keywords = set()
|
|
|
|
|
|
- # 🆕 处理“起止时间段”,格式:2023年1月到2024年2月、去年1月到今年2月、2023年1月到1月等
|
|
|
+ # =================== 处理起止时间段 ===================
|
|
|
+ # 1. 2023年1月到2024年2月
|
|
|
range_pattern = r'(?P<start>(\d{4}|今|去|前|明)年(\d{1,2})?月?)到(?P<end>(\d{4}|今|去|前|明)年(\d{1,2})?月?)'
|
|
|
for match in re.finditer(range_pattern, question):
|
|
|
start_raw, end_raw = match.group('start'), match.group('end')
|
|
@@ -141,10 +148,9 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
})
|
|
|
used_keywords.add(match.group())
|
|
|
|
|
|
- # 🆕 新增匹配“2024年1月到2月”,结束时间没有写年份,默认与开始时间同年
|
|
|
+ # 2. 2023年1月到2月(简化格式)
|
|
|
partial_range_pattern = r'(?P<year>\d{4})年(?P<start_month>\d{1,2})月到(?P<end_month>\d{1,2})月'
|
|
|
for match in re.finditer(partial_range_pattern, question):
|
|
|
- # 避免重复匹配已经被上面时间段匹配使用过的字符串
|
|
|
if match.group() in used_keywords:
|
|
|
continue
|
|
|
year = int(match.group('year'))
|
|
@@ -160,18 +166,18 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
})
|
|
|
used_keywords.add(match.group())
|
|
|
|
|
|
- # 相对+具体月份
|
|
|
- relative_absolute_pattern = r'(?P<relative>今|去|前)年(?P<month>\d{1,2})月'
|
|
|
+ # 3. 相对+具体月份(如“去年2月”)
|
|
|
+ relative_absolute_pattern = r'(?P<relative>今|去|前|明)年(?P<month>\d{1,2})月'
|
|
|
for match in re.finditer(relative_absolute_pattern, question):
|
|
|
if match.group() in used_keywords:
|
|
|
continue
|
|
|
rel = match.group('relative')
|
|
|
month = int(match.group('month'))
|
|
|
- year = {'今': current_year, '去': current_year - 1, '前': current_year - 2}.get(rel, current_year)
|
|
|
+ year = {'今': current_year, '去': current_year - 1, '前': current_year - 2, '明': current_year + 1}[rel]
|
|
|
time_results.append({'year': year, 'month': month, 'raw': match.group()})
|
|
|
used_keywords.add(match.group())
|
|
|
|
|
|
- # 绝对时间
|
|
|
+ # 4. 绝对时间(例如“2023年2月”)
|
|
|
for pattern in absolute_patterns:
|
|
|
for match in re.finditer(pattern, question):
|
|
|
if match.group() in used_keywords:
|
|
@@ -187,13 +193,13 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
time_results.append(time_info)
|
|
|
used_keywords.add(match.group())
|
|
|
|
|
|
- # 单独的相对年份关键词
|
|
|
+ # 5. 相对年份词(今年、去年等)
|
|
|
for term, year in relative_year_mapping.items():
|
|
|
if term in question and term not in used_keywords:
|
|
|
time_results.append({'year': year, 'label': term, 'raw': term})
|
|
|
used_keywords.add(term)
|
|
|
|
|
|
- # 当前/上个月
|
|
|
+ # 6. 当前、上个月
|
|
|
if '当前' in question and '当前' not in used_keywords:
|
|
|
time_results.append({'year': current_year, 'month': current_month, 'label': '当前', 'raw': '当前'})
|
|
|
used_keywords.add('当前')
|
|
@@ -202,17 +208,16 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
prev_month = current_month - 1 if current_month > 1 else 12
|
|
|
time_results.append({'year': prev_year, 'month': prev_month, 'label': '上个月', 'raw': '上个月'})
|
|
|
used_keywords.add('上个月')
|
|
|
- # ✅ 添加:当年
|
|
|
+
|
|
|
+ # 7. 当年、当月
|
|
|
if '当年' in question and '当年' not in used_keywords:
|
|
|
time_results.append({'year': current_year, 'label': '当年', 'raw': '当年'})
|
|
|
used_keywords.add('当年')
|
|
|
-
|
|
|
- # ✅ 添加:当月
|
|
|
if '当月' in question and '当月' not in used_keywords:
|
|
|
time_results.append({'year': current_year, 'month': current_month, 'label': '当月', 'raw': '当月'})
|
|
|
used_keywords.add('当月')
|
|
|
|
|
|
- # 季度和半年
|
|
|
+ # 8. 季度、半年
|
|
|
for term, (start_month, end_month) in season_mapping.items():
|
|
|
if term in question and term not in used_keywords:
|
|
|
time_results.append({
|
|
@@ -224,80 +229,65 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
})
|
|
|
used_keywords.add(term)
|
|
|
|
|
|
- # 地点识别
|
|
|
+ # =================== 地点提取 ===================
|
|
|
locations = [p for p in provinces if p in question]
|
|
|
|
|
|
- # 提取“前N名”或“Top N”格式
|
|
|
- # 中文数字转阿拉伯数字映射(可扩展)
|
|
|
+ # =================== 排名提取 ===================
|
|
|
chinese_digit_map = {
|
|
|
'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
|
|
|
- '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
|
|
|
- '两':2
|
|
|
+ '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, '两': 2
|
|
|
}
|
|
|
-
|
|
|
- # question = "2022年受入电量前五名是谁"
|
|
|
-
|
|
|
- # 匹配“前五”或“top 5”等形式
|
|
|
rank_match = re.search(r'(前|top\s*)(\d+|[一二两三四五六七八九十])', question, re.IGNORECASE)
|
|
|
-
|
|
|
- # rank_match = re.search(
|
|
|
- # r'(前|top\s*|第\s*)(\d+|[一二三四五六七八九十])\s*(名)?',
|
|
|
- # question,
|
|
|
- # re.IGNORECASE
|
|
|
- # )
|
|
|
-
|
|
|
if rank_match:
|
|
|
rank_str = rank_match.group(2)
|
|
|
if rank_str.isdigit():
|
|
|
rank = int(rank_str)
|
|
|
else:
|
|
|
rank = chinese_digit_map.get(rank_str, None)
|
|
|
-
|
|
|
- # print(f"匹配到的排名为:{rank}")
|
|
|
else:
|
|
|
rank = None
|
|
|
- print("未匹配到排名")
|
|
|
|
|
|
return time_results, locations, rank
|
|
|
|
|
|
+
|
|
|
# 先用 extract_time_location 判断问句包含哪类时间信息,然后只对结构匹配的模板子集做余弦匹配。
|
|
|
# def classify_by_time_type(query, time_info):
|
|
|
# if any('start_year' in t and 'end_year' in t for t in time_info):
|
|
|
# return ['3'] # 时间段
|
|
|
# return list(template_dict.keys()) # fallback 所有模板
|
|
|
#888
|
|
|
-def classify_by_time_type(query, time_info):
|
|
|
- if not time_info:
|
|
|
- # 无时间信息时,返回指定模板 19-23
|
|
|
- return ['19', '20', '21', '22', '23', '17.1', '17.2', '17.3', '17.4', '18.1', '18.2', '18.3', '18.4','24','25']
|
|
|
-
|
|
|
- time = time_info[0]
|
|
|
-
|
|
|
- # 情况 1:起始时间和结束时间都有,判断为时间段
|
|
|
- if 'start_year' in time and 'end_year' in time:
|
|
|
- return ['3'] # 某年某月到某月累计交易电量
|
|
|
-
|
|
|
- # 情况 2:有 year 和 month,精确到月
|
|
|
- if 'year' in time and 'month' in time:
|
|
|
- return ['2','16.1','16.2','20'] # 某年某月交易电量
|
|
|
-
|
|
|
- # 情况 3:仅 year,全年
|
|
|
- if 'year' in time and 'month' not in time:
|
|
|
- return ['1','8.1','8.2','8.3','8.4','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','9.10','9.11','9.12','9.13','9.14','9.15','9.16','9.17','16.1','16.2','21','22','23','24','25'] # 某年全年累计交易电量
|
|
|
-def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
|
|
|
- """
|
|
|
- 先基于时间信息筛选候选模板,再进行TF-IDF匹配。
|
|
|
- """
|
|
|
- # 提取时间
|
|
|
- time_info, _, _ = extract_time_location_func(query)
|
|
|
- print(time_info)
|
|
|
- # 通过时间判断候选模板 key
|
|
|
- candidate_keys = classify_by_time_type(query, time_info)
|
|
|
- print(candidate_keys)
|
|
|
- # 构造候选子模板字典
|
|
|
- filtered_template_dict = {k: template_dict[k] for k in candidate_keys}
|
|
|
- # 使用你原来的 TF-IDF 匹配函数
|
|
|
- return match_template(query, filtered_template_dict, tokenizer)
|
|
|
+# def classify_by_time_type(query, time_info):
|
|
|
+# if not time_info:
|
|
|
+# # 无时间信息时,返回指定模板 19-23
|
|
|
+# return ['19', '20', '21', '22', '23', '17.1', '17.2', '17.3', '17.4', '18.1', '18.2', '18.3', '18.4','24','25']
|
|
|
+#
|
|
|
+# time = time_info[0]
|
|
|
+#
|
|
|
+# # 情况 1:起始时间和结束时间都有,判断为时间段
|
|
|
+# if 'start_year' in time and 'end_year' in time:
|
|
|
+# return ['3'] # 某年某月到某月累计交易电量
|
|
|
+#
|
|
|
+# # 情况 2:有 year 和 month,精确到月
|
|
|
+# if 'year' in time and 'month' in time:
|
|
|
+# return ['2','16.1','16.2','20'] # 某年某月交易电量
|
|
|
+#
|
|
|
+# # 情况 3:仅 year,全年
|
|
|
+# if 'year' in time and 'month' not in time:
|
|
|
+# return ['1','8.1','8.2','8.3','8.4','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','9.10','9.11','9.12','9.13','9.14','9.15','9.16','9.17','16.1','16.2','21','22','23','24','25'] # 某年全年累计交易电量
|
|
|
+# def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
|
|
|
+# """
|
|
|
+# 先基于时间信息筛选候选模板,再进行TF-IDF匹配。
|
|
|
+# """
|
|
|
+# # 提取时间
|
|
|
+# time_info, _, _ = extract_time_location_func(query)
|
|
|
+# print(time_info)
|
|
|
+# # 通过时间判断候选模板 key
|
|
|
+# candidate_keys = classify_by_time_type(query, time_info)
|
|
|
+# print(candidate_keys)
|
|
|
+# # 构造候选子模板字典
|
|
|
+# filtered_template_dict = {k: template_dict[k] for k in candidate_keys}
|
|
|
+# # 使用你原来的 TF-IDF 匹配函数
|
|
|
+# return match_template(query, filtered_template_dict, tokenizer)
|
|
|
# 根据模板去对应的json文件中找数据
|
|
|
def load_template_info(matched_key, json_folder):
|
|
|
"""
|
|
@@ -379,11 +369,20 @@ def extract_item_from_query(template, query):
|
|
|
return query[start:end]
|
|
|
return None
|
|
|
|
|
|
+# 将问句中的年月换成某年某月
|
|
|
def replace_time_in_query(query: str) -> str:
    """Replace concrete years and months in a query with generic placeholders.

    A four-digit year followed by "年" becomes "某年", and a one- or two-digit
    month followed by "月" becomes "某月", so that specific dates do not
    interfere with template matching.

    Example: '2023年5月电量' => '某年某月电量'

    Args:
        query: The question text to normalise.

    Returns:
        The query with absolute year/month expressions replaced. Relative
        expressions such as "去年" are not matched by these patterns and are
        returned unchanged.
    """
    # The (?<!\d) guard keeps the pattern from biting off the tail of a longer
    # digit run (e.g. "12023年" or "123月"), which would otherwise leave stray
    # digits in front of the placeholder.
    query = re.sub(r'(?<!\d)\d{4}年', '某年', query)    # years first
    query = re.sub(r'(?<!\d)\d{1,2}月', '某月', query)  # then months
    return query
|
|
|
|
|
|
# ==========================
|
|
|
# 匹配模板主函数
|
|
|
# ==========================
|
|
|
-def match_template(query, template_dict, json_folder,item_lexicon):
|
|
|
+def process_query(query, template_dict, json_folder, item_lexicon=None):
|
|
|
"""
|
|
|
主函数:将用户 query 与模板进行匹配,并返回最相似模板及提取信息
|
|
|
参数:
|
|
@@ -393,6 +392,8 @@ def match_template(query, template_dict, json_folder,item_lexicon):
|
|
|
返回:
|
|
|
dict: 包含匹配信息(key、模板、相似度、提取的实体)
|
|
|
"""
|
|
|
+ if item_lexicon is None:
|
|
|
+ item_lexicon = item_lexicon
|
|
|
# 提取条件
|
|
|
time_info, location_info, rank_info = extract_time_location(query)
|
|
|
conditions = {}
|
|
@@ -407,7 +408,8 @@ def match_template(query, template_dict, json_folder,item_lexicon):
|
|
|
# found_item: 找到的实体词(如 "送出电量"),如果没找到则为 None
|
|
|
# found_item_key实体来源key
|
|
|
query_for_match, found_item ,found_item_key = replace_item_in_query(query, item_lexicon)
|
|
|
-
|
|
|
+ # 对 query_for_match 中的时间(年/月)进行归一化,变为“某年某月”避免时间影响匹配
|
|
|
+ query_for_match = replace_time_in_query(query_for_match)
|
|
|
# 计算 TF-IDF 相似度
|
|
|
vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
|
|
|
tfidf_matrix = vectorizer.fit_transform([query_for_match] + templates)
|
|
@@ -463,7 +465,7 @@ def match_template(query, template_dict, json_folder,item_lexicon):
|
|
|
# 从原始 query 中提取 item 实体值
|
|
|
extracted_item = extract_item_from_query(best_match_template, query)
|
|
|
|
|
|
-
|
|
|
+ print(found_item)
|
|
|
match_json = found_item_key + "_" + found_item[0]
|
|
|
template_info = load_template_info(match_json, json_folder)
|
|
|
|
|
@@ -485,6 +487,8 @@ def match_template(query, template_dict, json_folder,item_lexicon):
|
|
|
content = template_info.get("content", "")
|
|
|
# 动作类型
|
|
|
play = template_info.get("play", "")
|
|
|
+ # 问题序号
|
|
|
+ qcode = template_info.get("qcode", "")
|
|
|
# 返回最终结果
|
|
|
result = {
|
|
|
"conditions": conditions,
|
|
@@ -503,36 +507,34 @@ def match_template(query, template_dict, json_folder,item_lexicon):
|
|
|
"play": play,
|
|
|
"find_max": find_max,
|
|
|
"value_key": value_key,
|
|
|
- "name_key": name_key
|
|
|
+ "name_key": name_key,
|
|
|
+ "qcode": qcode
|
|
|
}
|
|
|
+
|
|
|
return result
|
|
|
|
|
|
|
|
|
# ==========================
|
|
|
# 测试部分(main 入口)
|
|
|
# ==========================
|
|
|
-if __name__ == "__main__":
|
|
|
- # query = "2024年省间交易电量按交易周期划分的电量是多少?"
|
|
|
- # query = "2024年省间交易电量月度交易电量是多少?"
|
|
|
- # query = "2024年全年累计省间交易电量是多少?"
|
|
|
- query = "那个省送出电量最高?是多少?"
|
|
|
-
|
|
|
- json_folder = "templatesJson_copy"
|
|
|
- result = match_template(query, template_dict, json_folder, item_lexicon)
|
|
|
-
|
|
|
- print("用户问句:", query)
|
|
|
- print("条件为:",result['conditions'])
|
|
|
- print("匹配的模板 key:", result['matched_key'])
|
|
|
- print("最相似的模板:", result['matched_template'])
|
|
|
- print("相似度分数:", result['similarity_score'])
|
|
|
- print("提取出的 item:", result['extracted_item'])
|
|
|
- print("提取出的template_info:", result['template_info'])
|
|
|
- print("类型:", result["type"])
|
|
|
- print("关键词:", result["keywords"])
|
|
|
- print("查询字段:", result["target"])
|
|
|
- print("模型名字", result["name"])
|
|
|
- print("条件", result["conditions"])
|
|
|
- print("返回的内容是:", result["content"])
|
|
|
- print("问句是:", result["query"])
|
|
|
- print("动作是:", result["play"])
|
|
|
+# if __name__ == "__main__":
|
|
|
+#
|
|
|
+# query = "2023年2月交易电量是多少?"
|
|
|
+#
|
|
|
+# json_folder = "templatesJson_copy"
|
|
|
+# result = process_query(query, template_dict, json_folder, item_lexicon)
|
|
|
+# print(result)
|
|
|
+# print("用户问句:", query)
|
|
|
+# print("条件为:",result['conditions'])
|
|
|
+# print("匹配的模板 key:", result['matched_key'])
|
|
|
+# print("最相似的模板:", result['matched_template'])
|
|
|
+# print("相似度分数:", result['similarity_score'])
|
|
|
+# print("类型:", result["type"])
|
|
|
+# print("关键词:", result["keywords"])
|
|
|
+# print("查询字段:", result["target"])
|
|
|
+# print("模型名字", result["name"])
|
|
|
+# print("条件", result["conditions"])
|
|
|
+# print("返回的内容是:", result["content"])
|
|
|
+# print("问句是:", result["query"])
|
|
|
+# print("动作是:", result["play"])
|
|
|
|