|
@@ -6,12 +6,17 @@ import json
|
|
|
from datetime import datetime
|
|
|
from typing import Tuple, List, Dict
|
|
|
import re
|
|
|
+
|
|
|
+from final.ByRules.util import calculate_sum_by_time_range
|
|
|
+
|
|
|
+
|
|
|
def jieba_tokenizer(text):
    """Tokenize ``text`` with jieba and return the tokens as a list."""
    # jieba.cut yields tokens lazily; unpack them into a concrete list.
    return [*jieba.cut(text)]
|
|
|
# 定义问题模板
|
|
|
template_dict = {
|
|
|
"1": ["某年全年累计省间交易电量是多少?"],
|
|
|
"2": ["某年某月交易电量是多少?"],
|
|
|
+ "3": ["某年某月到某月累计交易电量是多少?"],
|
|
|
"8.1": ["某年省间交易电量按交易周期划分的电量是多少?"],
|
|
|
"8.2": ["某年省间交易电量按交易类型划分的电量是多少?"],
|
|
|
"8.3": ["某年省间交易电量按发电类型划分的电量是多少?"],
|
|
@@ -52,11 +57,16 @@ def map_location_to_unit(location: str) -> str:
|
|
|
return code
|
|
|
return '未知单位'
|
|
|
# 提取时间和地点
|
|
|
+from typing import Tuple, List, Dict
|
|
|
+from datetime import datetime
|
|
|
+import re
|
|
|
+
|
|
|
def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
current_date = datetime.now()
|
|
|
current_year = current_date.year
|
|
|
current_month = current_date.month
|
|
|
|
|
|
+ # 匹配绝对时间
|
|
|
absolute_patterns = [
|
|
|
r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',
|
|
|
r'(?P<year>\d{4})年(?P<month>\d{1,2})月',
|
|
@@ -79,19 +89,96 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
'下半年': (7, 12)
|
|
|
}
|
|
|
|
|
|
+ provinces = [
|
|
|
+ '北京', '天津', '上海', '重庆', '河北', '山西', '辽宁', '吉林', '黑龙江',
|
|
|
+ '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
|
|
|
+ '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
|
|
|
+ '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门'
|
|
|
+ ]
|
|
|
+
|
|
|
time_results = []
|
|
|
+ used_keywords = set()
|
|
|
+
|
|
|
+ # 🆕 处理“起止时间段”,格式:2023年1月到2024年2月、去年1月到今年2月、2023年1月到1月等
|
|
|
+ range_pattern = r'(?P<start>(\d{4}|今|去|前|明)年(\d{1,2})?月?)到(?P<end>(\d{4}|今|去|前|明)年(\d{1,2})?月?)'
|
|
|
+ for match in re.finditer(range_pattern, question):
|
|
|
+ start_raw, end_raw = match.group('start'), match.group('end')
|
|
|
|
|
|
- # 处理“去年12月”等相对年份+月份的组合
|
|
|
+ def parse_relative(text):
|
|
|
+ year = current_year
|
|
|
+ month = None
|
|
|
+ if '明年' in text:
|
|
|
+ year = current_year + 1
|
|
|
+ elif '今年' in text or '今' in text:
|
|
|
+ year = current_year
|
|
|
+ elif '去年' in text or '去' in text:
|
|
|
+ year = current_year - 1
|
|
|
+ elif '前年' in text or '前' in text:
|
|
|
+ year = current_year - 2
|
|
|
+ m = re.search(r'(\d{1,2})月', text)
|
|
|
+ if m:
|
|
|
+ month = int(m.group(1))
|
|
|
+ return year, month
|
|
|
+
|
|
|
+ def parse_absolute(text):
|
|
|
+ m = re.match(r'(?P<year>\d{4})年(?P<month>\d{1,2})?月?', text)
|
|
|
+ if m:
|
|
|
+ year = int(m.group('year'))
|
|
|
+ month = int(m.group('month')) if m.group('month') else None
|
|
|
+ return year, month
|
|
|
+ return None, None
|
|
|
+
|
|
|
+ def parse_any(text):
|
|
|
+ if any(key in text for key in relative_year_mapping.keys()) or text[:1] in ['今', '去', '前', '明']:
|
|
|
+ return parse_relative(text)
|
|
|
+ else:
|
|
|
+ return parse_absolute(text)
|
|
|
+
|
|
|
+ start_y, start_m = parse_any(start_raw)
|
|
|
+ end_y, end_m = parse_any(end_raw)
|
|
|
+ time_results.append({
|
|
|
+ 'start_year': start_y, 'start_month': start_m,
|
|
|
+ 'end_year': end_y, 'end_month': end_m,
|
|
|
+ 'label': f'{start_raw}到{end_raw}',
|
|
|
+ 'raw': match.group()
|
|
|
+ })
|
|
|
+ used_keywords.add(match.group())
|
|
|
+
|
|
|
+ # 🆕 新增匹配“2024年1月到2月”,结束时间没有写年份,默认与开始时间同年
|
|
|
+ partial_range_pattern = r'(?P<year>\d{4})年(?P<start_month>\d{1,2})月到(?P<end_month>\d{1,2})月'
|
|
|
+ for match in re.finditer(partial_range_pattern, question):
|
|
|
+ # 避免重复匹配已经被上面时间段匹配使用过的字符串
|
|
|
+ if match.group() in used_keywords:
|
|
|
+ continue
|
|
|
+ year = int(match.group('year'))
|
|
|
+ start_month = int(match.group('start_month'))
|
|
|
+ end_month = int(match.group('end_month'))
|
|
|
+ time_results.append({
|
|
|
+ 'start_year': year,
|
|
|
+ 'start_month': start_month,
|
|
|
+ 'end_year': year,
|
|
|
+ 'end_month': end_month,
|
|
|
+ 'label': match.group(),
|
|
|
+ 'raw': match.group()
|
|
|
+ })
|
|
|
+ used_keywords.add(match.group())
|
|
|
+
|
|
|
+ # 相对+具体月份
|
|
|
relative_absolute_pattern = r'(?P<relative>今|去|前)年(?P<month>\d{1,2})月'
|
|
|
for match in re.finditer(relative_absolute_pattern, question):
|
|
|
+ if match.group() in used_keywords:
|
|
|
+ continue
|
|
|
rel = match.group('relative')
|
|
|
month = int(match.group('month'))
|
|
|
year = {'今': current_year, '去': current_year - 1, '前': current_year - 2}.get(rel, current_year)
|
|
|
time_results.append({'year': year, 'month': month, 'raw': match.group()})
|
|
|
+ used_keywords.add(match.group())
|
|
|
|
|
|
- # 绝对时间匹配
|
|
|
+ # 绝对时间
|
|
|
for pattern in absolute_patterns:
|
|
|
for match in re.finditer(pattern, question):
|
|
|
+ if match.group() in used_keywords:
|
|
|
+ continue
|
|
|
time_info = {'raw': match.group()}
|
|
|
gd = match.groupdict()
|
|
|
if gd.get('year'):
|
|
@@ -100,33 +187,28 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
time_info['month'] = int(gd['month'])
|
|
|
if gd.get('day'):
|
|
|
time_info['day'] = int(gd['day'])
|
|
|
- if time_info not in time_results:
|
|
|
- time_results.append(time_info)
|
|
|
-
|
|
|
- # 记录已匹配的相对时间词,避免重复
|
|
|
- used_relatives = {tr['raw'] for tr in time_results if 'label' in tr or tr['raw'] in relative_year_mapping}
|
|
|
+ time_results.append(time_info)
|
|
|
+ used_keywords.add(match.group())
|
|
|
|
|
|
- # 相对年份(“前年”“去年”“今年”“明年”)
|
|
|
+ # 单独的相对年份关键词
|
|
|
for term, year in relative_year_mapping.items():
|
|
|
- if term in question and term not in used_relatives:
|
|
|
+ if term in question and term not in used_keywords:
|
|
|
time_results.append({'year': year, 'label': term, 'raw': term})
|
|
|
+ used_keywords.add(term)
|
|
|
|
|
|
- # 新增:相对月份处理
|
|
|
- if '当前' in question:
|
|
|
+ # 当前/上个月
|
|
|
+ if '当前' in question and '当前' not in used_keywords:
|
|
|
time_results.append({'year': current_year, 'month': current_month, 'label': '当前', 'raw': '当前'})
|
|
|
- if '上个月' in question:
|
|
|
- # 计算上个月年月
|
|
|
- if current_month == 1:
|
|
|
- year = current_year - 1
|
|
|
- month = 12
|
|
|
- else:
|
|
|
- year = current_year
|
|
|
- month = current_month - 1
|
|
|
- time_results.append({'year': year, 'month': month, 'label': '上个月', 'raw': '上个月'})
|
|
|
-
|
|
|
- # 季度或半年
|
|
|
+ used_keywords.add('当前')
|
|
|
+ if '上个月' in question and '上个月' not in used_keywords:
|
|
|
+ prev_year = current_year if current_month > 1 else current_year - 1
|
|
|
+ prev_month = current_month - 1 if current_month > 1 else 12
|
|
|
+ time_results.append({'year': prev_year, 'month': prev_month, 'label': '上个月', 'raw': '上个月'})
|
|
|
+ used_keywords.add('上个月')
|
|
|
+
|
|
|
+ # 季度和半年
|
|
|
for term, (start_month, end_month) in season_mapping.items():
|
|
|
- if term in question:
|
|
|
+ if term in question and term not in used_keywords:
|
|
|
time_results.append({
|
|
|
'year': current_year,
|
|
|
'label': term,
|
|
@@ -134,15 +216,177 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
'end_month': end_month,
|
|
|
'raw': term
|
|
|
})
|
|
|
+ used_keywords.add(term)
|
|
|
|
|
|
- provinces = ['北京', '天津', '上海', '重庆', '河北', '山西', '辽宁', '吉林', '黑龙江',
|
|
|
- '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
|
|
|
- '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
|
|
|
- '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门']
|
|
|
-
|
|
|
+ # 地点识别
|
|
|
locations = [p for p in provinces if p in question]
|
|
|
|
|
|
return time_results, locations
|
|
|
+
|
|
|
+# def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
+# current_date = datetime.now()
|
|
|
+# current_year = current_date.year
|
|
|
+# current_month = current_date.month
|
|
|
+#
|
|
|
+# # 匹配绝对时间
|
|
|
+# absolute_patterns = [
|
|
|
+# r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',
|
|
|
+# r'(?P<year>\d{4})年(?P<month>\d{1,2})月',
|
|
|
+# r'(?P<year>\d{4})年'
|
|
|
+# ]
|
|
|
+#
|
|
|
+# relative_year_mapping = {
|
|
|
+# '明年': current_year + 1,
|
|
|
+# '今年': current_year,
|
|
|
+# '去年': current_year - 1,
|
|
|
+# '前年': current_year - 2
|
|
|
+# }
|
|
|
+#
|
|
|
+# season_mapping = {
|
|
|
+# '一季度': (1, 3),
|
|
|
+# '二季度': (4, 6),
|
|
|
+# '三季度': (7, 9),
|
|
|
+# '四季度': (10, 12),
|
|
|
+# '上半年': (1, 6),
|
|
|
+# '下半年': (7, 12)
|
|
|
+# }
|
|
|
+#
|
|
|
+# provinces = [
|
|
|
+# '北京', '天津', '上海', '重庆', '河北', '山西', '辽宁', '吉林', '黑龙江',
|
|
|
+# '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
|
|
|
+# '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
|
|
|
+# '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门'
|
|
|
+# ]
|
|
|
+#
|
|
|
+# time_results = []
|
|
|
+# used_keywords = set()
|
|
|
+#
|
|
|
+# # 🆕 处理“起止时间段”
|
|
|
+# range_pattern = r'(?P<start>(\d{4}|今|去|前|明)年(\d{1,2})?月?)到(?P<end>(\d{4}|今|去|前|明)年(\d{1,2})?月?)'
|
|
|
+# for match in re.finditer(range_pattern, question):
|
|
|
+# start_raw, end_raw = match.group('start'), match.group('end')
|
|
|
+#
|
|
|
+# def parse_relative(text):
|
|
|
+# year = current_year
|
|
|
+# month = None
|
|
|
+# if '明年' in text:
|
|
|
+# year = current_year + 1
|
|
|
+# elif '今年' in text or '今' in text:
|
|
|
+# year = current_year
|
|
|
+# elif '去年' in text or '去' in text:
|
|
|
+# year = current_year - 1
|
|
|
+# elif '前年' in text or '前' in text:
|
|
|
+# year = current_year - 2
|
|
|
+# # 提取月份
|
|
|
+# m = re.search(r'(\d{1,2})月', text)
|
|
|
+# if m:
|
|
|
+# month = int(m.group(1))
|
|
|
+# return year, month
|
|
|
+#
|
|
|
+# def parse_absolute(text):
|
|
|
+# m = re.match(r'(?P<year>\d{4})年(?P<month>\d{1,2})?月?', text)
|
|
|
+# if m:
|
|
|
+# year = int(m.group('year'))
|
|
|
+# month = int(m.group('month')) if m.group('month') else None
|
|
|
+# return year, month
|
|
|
+# return None, None
|
|
|
+#
|
|
|
+# # 统一处理为 year/month
|
|
|
+# def parse_any(text):
|
|
|
+# if any(key in text for key in relative_year_mapping.keys()) or text[:1] in ['今', '去', '前', '明']:
|
|
|
+# return parse_relative(text)
|
|
|
+# else:
|
|
|
+# return parse_absolute(text)
|
|
|
+#
|
|
|
+# start_y, start_m = parse_any(start_raw)
|
|
|
+# end_y, end_m = parse_any(end_raw)
|
|
|
+# time_results.append({
|
|
|
+# 'start_year': start_y, 'start_month': start_m,
|
|
|
+# 'end_year': end_y, 'end_month': end_m,
|
|
|
+# 'label': f'{start_raw}到{end_raw}',
|
|
|
+# 'raw': match.group()
|
|
|
+# })
|
|
|
+# used_keywords.add(match.group())
|
|
|
+#
|
|
|
+# # 相对+具体月份
|
|
|
+# relative_absolute_pattern = r'(?P<relative>今|去|前)年(?P<month>\d{1,2})月'
|
|
|
+# for match in re.finditer(relative_absolute_pattern, question):
|
|
|
+# if match.group() in used_keywords:
|
|
|
+# continue
|
|
|
+# rel = match.group('relative')
|
|
|
+# month = int(match.group('month'))
|
|
|
+# year = {'今': current_year, '去': current_year - 1, '前': current_year - 2}.get(rel, current_year)
|
|
|
+# time_results.append({'year': year, 'month': month, 'raw': match.group()})
|
|
|
+# used_keywords.add(match.group())
|
|
|
+#
|
|
|
+# # 绝对时间
|
|
|
+# for pattern in absolute_patterns:
|
|
|
+# for match in re.finditer(pattern, question):
|
|
|
+# if match.group() in used_keywords:
|
|
|
+# continue
|
|
|
+# time_info = {'raw': match.group()}
|
|
|
+# gd = match.groupdict()
|
|
|
+# if gd.get('year'):
|
|
|
+# time_info['year'] = int(gd['year'])
|
|
|
+# if gd.get('month'):
|
|
|
+# time_info['month'] = int(gd['month'])
|
|
|
+# if gd.get('day'):
|
|
|
+# time_info['day'] = int(gd['day'])
|
|
|
+# time_results.append(time_info)
|
|
|
+# used_keywords.add(match.group())
|
|
|
+#
|
|
|
+# # 单独的相对年份关键词
|
|
|
+# for term, year in relative_year_mapping.items():
|
|
|
+# if term in question and term not in used_keywords:
|
|
|
+# time_results.append({'year': year, 'label': term, 'raw': term})
|
|
|
+# used_keywords.add(term)
|
|
|
+#
|
|
|
+# # 当前/上个月
|
|
|
+# if '当前' in question and '当前' not in used_keywords:
|
|
|
+# time_results.append({'year': current_year, 'month': current_month, 'label': '当前', 'raw': '当前'})
|
|
|
+# used_keywords.add('当前')
|
|
|
+# if '上个月' in question and '上个月' not in used_keywords:
|
|
|
+# prev_year = current_year if current_month > 1 else current_year - 1
|
|
|
+# prev_month = current_month - 1 if current_month > 1 else 12
|
|
|
+# time_results.append({'year': prev_year, 'month': prev_month, 'label': '上个月', 'raw': '上个月'})
|
|
|
+# used_keywords.add('上个月')
|
|
|
+#
|
|
|
+# # 季度和半年
|
|
|
+# for term, (start_month, end_month) in season_mapping.items():
|
|
|
+# if term in question and term not in used_keywords:
|
|
|
+# time_results.append({
|
|
|
+# 'year': current_year,
|
|
|
+# 'label': term,
|
|
|
+# 'start_month': start_month,
|
|
|
+# 'end_month': end_month,
|
|
|
+# 'raw': term
|
|
|
+# })
|
|
|
+# used_keywords.add(term)
|
|
|
+#
|
|
|
+# # 地点识别
|
|
|
+# locations = [p for p in provinces if p in question]
|
|
|
+#
|
|
|
+# return time_results, locations
|
|
|
+# 先用 extract_time_location 判断问句包含哪类时间信息,然后只对结构匹配的模板子集做余弦匹配。
|
|
|
def classify_by_time_type(query, time_info):
    """Choose which template keys are worth matching for a question.

    Args:
        query: The user's question. Currently unused; kept so existing
            callers do not have to change.
        time_info: Time entries as returned by ``extract_time_location``
            (a list of dicts). ``None`` is treated like an empty list.

    Returns:
        ``['3']`` (the time-range template) when any entry carries both
        ``start_year`` and ``end_year``; otherwise every key of the
        module-level ``template_dict``.
    """
    # Range entries are the only ones that carry both boundary years;
    # single-point entries use a plain 'year' key instead.
    for entry in time_info or ():
        if 'start_year' in entry and 'end_year' in entry:
            return ['3']
    # Fallback: no range detected, so every template stays a candidate.
    return list(template_dict)
|
|
|
def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
    """Narrow the templates by the question's time structure, then match.

    Args:
        query: The user's question.
        template_dict: Mapping of template key to its template sentences.
        tokenizer: Tokenizer forwarded unchanged to ``match_template``.
        extract_time_location_func: Callable that takes ``query`` and
            returns a ``(time_info, locations)`` pair.

    Returns:
        Whatever ``match_template`` returns for the narrowed candidates
        (``process_query`` unpacks it as key, sentence, score).
    """
    # Extract the time expressions; the locations are not needed here.
    time_info, _ = extract_time_location_func(query)

    # Template keys that fit the kind of time information found.
    candidate_keys = classify_by_time_type(query, time_info)

    # The candidate keys come from a hard-coded '3' or from the module-level
    # template_dict, so the dict passed in here may not contain all of them.
    # Skip missing keys instead of raising KeyError.
    candidates = {
        key: template_dict[key]
        for key in candidate_keys
        if key in template_dict
    }
    if not candidates:
        # Nothing survived the filter: match against every template
        # rather than against an empty set.
        candidates = template_dict

    # Reuse the existing similarity matcher on the narrowed set.
    return match_template(query, candidates, tokenizer)
|
|
|
# 找相似度最高的模板
|
|
|
def match_template(query, template_dict, tokenizer):
|
|
|
"""
|
|
@@ -210,9 +454,19 @@ def load_template_info(matched_key, json_folder):
|
|
|
def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
# 提取条件
|
|
|
time_info, location_info = extract_time_location(query)
|
|
|
+ print(time_info)
|
|
|
+ candidate_ids = classify_by_time_type(query, time_info)
|
|
|
+ # 构造候选模板
|
|
|
+ candidates = [template_dict[k][0] for k in candidate_ids]
|
|
|
+
|
|
|
conditions = {}
|
|
|
# 匹配模板
|
|
|
- matched_key, best_sentence, score = match_template(query, template_dict, tokenizer)
|
|
|
+ matched_key, best_sentence, score = match_template_with_time_filter(
|
|
|
+ query,
|
|
|
+ template_dict,
|
|
|
+ tokenizer,
|
|
|
+ extract_time_location_func=extract_time_location
|
|
|
+ )
|
|
|
# 定义阈值
|
|
|
similarity_threshold = 0.4
|
|
|
# ★ 判断相似度阈值
|
|
@@ -328,28 +582,37 @@ def find_key_recursively(data, target_key):
|
|
|
_search(data)
|
|
|
return results
|
|
|
if __name__ == "__main__":
    # Manual smoke test. Guarded so that importing this module does not
    # call process_query or print anything; running the file as a script
    # behaves as before.
    # query = "当月省间交易完成的交易是多少?"
    query = "2024年1月到2月累计交易电量是多少?"
    # query = "但同样阿贾克斯大口径的话我可合金外壳设计文件突然发?"
    json_folder = "templatesJson"

    result = process_query(query, template_dict, json_folder)

    print("匹配的模板 key:", result["matched_key"])
    print("最相似的模板句:", result["matched_template"])
    print("相似度分数:", result["similarity_score"])
    print("类型:", result["type"])
    print("关键词:", result["keywords"])
    print("查询字段:", result["target"])
    print("模型名字", result["name"])
    print("条件", result["conditions"])
    print("返回的内容是:", result["content"])
    print("问句是:", result["query"])
    print("动作是:", result["play"])

    # Draft of the follow-up lookup/calculation step (not enabled yet):
    # type = result["type"]
    # content = result["content"]
    #
    # json_data_folder = "..\Json\json_data"
    # if type == "query":
    #     result = smart_find_value(json_data_folder, result["name"],result["conditions"],result["target"] )
    # elif type == "calculate":
    #     result = calculate_sum_by_time_range(json_data_folder, result["name"], result["target"],)
    #
    # # Text of the final answer
    # final_content = content.replace("?", str(result))
    # # print(f"{content}{result}")
    #
    # print(final_content)
|
|
|
+
|