|
@@ -3,9 +3,6 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
|
import jieba
|
|
|
import os
|
|
|
import json
|
|
|
-from datetime import datetime
|
|
|
-from typing import Tuple, List, Dict
|
|
|
-import re
|
|
|
|
|
|
from final.ByRules.util import calculate_sum_by_time_range
|
|
|
|
|
@@ -224,155 +221,26 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
|
|
|
return time_results, locations
|
|
|
|
|
|
-# def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
|
|
|
-# current_date = datetime.now()
|
|
|
-# current_year = current_date.year
|
|
|
-# current_month = current_date.month
|
|
|
-#
|
|
|
-# # 匹配绝对时间
|
|
|
-# absolute_patterns = [
|
|
|
-# r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',
|
|
|
-# r'(?P<year>\d{4})年(?P<month>\d{1,2})月',
|
|
|
-# r'(?P<year>\d{4})年'
|
|
|
-# ]
|
|
|
-#
|
|
|
-# relative_year_mapping = {
|
|
|
-# '明年': current_year + 1,
|
|
|
-# '今年': current_year,
|
|
|
-# '去年': current_year - 1,
|
|
|
-# '前年': current_year - 2
|
|
|
-# }
|
|
|
-#
|
|
|
-# season_mapping = {
|
|
|
-# '一季度': (1, 3),
|
|
|
-# '二季度': (4, 6),
|
|
|
-# '三季度': (7, 9),
|
|
|
-# '四季度': (10, 12),
|
|
|
-# '上半年': (1, 6),
|
|
|
-# '下半年': (7, 12)
|
|
|
-# }
|
|
|
-#
|
|
|
-# provinces = [
|
|
|
-# '北京', '天津', '上海', '重庆', '河北', '山西', '辽宁', '吉林', '黑龙江',
|
|
|
-# '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
|
|
|
-# '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
|
|
|
-# '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门'
|
|
|
-# ]
|
|
|
-#
|
|
|
-# time_results = []
|
|
|
-# used_keywords = set()
|
|
|
-#
|
|
|
-# # 🆕 处理“起止时间段”
|
|
|
-# range_pattern = r'(?P<start>(\d{4}|今|去|前|明)年(\d{1,2})?月?)到(?P<end>(\d{4}|今|去|前|明)年(\d{1,2})?月?)'
|
|
|
-# for match in re.finditer(range_pattern, question):
|
|
|
-# start_raw, end_raw = match.group('start'), match.group('end')
|
|
|
-#
|
|
|
-# def parse_relative(text):
|
|
|
-# year = current_year
|
|
|
-# month = None
|
|
|
-# if '明年' in text:
|
|
|
-# year = current_year + 1
|
|
|
-# elif '今年' in text or '今' in text:
|
|
|
-# year = current_year
|
|
|
-# elif '去年' in text or '去' in text:
|
|
|
-# year = current_year - 1
|
|
|
-# elif '前年' in text or '前' in text:
|
|
|
-# year = current_year - 2
|
|
|
-# # 提取月份
|
|
|
-# m = re.search(r'(\d{1,2})月', text)
|
|
|
-# if m:
|
|
|
-# month = int(m.group(1))
|
|
|
-# return year, month
|
|
|
-#
|
|
|
-# def parse_absolute(text):
|
|
|
-# m = re.match(r'(?P<year>\d{4})年(?P<month>\d{1,2})?月?', text)
|
|
|
-# if m:
|
|
|
-# year = int(m.group('year'))
|
|
|
-# month = int(m.group('month')) if m.group('month') else None
|
|
|
-# return year, month
|
|
|
-# return None, None
|
|
|
-#
|
|
|
-# # 统一处理为 year/month
|
|
|
-# def parse_any(text):
|
|
|
-# if any(key in text for key in relative_year_mapping.keys()) or text[:1] in ['今', '去', '前', '明']:
|
|
|
-# return parse_relative(text)
|
|
|
-# else:
|
|
|
-# return parse_absolute(text)
|
|
|
-#
|
|
|
-# start_y, start_m = parse_any(start_raw)
|
|
|
-# end_y, end_m = parse_any(end_raw)
|
|
|
-# time_results.append({
|
|
|
-# 'start_year': start_y, 'start_month': start_m,
|
|
|
-# 'end_year': end_y, 'end_month': end_m,
|
|
|
-# 'label': f'{start_raw}到{end_raw}',
|
|
|
-# 'raw': match.group()
|
|
|
-# })
|
|
|
-# used_keywords.add(match.group())
|
|
|
-#
|
|
|
-# # 相对+具体月份
|
|
|
-# relative_absolute_pattern = r'(?P<relative>今|去|前)年(?P<month>\d{1,2})月'
|
|
|
-# for match in re.finditer(relative_absolute_pattern, question):
|
|
|
-# if match.group() in used_keywords:
|
|
|
-# continue
|
|
|
-# rel = match.group('relative')
|
|
|
-# month = int(match.group('month'))
|
|
|
-# year = {'今': current_year, '去': current_year - 1, '前': current_year - 2}.get(rel, current_year)
|
|
|
-# time_results.append({'year': year, 'month': month, 'raw': match.group()})
|
|
|
-# used_keywords.add(match.group())
|
|
|
-#
|
|
|
-# # 绝对时间
|
|
|
-# for pattern in absolute_patterns:
|
|
|
-# for match in re.finditer(pattern, question):
|
|
|
-# if match.group() in used_keywords:
|
|
|
-# continue
|
|
|
-# time_info = {'raw': match.group()}
|
|
|
-# gd = match.groupdict()
|
|
|
-# if gd.get('year'):
|
|
|
-# time_info['year'] = int(gd['year'])
|
|
|
-# if gd.get('month'):
|
|
|
-# time_info['month'] = int(gd['month'])
|
|
|
-# if gd.get('day'):
|
|
|
-# time_info['day'] = int(gd['day'])
|
|
|
-# time_results.append(time_info)
|
|
|
-# used_keywords.add(match.group())
|
|
|
-#
|
|
|
-# # 单独的相对年份关键词
|
|
|
-# for term, year in relative_year_mapping.items():
|
|
|
-# if term in question and term not in used_keywords:
|
|
|
-# time_results.append({'year': year, 'label': term, 'raw': term})
|
|
|
-# used_keywords.add(term)
|
|
|
-#
|
|
|
-# # 当前/上个月
|
|
|
-# if '当前' in question and '当前' not in used_keywords:
|
|
|
-# time_results.append({'year': current_year, 'month': current_month, 'label': '当前', 'raw': '当前'})
|
|
|
-# used_keywords.add('当前')
|
|
|
-# if '上个月' in question and '上个月' not in used_keywords:
|
|
|
-# prev_year = current_year if current_month > 1 else current_year - 1
|
|
|
-# prev_month = current_month - 1 if current_month > 1 else 12
|
|
|
-# time_results.append({'year': prev_year, 'month': prev_month, 'label': '上个月', 'raw': '上个月'})
|
|
|
-# used_keywords.add('上个月')
|
|
|
-#
|
|
|
-# # 季度和半年
|
|
|
-# for term, (start_month, end_month) in season_mapping.items():
|
|
|
-# if term in question and term not in used_keywords:
|
|
|
-# time_results.append({
|
|
|
-# 'year': current_year,
|
|
|
-# 'label': term,
|
|
|
-# 'start_month': start_month,
|
|
|
-# 'end_month': end_month,
|
|
|
-# 'raw': term
|
|
|
-# })
|
|
|
-# used_keywords.add(term)
|
|
|
-#
|
|
|
-# # 地点识别
|
|
|
-# locations = [p for p in provinces if p in question]
|
|
|
-#
|
|
|
-# return time_results, locations
|
|
|
# 先用 extract_time_location 判断问句包含哪类时间信息,然后只对结构匹配的模板子集做余弦匹配。
|
|
|
+# def classify_by_time_type(query, time_info):
|
|
|
+# if any('start_year' in t and 'end_year' in t for t in time_info):
|
|
|
+# return ['3'] # 时间段
|
|
|
+# return list(template_dict.keys()) # fallback 所有模板
|
|
|
def classify_by_time_type(query, time_info):
    """Choose the candidate template keys that fit a query's time structure.

    Only the first entry of ``time_info`` is inspected.

    Args:
        query: The raw question text. Not consulted by the current rules;
            kept so the signature stays stable for callers.
        time_info: List of dicts describing the time expressions found in
            the question. An entry holding both ``start_year`` and
            ``end_year`` is treated as a time range; an entry holding both
            ``year`` and ``month`` is treated as a single month.

    Returns:
        A list of template keys (strings) to restrict matching to. When the
        first entry matches none of the specific shapes, every key of the
        module-level ``template_dict`` is returned so the caller always gets
        an iterable of candidates instead of ``None``.
    """
    if not time_info:
        # No time information at all: restrict to templates 19-23.
        return ['19', '20', '21', '22', '23']

    first = time_info[0]

    # Case 1: both a start and an end year are present -> time range.
    if 'start_year' in first and 'end_year' in first:
        # Cumulative traded electricity from one month to another in a year.
        return ['3']

    # Case 2: year and month are both present -> precise to the month.
    if 'year' in first and 'month' in first:
        # Traded electricity for a given year and month.
        return ['2']

    # Fallback (e.g. year-only entries): consider all templates rather than
    # implicitly returning None, which the caller cannot iterate over.
    return list(template_dict)
|
|
|
+
|
|
|
def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
|
|
|
"""
|
|
|
先基于时间信息筛选候选模板,再进行TF-IDF匹配。
|
|
@@ -385,7 +253,6 @@ def match_template_with_time_filter(query, template_dict, tokenizer, extract_tim
|
|
|
|
|
|
# 构造候选子模板字典
|
|
|
filtered_template_dict = {k: template_dict[k] for k in candidate_keys}
|
|
|
-
|
|
|
# 使用你原来的 TF-IDF 匹配函数
|
|
|
return match_template(query, filtered_template_dict, tokenizer)
|
|
|
# 找相似度最高的模板
|
|
@@ -455,10 +322,6 @@ def load_template_info(matched_key, json_folder):
|
|
|
def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
# 提取条件
|
|
|
time_info, location_info = extract_time_location(query)
|
|
|
- print(time_info)
|
|
|
- candidate_ids = classify_by_time_type(query, time_info)
|
|
|
- # 构造候选模板
|
|
|
- candidates = [template_dict[k][0] for k in candidate_ids]
|
|
|
|
|
|
conditions = {}
|
|
|
# 匹配模板
|
|
@@ -469,7 +332,7 @@ def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
extract_time_location_func=extract_time_location
|
|
|
)
|
|
|
# 定义阈值
|
|
|
- similarity_threshold = 0.4
|
|
|
+ similarity_threshold = 0.3
|
|
|
# ★ 判断相似度阈值
|
|
|
if score < similarity_threshold:
|
|
|
return {
|
|
@@ -487,11 +350,19 @@ def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
|
|
|
}
|
|
|
|
|
|
if time_info:
|
|
|
- year = time_info[0].get('year')
|
|
|
- if year:
|
|
|
- conditions['年'] = year
|
|
|
- if 'month' in time_info[0]:
|
|
|
- conditions['月'] = time_info[0]['month']
|
|
|
+ ti = time_info[0]
|
|
|
+ # 先判断是否是区间时间(有start_year/end_year等字段)
|
|
|
+ if 'start_year' in ti and 'end_year' in ti:
|
|
|
+ conditions['start_year'] = ti.get('start_year')
|
|
|
+ conditions['start_month'] = ti.get('start_month')
|
|
|
+ conditions['end_year'] = ti.get('end_year')
|
|
|
+ conditions['end_month'] = ti.get('end_month')
|
|
|
+ else:
|
|
|
+ # 单时间点
|
|
|
+ if 'year' in ti:
|
|
|
+ conditions['年'] = ti['year']
|
|
|
+ if 'month' in ti:
|
|
|
+ conditions['月'] = ti['month']
|
|
|
|
|
|
if location_info:
|
|
|
unit = map_location_to_unit(location_info[0])
|
|
@@ -592,32 +463,42 @@ def find_key_recursively(data, target_key):
|
|
|
return results
|
|
|
# query = "当月省间交易完成的交易是多少?"
|
|
|
# query = "2024年1月到2月累计交易电量是多少?"
|
|
|
-# # query = "但同样阿贾克斯大口径的话我可合金外壳设计文件突然发?"
|
|
|
-# json_folder = "templatesJson"
|
|
|
-#
|
|
|
-#
|
|
|
-# result = process_query(query, template_dict, json_folder)
|
|
|
-#
|
|
|
-# print("匹配的模板 key:", result["matched_key"])
|
|
|
-# print("最相似的模板句:", result["matched_template"])
|
|
|
-# print("相似度分数:", result["similarity_score"])
|
|
|
-# print("类型:", result["type"])
|
|
|
-# print("关键词:", result["keywords"])
|
|
|
-# print("查询字段:", result["target"])
|
|
|
-# print("模型名字", result["name"])
|
|
|
-# print("条件", result["conditions"])
|
|
|
-# print("返回的内容是:", result["content"])
|
|
|
-# print("问句是:", result["query"])
|
|
|
-# print("动作是:", result["play"])
|
|
|
-
|
|
|
-# type = result["type"]
|
|
|
-# content = result["content"]
|
|
|
-#
|
|
|
-# json_data_folder = "..\Json\json_data"
|
|
|
-# if type == "query":
|
|
|
-# result = smart_find_value(json_data_folder, result["name"],result["conditions"],result["target"] )
|
|
|
-# elif type == "calculate":
|
|
|
-# result = calculate_sum_by_time_range(json_data_folder, result["name"], result["target"],)
|
|
|
# Ad-hoc demo run: push one sample question through the pipeline and print
# every field of the result.
# NOTE(review): this executes at import time; consider moving it under an
# `if __name__ == "__main__":` guard if this module is imported elsewhere.
query = "2024年12月交易电量是多少?"
# query = "但同样阿贾克斯大口径的话我可合金外壳设计文件突然发?"
json_folder = "templatesJson"

result = process_query(query, template_dict, json_folder)

print("匹配的模板 key:", result["matched_key"])
print("最相似的模板句:", result["matched_template"])
print("相似度分数:", result["similarity_score"])
print("类型:", result["type"])
print("关键词:", result["keywords"])
print("查询字段:", result["target"])
print("模型名字", result["name"])
print("条件", result["conditions"])
print("返回的内容是:", result["content"])
print("问句是:", result["query"])
print("动作是:", result["play"])

# Named `result_type` rather than `type` so the builtin is not shadowed.
result_type = result["type"]
content = result["content"]

# Raw string: the backslashes are literal path separators, not escapes.
json_data_folder = r"..\Json\json_data"
if result_type == "query":
    # NOTE(review): no ".json" suffix is appended here, unlike the
    # "calculate" branch below — confirm what smart_find_value expects.
    fileName = result["dataJsonName"]
    result = smart_find_value(
        json_data_folder, fileName, result["conditions"], result["target"]
    )
    print(result)
elif result_type == "calculate":
    conditions = result["conditions"]
    # Split the flat start_*/end_* condition keys into two dicts keyed by
    # '年' (year) and '月' (month).
    start_conditions = {
        ('年' if 'year' in k else '月'): v
        for k, v in conditions.items()
        if k.startswith('start_')
    }
    end_conditions = {
        ('年' if 'year' in k else '月'): v
        for k, v in conditions.items()
        if k.startswith('end_')
    }
    print(start_conditions)
    print(end_conditions)
    fileName = result["dataJsonName"] + ".json"
    result = calculate_sum_by_time_range(
        json_data_folder, fileName, result["target"],
        start_conditions, end_conditions,
    )
    print(result)
|
|
|
#
|
|
|
# # 最终回答的文本
|
|
|
# final_content = content.replace("?", str(result))
|