Browse Source

新增计算

zfrr 2 months ago
parent
commit
bd71c06a14

BIN
final/ByRules/__pycache__/similarity_answer_json.cpython-39.pyc


+ 0 - 41
final/ByRules/answer.py

@@ -164,47 +164,6 @@ def map_location_to_unit(location: str) -> str:
             return code
     return '未知单位'
 
-
-# 查询
-# def smart_find_value(folder_path, file_name, conditions: dict, target_key: str):
-#     file_path = os.path.join(folder_path, file_name)
-#
-#     if not os.path.exists(file_path):
-#         print(f"文件 {file_path} 不存在")
-#         return None
-#
-#     with open(file_path, 'r', encoding='utf-8') as f:
-#         try:
-#             data = json.load(f)
-#         except json.JSONDecodeError as e:
-#             print(f"JSON 解析失败:{e}")
-#             return None
-#
-#     # 情况一:数据是字典,直接匹配
-#     if isinstance(data, dict):
-#         if not conditions:
-#             return data.get(target_key, None)
-#         if all(data.get(k) == v for k, v in conditions.items()):
-#             return data.get(target_key, None)
-#         return None
-#
-#     # 情况二:数据是列表,逐条匹配
-#     elif isinstance(data, list):
-#         results = []
-#         for record in data:
-#             if isinstance(record, dict) and all(record.get(k) == v for k, v in conditions.items()):
-#                 results.append(record.get(target_key))
-#         if not results:
-#             return None
-#         elif len(results) == 1:
-#             return results[0]
-#         else:
-#             return results
-#
-#     else:
-#         print(f"未知数据结构:{type(data)}")
-#         return None
-
 def find_key_recursively(data, target_key):
     results = []
 

+ 309 - 46
final/ByRules/similarity_answer_json.py

@@ -6,12 +6,17 @@ import json
 from datetime import datetime
 from typing import Tuple, List, Dict
 import re
+
+from final.ByRules.util import calculate_sum_by_time_range
+
+
 def jieba_tokenizer(text):
     return list(jieba.cut(text))
 # 定义问题模板
 template_dict = {
     "1": ["某年全年累计省间交易电量是多少?"],
     "2": ["某年某月交易电量是多少?"],
+    "3": ["某年某月到某月累计交易电量是多少?"],
     "8.1": ["某年省间交易电量按交易周期划分的电量是多少?"],
     "8.2": ["某年省间交易电量按交易类型划分的电量是多少?"],
     "8.3": ["某年省间交易电量按发电类型划分的电量是多少?"],
@@ -52,11 +57,16 @@ def map_location_to_unit(location: str) -> str:
             return code
     return '未知单位'
 # 提取时间和地点
+from typing import Tuple, List, Dict
+from datetime import datetime
+import re
+
 def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
     current_date = datetime.now()
     current_year = current_date.year
     current_month = current_date.month
 
+    # 匹配绝对时间
     absolute_patterns = [
         r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',
         r'(?P<year>\d{4})年(?P<month>\d{1,2})月',
@@ -79,19 +89,96 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
         '下半年': (7, 12)
     }
 
+    provinces = [
+        '北京', '天津', '上海', '重庆', '河北', '山西', '辽宁', '吉林', '黑龙江',
+        '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
+        '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
+        '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门'
+    ]
+
     time_results = []
+    used_keywords = set()
+
+    # 🆕 处理“起止时间段”,格式:2023年1月到2024年2月、去年1月到今年2月、2023年1月到1月等
+    range_pattern = r'(?P<start>(\d{4}|今|去|前|明)年(\d{1,2})?月?)到(?P<end>(\d{4}|今|去|前|明)年(\d{1,2})?月?)'
+    for match in re.finditer(range_pattern, question):
+        start_raw, end_raw = match.group('start'), match.group('end')
 
-    # 处理“去年12月”等相对年份+月份的组合
+        def parse_relative(text):
+            year = current_year
+            month = None
+            if '明年' in text:
+                year = current_year + 1
+            elif '今年' in text or '今' in text:
+                year = current_year
+            elif '去年' in text or '去' in text:
+                year = current_year - 1
+            elif '前年' in text or '前' in text:
+                year = current_year - 2
+            m = re.search(r'(\d{1,2})月', text)
+            if m:
+                month = int(m.group(1))
+            return year, month
+
+        def parse_absolute(text):
+            m = re.match(r'(?P<year>\d{4})年(?P<month>\d{1,2})?月?', text)
+            if m:
+                year = int(m.group('year'))
+                month = int(m.group('month')) if m.group('month') else None
+                return year, month
+            return None, None
+
+        def parse_any(text):
+            if any(key in text for key in relative_year_mapping.keys()) or text[:1] in ['今', '去', '前', '明']:
+                return parse_relative(text)
+            else:
+                return parse_absolute(text)
+
+        start_y, start_m = parse_any(start_raw)
+        end_y, end_m = parse_any(end_raw)
+        time_results.append({
+            'start_year': start_y, 'start_month': start_m,
+            'end_year': end_y, 'end_month': end_m,
+            'label': f'{start_raw}到{end_raw}',
+            'raw': match.group()
+        })
+        used_keywords.add(match.group())
+
+    # 🆕 新增匹配“2024年1月到2月”,结束时间没有写年份,默认与开始时间同年
+    partial_range_pattern = r'(?P<year>\d{4})年(?P<start_month>\d{1,2})月到(?P<end_month>\d{1,2})月'
+    for match in re.finditer(partial_range_pattern, question):
+        # 避免重复匹配已经被上面时间段匹配使用过的字符串
+        if match.group() in used_keywords:
+            continue
+        year = int(match.group('year'))
+        start_month = int(match.group('start_month'))
+        end_month = int(match.group('end_month'))
+        time_results.append({
+            'start_year': year,
+            'start_month': start_month,
+            'end_year': year,
+            'end_month': end_month,
+            'label': match.group(),
+            'raw': match.group()
+        })
+        used_keywords.add(match.group())
+
+    # 相对+具体月份
     relative_absolute_pattern = r'(?P<relative>今|去|前)年(?P<month>\d{1,2})月'
     for match in re.finditer(relative_absolute_pattern, question):
+        if match.group() in used_keywords:
+            continue
         rel = match.group('relative')
         month = int(match.group('month'))
         year = {'今': current_year, '去': current_year - 1, '前': current_year - 2}.get(rel, current_year)
         time_results.append({'year': year, 'month': month, 'raw': match.group()})
+        used_keywords.add(match.group())
 
-    # 绝对时间匹配
+    # 绝对时间
     for pattern in absolute_patterns:
         for match in re.finditer(pattern, question):
+            if match.group() in used_keywords:
+                continue
             time_info = {'raw': match.group()}
             gd = match.groupdict()
             if gd.get('year'):
@@ -100,33 +187,28 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
                 time_info['month'] = int(gd['month'])
             if gd.get('day'):
                 time_info['day'] = int(gd['day'])
-            if time_info not in time_results:
-                time_results.append(time_info)
-
-    # 记录已匹配的相对时间词,避免重复
-    used_relatives = {tr['raw'] for tr in time_results if 'label' in tr or tr['raw'] in relative_year_mapping}
+            time_results.append(time_info)
+            used_keywords.add(match.group())
 
-    # 相对年份(“前年”“去年”“今年”“明年”)
+    # 单独的相对年份关键词
     for term, year in relative_year_mapping.items():
-        if term in question and term not in used_relatives:
+        if term in question and term not in used_keywords:
             time_results.append({'year': year, 'label': term, 'raw': term})
+            used_keywords.add(term)
 
-    # 新增:相对月份处理
-    if '当前' in question:
+    # 当前/上个月
+    if '当前' in question and '当前' not in used_keywords:
         time_results.append({'year': current_year, 'month': current_month, 'label': '当前', 'raw': '当前'})
-    if '上个月' in question:
-        # 计算上个月年月
-        if current_month == 1:
-            year = current_year - 1
-            month = 12
-        else:
-            year = current_year
-            month = current_month - 1
-        time_results.append({'year': year, 'month': month, 'label': '上个月', 'raw': '上个月'})
-
-    # 季度或半年
+        used_keywords.add('当前')
+    if '上个月' in question and '上个月' not in used_keywords:
+        prev_year = current_year if current_month > 1 else current_year - 1
+        prev_month = current_month - 1 if current_month > 1 else 12
+        time_results.append({'year': prev_year, 'month': prev_month, 'label': '上个月', 'raw': '上个月'})
+        used_keywords.add('上个月')
+
+    # 季度和半年
     for term, (start_month, end_month) in season_mapping.items():
-        if term in question:
+        if term in question and term not in used_keywords:
             time_results.append({
                 'year': current_year,
                 'label': term,
@@ -134,15 +216,177 @@ def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
                 'end_month': end_month,
                 'raw': term
             })
+            used_keywords.add(term)
 
-    provinces = ['北京', '天津', '上海', '重庆', '河北', '山西', '辽宁', '吉林', '黑龙江',
-                 '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
-                 '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
-                 '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门']
-
+    # 地点识别
     locations = [p for p in provinces if p in question]
 
     return time_results, locations
+
+# def extract_time_location(question: str) -> Tuple[List[Dict], List[str]]:
+#     current_date = datetime.now()
+#     current_year = current_date.year
+#     current_month = current_date.month
+#
+#     # 匹配绝对时间
+#     absolute_patterns = [
+#         r'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日',
+#         r'(?P<year>\d{4})年(?P<month>\d{1,2})月',
+#         r'(?P<year>\d{4})年'
+#     ]
+#
+#     relative_year_mapping = {
+#         '明年': current_year + 1,
+#         '今年': current_year,
+#         '去年': current_year - 1,
+#         '前年': current_year - 2
+#     }
+#
+#     season_mapping = {
+#         '一季度': (1, 3),
+#         '二季度': (4, 6),
+#         '三季度': (7, 9),
+#         '四季度': (10, 12),
+#         '上半年': (1, 6),
+#         '下半年': (7, 12)
+#     }
+#
+#     provinces = [
+#         '北京', '天津', '上海', '重庆', '河北', '山西', '辽宁', '吉林', '黑龙江',
+#         '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
+#         '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
+#         '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门'
+#     ]
+#
+#     time_results = []
+#     used_keywords = set()
+#
+#     # 🆕 处理“起止时间段”
+#     range_pattern = r'(?P<start>(\d{4}|今|去|前|明)年(\d{1,2})?月?)到(?P<end>(\d{4}|今|去|前|明)年(\d{1,2})?月?)'
+#     for match in re.finditer(range_pattern, question):
+#         start_raw, end_raw = match.group('start'), match.group('end')
+#
+#         def parse_relative(text):
+#             year = current_year
+#             month = None
+#             if '明年' in text:
+#                 year = current_year + 1
+#             elif '今年' in text or '今' in text:
+#                 year = current_year
+#             elif '去年' in text or '去' in text:
+#                 year = current_year - 1
+#             elif '前年' in text or '前' in text:
+#                 year = current_year - 2
+#             # 提取月份
+#             m = re.search(r'(\d{1,2})月', text)
+#             if m:
+#                 month = int(m.group(1))
+#             return year, month
+#
+#         def parse_absolute(text):
+#             m = re.match(r'(?P<year>\d{4})年(?P<month>\d{1,2})?月?', text)
+#             if m:
+#                 year = int(m.group('year'))
+#                 month = int(m.group('month')) if m.group('month') else None
+#                 return year, month
+#             return None, None
+#
+#         # 统一处理为 year/month
+#         def parse_any(text):
+#             if any(key in text for key in relative_year_mapping.keys()) or text[:1] in ['今', '去', '前', '明']:
+#                 return parse_relative(text)
+#             else:
+#                 return parse_absolute(text)
+#
+#         start_y, start_m = parse_any(start_raw)
+#         end_y, end_m = parse_any(end_raw)
+#         time_results.append({
+#             'start_year': start_y, 'start_month': start_m,
+#             'end_year': end_y, 'end_month': end_m,
+#             'label': f'{start_raw}到{end_raw}',
+#             'raw': match.group()
+#         })
+#         used_keywords.add(match.group())
+#
+#     # 相对+具体月份
+#     relative_absolute_pattern = r'(?P<relative>今|去|前)年(?P<month>\d{1,2})月'
+#     for match in re.finditer(relative_absolute_pattern, question):
+#         if match.group() in used_keywords:
+#             continue
+#         rel = match.group('relative')
+#         month = int(match.group('month'))
+#         year = {'今': current_year, '去': current_year - 1, '前': current_year - 2}.get(rel, current_year)
+#         time_results.append({'year': year, 'month': month, 'raw': match.group()})
+#         used_keywords.add(match.group())
+#
+#     # 绝对时间
+#     for pattern in absolute_patterns:
+#         for match in re.finditer(pattern, question):
+#             if match.group() in used_keywords:
+#                 continue
+#             time_info = {'raw': match.group()}
+#             gd = match.groupdict()
+#             if gd.get('year'):
+#                 time_info['year'] = int(gd['year'])
+#             if gd.get('month'):
+#                 time_info['month'] = int(gd['month'])
+#             if gd.get('day'):
+#                 time_info['day'] = int(gd['day'])
+#             time_results.append(time_info)
+#             used_keywords.add(match.group())
+#
+#     # 单独的相对年份关键词
+#     for term, year in relative_year_mapping.items():
+#         if term in question and term not in used_keywords:
+#             time_results.append({'year': year, 'label': term, 'raw': term})
+#             used_keywords.add(term)
+#
+#     # 当前/上个月
+#     if '当前' in question and '当前' not in used_keywords:
+#         time_results.append({'year': current_year, 'month': current_month, 'label': '当前', 'raw': '当前'})
+#         used_keywords.add('当前')
+#     if '上个月' in question and '上个月' not in used_keywords:
+#         prev_year = current_year if current_month > 1 else current_year - 1
+#         prev_month = current_month - 1 if current_month > 1 else 12
+#         time_results.append({'year': prev_year, 'month': prev_month, 'label': '上个月', 'raw': '上个月'})
+#         used_keywords.add('上个月')
+#
+#     # 季度和半年
+#     for term, (start_month, end_month) in season_mapping.items():
+#         if term in question and term not in used_keywords:
+#             time_results.append({
+#                 'year': current_year,
+#                 'label': term,
+#                 'start_month': start_month,
+#                 'end_month': end_month,
+#                 'raw': term
+#             })
+#             used_keywords.add(term)
+#
+#     # 地点识别
+#     locations = [p for p in provinces if p in question]
+#
+#     return time_results, locations
+# 先用 extract_time_location 判断问句包含哪类时间信息,然后只对结构匹配的模板子集做余弦匹配。
+def classify_by_time_type(query, time_info):
+    """Pick the template keys that should be scored for a question.
+
+    ``time_info`` is the list of time dicts extracted from the question.
+    When any entry carries both ``start_year`` and ``end_year`` (an explicit
+    start-to-end range), only template ``'3'`` is returned. Otherwise every
+    key of the module-level ``template_dict`` is returned.
+    ``query`` is accepted but not inspected.
+    """
+    for entry in time_info:
+        if 'start_year' in entry and 'end_year' in entry:
+            # Explicit time range: restrict to the range template.
+            return ['3']
+    # Fallback: no range found, every template stays a candidate.
+    return [key for key in template_dict]
+def match_template_with_time_filter(query, template_dict, tokenizer, extract_time_location_func):
+    """
+    Narrow the candidate templates by time information, then run TF-IDF matching.
+
+    Args:
+        query: The user question.
+        template_dict: Mapping of template key to its template sentences.
+        tokenizer: Tokenizer callable forwarded to ``match_template``.
+        extract_time_location_func: Callable that returns a
+            ``(time_info, locations)`` pair for a question.
+
+    Returns:
+        Whatever ``match_template`` returns for the narrowed template set.
+    """
+    # Extract the time information; the locations are not needed here.
+    time_info, _ = extract_time_location_func(query)
+
+    # Decide the candidate template keys from the time information.
+    candidate_keys = classify_by_time_type(query, time_info)
+
+    # classify_by_time_type reads the module-level templates, so a candidate
+    # key may be absent from the dict passed in here; skip such keys instead
+    # of raising KeyError.
+    filtered_template_dict = {
+        k: template_dict[k] for k in candidate_keys if k in template_dict
+    }
+
+    # If no candidate survives, match against every supplied template rather
+    # than handing an empty dict to the matcher.
+    if not filtered_template_dict:
+        filtered_template_dict = template_dict
+
+    # Delegate to the existing TF-IDF matcher.
+    return match_template(query, filtered_template_dict, tokenizer)
 # 找相似度最高的模板
 def match_template(query, template_dict, tokenizer):
     """
@@ -210,9 +454,19 @@ def load_template_info(matched_key, json_folder):
 def process_query(query, template_dict, json_folder, tokenizer=jieba_tokenizer):
     # 提取条件
     time_info, location_info = extract_time_location(query)
+    print(time_info)
+    candidate_ids = classify_by_time_type(query, time_info)
+    # 构造候选模板
+    candidates = [template_dict[k][0] for k in candidate_ids]
+
     conditions = {}
     # 匹配模板
-    matched_key, best_sentence, score = match_template(query, template_dict, tokenizer)
+    matched_key, best_sentence, score = match_template_with_time_filter(
+        query,
+        template_dict,
+        tokenizer,
+        extract_time_location_func=extract_time_location
+    )
     # 定义阈值
     similarity_threshold = 0.4
     # ★ 判断相似度阈值
@@ -328,28 +582,37 @@ def find_key_recursively(data, target_key):
     _search(data)
     return results
 # query = "当月省间交易完成的交易是多少?"
+query = "2024年1月到2月累计交易电量是多少?"
 # query = "但同样阿贾克斯大口径的话我可合金外壳设计文件突然发?"
-# json_folder = "templatesJson"
-#
-#
-# result = process_query(query, template_dict, json_folder)
-#
-# print("匹配的模板 key:", result["matched_key"])
-# print("最相似的模板句:", result["matched_template"])
-# print("相似度分数:", result["similarity_score"])
-# print("类型:", result["type"])
-# print("关键词:", result["keywords"])
-# print("查询字段:", result["target"])
-# print("模型名字", result["name"])
-# print("条件", result["conditions"])
-# print("返回的内容是:", result["content"])
-# print("问句是:", result["query"])
-# print("动作是:", result["play"])
+json_folder = "templatesJson"
+
+
+result = process_query(query, template_dict, json_folder)
+
+print("匹配的模板 key:", result["matched_key"])
+print("最相似的模板句:", result["matched_template"])
+print("相似度分数:", result["similarity_score"])
+print("类型:", result["type"])
+print("关键词:", result["keywords"])
+print("查询字段:", result["target"])
+print("模型名字", result["name"])
+print("条件", result["conditions"])
+print("返回的内容是:", result["content"])
+print("问句是:", result["query"])
+print("动作是:", result["play"])
+
+# type = result["type"]
 # content = result["content"]
 #
 # json_data_folder = "..\Json\json_data"
-# result = smart_find_value(json_data_folder, result["name"],result["conditions"],result["target"] )
+# if type == "query":
+#     result = smart_find_value(json_data_folder, result["name"],result["conditions"],result["target"] )
+# elif type == "calculate":
+#     result = calculate_sum_by_time_range(json_data_folder, result["name"], result["target"],)
+#
+# # 最终回答的文本
 # final_content = content.replace("?", str(result))
 # # print(f"{content}{result}")
 #
 # print(final_content)
+

+ 9 - 0
final/ByRules/templatesJson/3.json

@@ -0,0 +1,9 @@
+{
+  "dataJsonName": "sjjy1_B01_output",
+  "type": "calculate",
+  "keyword": "累计交易电量",
+  "target": "交易电量",
+  "content": "累计交易电量是&",
+  "play": "讲述文本",
+  "name": "月省间交易电量"
+}