2 months ago · e80321dda9
--- a/final/ByRules/__pycache__/similarity_answer_json.cpython-39.pyc
+++ b/final/ByRules/__pycache__/similarity_answer_json.cpython-39.pyc
--- a/final/ByRules/app.py
+++ b/final/ByRules/app.py
@@ -51,6 +51,23 @@ def process_query_route():
 
				                 "play": result["play"]
			
 
				             }
			
 
				             return jsonify(response)
			
 
				+        # 计算类问题
			
 
				+        elif result['type'] == 'calculate':
			
 
				+            conditions = result["conditions"]
			
 
				+            # 从条件中分离出开始时间和结束时间
			
 
				+            start_conditions = {('年' if 'year' in k else '月'): v for k, v in conditions.items() if k.startswith('start_')}
			
 
				+            end_conditions = {('年' if 'year' in k else '月'): v for k, v in conditions.items() if k.startswith('end_')}
			
 
				+
			
 
				+            fileName = result["dataJsonName"] + ".json"
			
 
				+            final_value = calculate_sum_by_time_range(DATA_FOLDER, fileName, result["target"], start_conditions, end_conditions)
			
 
				+            response = {
			
 
				+                "content_text": result["content"],
			
 
				+                "raw_result": final_value,
			
 
				+                "conditions": result["conditions"],
			
 
				+                "name": result["name"],
			
 
				+                "play": result["play"]
			
 
				+            }
			
 
				+            return jsonify(response)
			
 
				         # 比较类问题
			
 
				         elif result['type'] == 'compare_max_min':
			
 
				             find_max = str(result['find_max']).lower() == 'true'
			
--- a/final/ByRules/test.py
+++ b/final/ByRules/test.py
@@ -0,0 +1,180 @@
 
				+from sklearn.feature_extraction.text import TfidfVectorizer
			
 
				+from sklearn.metrics.pairwise import cosine_similarity
			
 
				+import jieba
			
 
				+
			
 
				+# ==========================
			
 
				+# 模板定义
			
 
				+# ==========================
			
 
				+# 模板中使用 {item} 作为槽位，用于匹配不同实体问句
			
 
				+template_dict = {
			
 
				+    "1": ["某年全年累计{item}是多少？"],
			
 
				+}
			
 
				+
			
 
				+# ==========================
			
 
				+# 实体词典
			
 
				+# ==========================
			
 
				+# 用户问句中可能出现的实际实体词，用于识别并替换成占位符进行匹配
			
 
				+item_lexicon = ["省间交易电量","送出电量", "受入电量"]
			
 
				+
			
 
				+# ==========================
			
 
				+# 分词器函数
			
 
				+# ==========================
			
 
				+def jieba_tokenizer(text):
			
 
				+    """
			
 
				+    使用 jieba 对中文文本进行分词
			
 
				+    参数:
			
 
				+        text (str): 要分词的句子
			
 
				+    返回:
			
 
				+        list[str]: 分词后的词列表
			
 
				+    """
			
 
				+    return list(jieba.cut(text))
			
 
				+
			
 
				+
			
 
				+# ==========================
			
 
				+# 构建模板列表
			
 
				+# ==========================
			
 
				+"""
			
 
				+    运行函数前的结果：
			
 
				+    template_dict = {
			
 
				+    "1": ["某年全年累计{item}是多少？", "全年{item}交易情况如何？"],
			
 
				+    "2": ["请问{item}在某年达到了多少？"]
			
 
				+}
			
 
				+    运行以后的结果：
			
 
				+    templates = [
			
 
				+    "某年全年累计XXX是多少？",
			
 
				+    "全年XXX交易情况如何？",
			
 
				+    "请问XXX在某年达到了多少？"
			
 
				+]
			
 
				+
			
 
				+    template_slots = [
			
 
				+    "某年全年累计{item}是多少？",
			
 
				+    "全年{item}交易情况如何？",
			
 
				+    "请问{item}在某年达到了多少？"
			
 
				+]
			
 
				+
			
 
				+    key_map = ["1", "1", "2"]
			
 
				+"""
			
 
				+def build_templates(template_dict):
			
 
				+    """
			
 
				+    构造可用于相似度匹配的模板列表，并保存 key 和原始模板的映射
			
 
				+    参数:
			
 
				+        template_dict (dict): 模板字典，key 为编号，value 为模板句子列表
			
 
				+    返回:
			
 
				+        templates (list[str]): 将 {item} 替换为 'XXX' 的模板，用于匹配
			
 
				+        template_slots (list[str]): 原始模板（保留槽位）
			
 
				+        key_map (list[str]): 模板 key 列表，对应每条模板
			
 
				+    """
			
 
				+    templates = []
			
 
				+    template_slots = []
			
 
				+    key_map = []
			
 
				+
			
 
				+    for key, sentences in template_dict.items():
			
 
				+        for s in sentences:
			
 
				+            display_sentence = s.replace("{item}", "XXX")  # 替换槽位以便匹配
			
 
				+            templates.append(display_sentence)
			
 
				+            template_slots.append(s)  # 保留原模板
			
 
				+            key_map.append(key)
			
 
				+    return templates, template_slots, key_map
			
 
				+
			
 
				+
			
 
				+# ==========================
			
 
				+# 将问句中的实体替换为占位符
			
 
				+# ==========================
			
 
				+def replace_item_in_query(query, item_lexicon):
			
 
				+    """
			
 
				+    从 query 中找到实体，并替换为 'XXX' 占位符
			
 
				+    参数:
			
 
				+        query (str): 用户输入的问句
			
 
				+        item_lexicon (list[str]): 可识别的实体词列表
			
 
				+    返回:
			
 
				+        (str, str): 替换后的问句、识别出的实体词（若未识别则为 None）
			
 
				+    """
			
 
				+    for item in item_lexicon:
			
 
				+        if item in query:
			
 
				+            return query.replace(item, "XXX"), item
			
 
				+    return query, None
			
 
				+
			
 
				+
			
 
				+# ==========================
			
 
				+# 从 query 中提取出槽位值
			
 
				+# ==========================
			
 
				+def extract_item_from_query(template, query):
			
 
				+    """
			
 
				+    根据模板结构，从 query 中提取 {item} 对应的实际值
			
 
				+    参数:
			
 
				+        template (str): 模板句，包含 {item}
			
 
				+        query (str): 用户原始输入句
			
 
				+    返回:
			
 
				+        str or None: 从 query 中提取出的实体（若匹配失败返回 None）
			
 
				+    """
			
 
				+    if "{item}" not in template:
			
 
				+        return None
			
 
				+    prefix, suffix = template.split("{item}")
			
 
				+    if query.startswith(prefix) and query.endswith(suffix):
			
 
				+        return query[len(prefix):-len(suffix)]
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+# ==========================
			
 
				+# 匹配模板主函数
			
 
				+# ==========================
			
 
				+def match_template(query, template_dict, item_lexicon):
			
 
				+    """
			
 
				+    主函数：将用户 query 与模板进行匹配，并返回最相似模板及提取信息
			
 
				+    参数:
			
 
				+        query (str): 用户问句
			
 
				+        template_dict (dict): 模板字典
			
 
				+        item_lexicon (list[str]): 实体词典
			
 
				+    返回:
			
 
				+        dict: 包含匹配信息（key、模板、相似度、提取的实体）
			
 
				+    """
			
 
				+    # 构建用于匹配的模板列表
			
 
				+    # templates: 将模板中的 {item} 替换为 "XXX" 后的版本，用于做匹配。
			
 
				+    # template_slots: 原始模板，保留 {item} 占位符。
			
 
				+    # key_map: 每个模板对应的编号 key。
			
 
				+    templates, template_slots, key_map = build_templates(template_dict)
			
 
				+
			
 
				+    # 将 query 中实体词替换为 XXX
			
 
				+    # query_for_match: 替换后的问句。
			
 
				+    # found_item: 找到的实体词（如 "送出电量"），如果没找到则为 None
			
 
				+    query_for_match, found_item = replace_item_in_query(query, item_lexicon)
			
 
				+
			
 
				+    # 计算 TF-IDF 相似度
			
 
				+    vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
			
 
				+    tfidf_matrix = vectorizer.fit_transform([query_for_match] + templates)
			
 
				+    cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
			
 
				+    most_similar_idx = cos_sim.argmax()
			
 
				+
			
 
				+    # 获取最相似模板的信息
			
 
				+    # best_match_template: 与query最相似的原始模板句（含{item}）。
			
 
				+    # matched_key: 匹配模板的编号。
			
 
				+    # similarity_score: 计算得到的相似度分数。
			
 
				+    best_match_template = template_slots[most_similar_idx]     # 原模板（带槽位）
			
 
				+    matched_key = key_map[most_similar_idx]                    # 模板 key
			
 
				+    similarity_score = cos_sim[0][most_similar_idx]            # 相似度得分
			
 
				+
			
 
				+    # 从原始 query 中提取 item 实体值
			
 
				+    extracted_item = extract_item_from_query(best_match_template, query)
			
 
				+
			
 
				+    # 返回最终结果
			
 
				+    result = {
			
 
				+        "matched_key": matched_key,                   # 模板编号
			
 
				+        "matched_template": best_match_template,      # 匹配到的模板
			
 
				+        "similarity_score": similarity_score,         # 相似度得分
			
 
				+        "extracted_item": extracted_item or found_item  # 提取出的实体
			
 
				+    }
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+# ==========================
			
 
				+# 测试部分（main 入口）
			
 
				+# ==========================
			
 
				+if __name__ == "__main__":
			
 
				+    query = "2023年全年累计受入电量是多少？"
			
 
				+    result = match_template(query, template_dict, item_lexicon)
			
 
				+
			
 
				+    print("用户问句：", query)
			
 
				+    print("匹配的模板 key：", result['matched_key'])
			
 
				+    print("最相似的模板：", result['matched_template'])
			
 
				+    print("相似度分数：", result['similarity_score'])
			
 
				+    print("提取出的 item：", result['extracted_item'])