# Match a free-form query against a set of template sentences using TF-IDF and cosine similarity.
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- import jieba
def jieba_tokenizer(text):
    """Segment *text* into a list of tokens with jieba.

    Args:
        text: The string to segment.

    Returns:
        A list of the tokens jieba produces for *text*, in order.
    """
    # lcut is jieba's list-returning form of cut.
    return jieba.lcut(text)
# Maps each template key to the example sentences that should resolve to it.
# Insertion order matters: the matching code flattens this dict in order.
template_dict = {
    # Yearly inter-provincial volume, broken down by one dimension.
    "sjjy1_B03_output": [
        "某年省间交易电量按交易周期划分的电量是多少?",
        "某年省间交易电量按交易类型划分的电量是多少?",
        "某年省间交易电量按发电类型划分的电量是多少?",
        "某年省间交易电量按交易方式划分的电量是多少?",
    ],
    # Counts and totals for inter-provincial trades.
    "sjjy1_B06_output": [
        "省间交易正在组织的交易有多少?",
        "省间交易当月完成的交易有多少?",
        "省间交易当年完成的交易有多少?",
        "省间交易当年达成的电量有多少?",
        "省间交易当年参与交易的家次有多少?",
    ],
    "sjjy1_B08_output": [
        "某年全年累计省间交易电量是多少?",
    ],
    "sjjy1_B01_output": [
        "某年某月交易电量是多少?",
    ],
    # NOTE(review): this sentence ends with two question marks while every
    # other template ends with one -- confirm whether that is intentional.
    "sjjy1_B02_output": [
        "某年省间交易电量月内交易电量是多少??",
    ],
}
# Flatten template_dict into two parallel lists so that the i-th template
# sentence can be mapped back to the key it belongs to.
template_key_pairs = [
    (sentence, key)
    for key, sentences in template_dict.items()
    for sentence in sentences
]
templates = [sentence for sentence, _ in template_key_pairs]
key_map = [key for _, key in template_key_pairs]

query = "2023年累计省间交易电量是多少??"

# A custom tokenizer replaces the default token_pattern; passing
# token_pattern=None avoids scikit-learn's "token_pattern will not be used"
# warning without changing how the text is tokenized.
vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, token_pattern=None)

# The vectorizer is fit on the query together with the templates, so row 0 of
# the matrix is the query and rows 1.. are the templates in list order.
tfidf_matrix = vectorizer.fit_transform([query] + templates)

# cos_sim has a single row: the query's similarity to every template.
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

# With one row, the flat argmax is the index of the best-scoring template.
most_similar_idx = cos_sim.argmax()
best_match_sentence = templates[most_similar_idx]
matched_key = key_map[most_similar_idx]

print("最相似的模板句:", best_match_sentence)
print("相似度分数:", cos_sim[0][most_similar_idx])
print("匹配的模板 key:", matched_key)
|