# similarity.py
  1. from sklearn.feature_extraction.text import TfidfVectorizer
  2. from sklearn.metrics.pairwise import cosine_similarity
  3. import jieba
  4. def jieba_tokenizer(text):
  5. return list(jieba.cut(text))
  6. template_dict = {
  7. "sjjy1_B03_output": [
  8. "某年省间交易电量按交易周期划分的电量是多少?",
  9. "某年省间交易电量按交易类型划分的电量是多少?",
  10. "某年省间交易电量按发电类型划分的电量是多少?",
  11. "某年省间交易电量按交易方式划分的电量是多少?",
  12. ],
  13. "sjjy1_B06_output": [
  14. "省间交易正在组织的交易有多少?",
  15. "省间交易当月完成的交易有多少?",
  16. "省间交易当年完成的交易有多少?",
  17. "省间交易当年达成的电量有多少?",
  18. "省间交易当年参与交易的家次有多少?",
  19. ],
  20. "sjjy1_B08_output": ["某年全年累计省间交易电量是多少?"],
  21. "sjjy1_B01_output": ["某年某月交易电量是多少?"],
  22. "sjjy1_B02_output": ["某年省间交易电量月内交易电量是多少??"],
  23. }
  24. # 构造模板和key的映射反向表
  25. templates = []
  26. key_map = []
  27. for key, sentences in template_dict.items():
  28. for s in sentences:
  29. templates.append(s)
  30. key_map.append(key)
  31. query = "2023年累计省间交易电量是多少??"
  32. vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
  33. tfidf_matrix = vectorizer.fit_transform([query] + templates)
  34. cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
  35. most_similar_idx = cos_sim.argmax()
  36. best_match_sentence = templates[most_similar_idx]
  37. matched_key = key_map[most_similar_idx]
  38. print("最相似的模板句:", best_match_sentence)
  39. print("相似度分数:", cos_sim[0][most_similar_idx])
  40. print("匹配的模板 key:", matched_key)