QI.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. import re
  2. from typing import List, Dict
  3. # 中国省份列表
  4. provinces = [
  5. '北京', '天津', '河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江',
  6. '上海', '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南',
  7. '湖北', '湖南', '广东', '广西', '海南', '重庆', '四川', '贵州',
  8. '云南', '西藏', '陕西', '甘肃', '青海', '宁夏', '新疆'
  9. ]
  10. province_pattern = '|'.join([f'{p}省|{p}市|{p}' for p in provinces])
  11. # 意图关键词 → 意图类型
  12. intent_keywords = {
  13. # '电量': '省间交易电量',
  14. # '同比': '同比变化',
  15. # '环比': '环比变化',
  16. # '均价': '均价',
  17. '送出电量': '送出电量',
  18. '送出电量占售电量的比': '送出电量占售电量的比',
  19. '送出电量占比': '送出电量占售电量的比',
  20. '送电占比': '送出电量占售电量的比',
  21. '送出均价': '送出均价',
  22. '送电均价': '送出均价',
  23. '送出平均价': '送出均价',
  24. '送电平均价': '送出均价',
  25. '送出的平均价': '送出均价',
  26. # '受入电量': '受入电量',
  27. # '受电': '受入电量',
  28. # '成交未结算': '未结算电量',
  29. # '占比': '电源结构占比',
  30. # '最多': '最大值统计',
  31. # '地图': '地图特征分析',
  32. # '多少笔': '交易数量统计',
  33. # '完成的交易': '交易数量统计',
  34. # '参与': '参与主体数量'
  35. }
  36. def split_question(question: str) -> List[str]:
  37. """
  38. 拆分包含多个子意图的问句,返回独立问句列表。
  39. """
  40. # 中文问号或句号断句
  41. parts = re.split(r'[??。]', question)
  42. parts = [p.strip() for p in parts if p.strip()]
  43. result = []
  44. for part in parts:
  45. # 对同比/环比/相比做特殊处理,补足时间
  46. if '同比' in part and '相比' not in part:
  47. base = extract_base_time(part)
  48. result.append(f"{base}与去年同期相比变化如何?")
  49. elif '环比' in part and '相比' not in part:
  50. base = extract_base_time(part)
  51. result.append(f"{base}与上月相比变化如何?")
  52. else:
  53. result.append(part + '?')
  54. return result
  55. def extract_base_time(text: str) -> str:
  56. """
  57. 提取句子中的基本时间片段如“2024年3月”
  58. """
  59. m = re.search(r'\d{4}年\d{1,2}月', text)
  60. return m.group(0) if m else "当前月份"
  61. def extract_info(question: str) -> Dict:
  62. """
  63. 从单个问句中提取结构化信息
  64. """
  65. years = re.findall(r'\d{4}年', question)
  66. months = re.findall(r'(\d{1,2}月(?:至\d{1,2}月)?)', question)
  67. provinces_found = re.findall(province_pattern, question)
  68. provinces_found = list(set(p.replace('省', '').replace('市', '') for p in provinces_found))
  69. intents = []
  70. for keyword, label in intent_keywords.items():
  71. if keyword in question:
  72. intents.append(label)
  73. return {
  74. "question": question,
  75. "year": list(set(y.replace("年", "") for y in years)),
  76. "month": months,
  77. "province": provinces_found,
  78. "intent": list(set(intents))
  79. }
  80. def process_questions(questions: List[str]) -> List[Dict]:
  81. """
  82. 对一组原始问句进行拆分和信息抽取
  83. """
  84. all_results = []
  85. for q in questions:
  86. split_qs = split_question(q)
  87. for sq in split_qs:
  88. info = extract_info(sq)
  89. all_results.append(info)
  90. return all_results
  91. def load_questions_from_file(filepath: str) -> List[str]:
  92. """
  93. 从文本文件中逐行读取问句
  94. """
  95. with open(filepath, 'r', encoding='utf-8') as f:
  96. lines = f.readlines()
  97. return [line.strip() for line in lines if line.strip()]
  98. # 替换此部分,改为读取文件中的问句
  99. questions = load_questions_from_file("question_test.txt")
  100. # 调用处理函数
  101. results = process_questions(questions)
  102. # 输出结果
  103. from pprint import pprint
  104. pprint(results)