keyword_generator.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """唤醒词自动生成工具.
  4. 功能:
  5. 1. 输入中文自动转换为带声调拼音
  6. 2. 按字母分隔拼音(声母+韵母)
  7. 3. 验证token是否在tokens.txt中
  8. 4. 自动生成keywords.txt格式
  9. """
  10. import sys
  11. from pathlib import Path
  12. try:
  13. from pypinyin import Style, lazy_pinyin
  14. except ImportError:
  15. print("❌ 缺少依赖: pypinyin")
  16. print("请安装: pip install pypinyin")
  17. sys.exit(1)
  18. class KeywordGenerator:
  19. def __init__(self, model_dir: Path):
  20. """初始化唤醒词生成器.
  21. Args:
  22. model_dir: 模型目录路径(包含tokens.txt和keywords.txt)
  23. """
  24. self.model_dir = Path(model_dir)
  25. self.tokens_file = self.model_dir / "tokens.txt"
  26. self.keywords_file = self.model_dir / "keywords.txt"
  27. # 加载已有的tokens
  28. self.available_tokens = self._load_tokens()
  29. # 声母表(需要分离的)
  30. self.initials = [
  31. "b",
  32. "p",
  33. "m",
  34. "f",
  35. "d",
  36. "t",
  37. "n",
  38. "l",
  39. "g",
  40. "k",
  41. "h",
  42. "j",
  43. "q",
  44. "x",
  45. "zh",
  46. "ch",
  47. "sh",
  48. "r",
  49. "z",
  50. "c",
  51. "s",
  52. "y",
  53. "w",
  54. ]
  55. def _load_tokens(self) -> set:
  56. """
  57. 加载tokens.txt中的所有可用token.
  58. """
  59. if not self.tokens_file.exists():
  60. print(f"⚠️ 警告: tokens文件不存在: {self.tokens_file}")
  61. return set()
  62. tokens = set()
  63. with open(self.tokens_file, "r", encoding="utf-8") as f:
  64. for line in f:
  65. line = line.strip()
  66. if line and not line.startswith("#"):
  67. # 格式: "token id" 或 "token"
  68. parts = line.split()
  69. if parts:
  70. tokens.add(parts[0])
  71. print(f"✅ 加载了 {len(tokens)} 个可用tokens")
  72. return tokens
  73. def _split_pinyin(self, pinyin: str) -> list:
  74. """将拼音按声母韵母分隔.
  75. 例如: "xiǎo" -> ["x", "iǎo"] "mǐ" -> ["m", "ǐ"] "ài" -> ["ài"] (零声母)
  76. """
  77. if not pinyin:
  78. return []
  79. # 按长度优先尝试匹配声母(zh, ch, sh优先)
  80. for initial in sorted(self.initials, key=len, reverse=True):
  81. if pinyin.startswith(initial):
  82. final = pinyin[len(initial) :]
  83. if final:
  84. return [initial, final]
  85. else:
  86. return [initial]
  87. # 没有声母(零声母)
  88. return [pinyin]
  89. def chinese_to_keyword_format(self, chinese_text: str) -> str:
  90. """将中文转换为keyword格式.
  91. Args:
  92. chinese_text: 中文文本,如"小米小米"
  93. Returns:
  94. keyword格式,如"x iǎo m ǐ x iǎo m ǐ @小米小米"
  95. """
  96. # 转换为带声调拼音
  97. pinyin_list = lazy_pinyin(chinese_text, style=Style.TONE)
  98. # 分割每个拼音
  99. split_parts = []
  100. missing_tokens = []
  101. for pinyin in pinyin_list:
  102. parts = self._split_pinyin(pinyin)
  103. # 验证每个part是否在tokens中
  104. for part in parts:
  105. if part not in self.available_tokens:
  106. missing_tokens.append(part)
  107. split_parts.append(part)
  108. # 拼接结果
  109. pinyin_str = " ".join(split_parts)
  110. keyword_line = f"{pinyin_str} @{chinese_text}"
  111. # 如果有缺失的token,给出警告
  112. if missing_tokens:
  113. print(
  114. f"⚠️ 警告: 以下token不在tokens.txt中: {', '.join(set(missing_tokens))}"
  115. )
  116. print(f" 生成的关键词可能无法正常工作")
  117. return keyword_line
  118. def add_keyword(self, chinese_text: str, append: bool = True) -> bool:
  119. """添加唤醒词到keywords.txt.
  120. Args:
  121. chinese_text: 中文唤醒词
  122. append: 是否追加(True)或覆盖(False)
  123. Returns:
  124. 是否成功
  125. """
  126. try:
  127. # 生成keyword格式
  128. keyword_line = self.chinese_to_keyword_format(chinese_text)
  129. # 检查是否已存在
  130. if self.keywords_file.exists():
  131. with open(self.keywords_file, "r", encoding="utf-8") as f:
  132. content = f.read()
  133. if f"@{chinese_text}" in content:
  134. print(f"⚠️ 关键词 '{chinese_text}' 已存在")
  135. return False
  136. # 写入文件
  137. mode = "a" if append else "w"
  138. with open(self.keywords_file, mode, encoding="utf-8") as f:
  139. f.write(keyword_line + "\n")
  140. print(f"✅ 成功添加: {keyword_line}")
  141. return True
  142. except Exception as e:
  143. print(f"❌ 添加失败: {e}")
  144. return False
  145. def batch_add_keywords(self, chinese_texts: list, overwrite: bool = False):
  146. """批量添加唤醒词.
  147. Args:
  148. chinese_texts: 中文列表
  149. overwrite: 是否覆盖原文件
  150. """
  151. if overwrite:
  152. print("⚠️ 将覆盖现有keywords.txt")
  153. success_count = 0
  154. for text in chinese_texts:
  155. text = text.strip()
  156. if not text:
  157. continue
  158. if self.add_keyword(text, append=not overwrite):
  159. success_count += 1
  160. # 第一个后都追加
  161. overwrite = False
  162. print(f"\n📊 完成: 成功添加 {success_count}/{len(chinese_texts)} 个关键词")
  163. def list_keywords(self):
  164. """
  165. 列出当前所有关键词.
  166. """
  167. if not self.keywords_file.exists():
  168. print("⚠️ keywords.txt 不存在")
  169. return
  170. print(f"\n📄 当前关键词列表 ({self.keywords_file}):")
  171. print("-" * 60)
  172. with open(self.keywords_file, "r", encoding="utf-8") as f:
  173. for i, line in enumerate(f, 1):
  174. line = line.strip()
  175. if line and not line.startswith("#"):
  176. # 提取中文部分显示
  177. if "@" in line:
  178. pinyin_part, chinese_part = line.split("@", 1)
  179. print(
  180. f"{i}. {chinese_part.strip():15s} -> {pinyin_part.strip()}"
  181. )
  182. else:
  183. print(f"{i}. {line}")
  184. print("-" * 60)
  185. def main():
  186. """
  187. 主函数.
  188. """
  189. import argparse
  190. parser = argparse.ArgumentParser(
  191. description="唤醒词自动生成工具",
  192. formatter_class=argparse.RawDescriptionHelpFormatter,
  193. epilog="""
  194. 示例:
  195. # 添加单个关键词
  196. python keyword_generator.py -a "小米小米"
  197. # 批量添加关键词
  198. python keyword_generator.py -b "小米小米" "你好小智" "贾维斯"
  199. # 从文件批量导入(每行一个中文)
  200. python keyword_generator.py -f keywords_input.txt
  201. # 列出当前关键词
  202. python keyword_generator.py -l
  203. # 测试转换(不写入文件)
  204. python keyword_generator.py -t "小米小米"
  205. """,
  206. )
  207. parser.add_argument(
  208. "-m", "--model-dir", default="models", help="模型目录路径(默认: models)"
  209. )
  210. parser.add_argument("-a", "--add", help="添加单个关键词(中文)")
  211. parser.add_argument(
  212. "-b", "--batch", nargs="+", help="批量添加关键词(多个中文,空格分隔)"
  213. )
  214. parser.add_argument("-f", "--file", help="从文件批量导入(每行一个中文)")
  215. parser.add_argument("-l", "--list", action="store_true", help="列出当前所有关键词")
  216. parser.add_argument("-t", "--test", help="测试转换(不写入文件)")
  217. parser.add_argument(
  218. "--overwrite", action="store_true", help="覆盖模式(清空现有关键词)"
  219. )
  220. args = parser.parse_args()
  221. # 确定模型目录
  222. if Path(args.model_dir).is_absolute():
  223. model_dir = Path(args.model_dir)
  224. else:
  225. # 相对路径:相对于项目根目录
  226. script_dir = Path(__file__).parent
  227. project_root = script_dir.parent
  228. model_dir = project_root / args.model_dir
  229. if not model_dir.exists():
  230. print(f"❌ 模型目录不存在: {model_dir}")
  231. sys.exit(1)
  232. print(f"🔧 使用模型目录: {model_dir}")
  233. # 创建生成器
  234. generator = KeywordGenerator(model_dir)
  235. # 执行操作
  236. if args.test:
  237. # 测试模式
  238. print(f"\n🧪 测试转换:")
  239. keyword_line = generator.chinese_to_keyword_format(args.test)
  240. print(f" 输入: {args.test}")
  241. print(f" 输出: {keyword_line}")
  242. elif args.add:
  243. # 添加单个
  244. generator.add_keyword(args.add)
  245. elif args.batch:
  246. # 批量添加
  247. generator.batch_add_keywords(args.batch, overwrite=args.overwrite)
  248. elif args.file:
  249. # 从文件导入
  250. input_file = Path(args.file)
  251. if not input_file.exists():
  252. print(f"❌ 文件不存在: {input_file}")
  253. sys.exit(1)
  254. with open(input_file, "r", encoding="utf-8") as f:
  255. keywords = [line.strip() for line in f if line.strip()]
  256. print(f"📥 从文件导入 {len(keywords)} 个关键词")
  257. generator.batch_add_keywords(keywords, overwrite=args.overwrite)
  258. elif args.list:
  259. # 列出关键词
  260. generator.list_keywords()
  261. else:
  262. # 交互模式
  263. print("\n🎤 唤醒词生成工具(交互模式)")
  264. print("输入中文唤醒词,按 Ctrl+C 或输入 'q' 退出\n")
  265. try:
  266. while True:
  267. chinese = input("请输入中文唤醒词: ").strip()
  268. if not chinese or chinese.lower() == "q":
  269. break
  270. generator.add_keyword(chinese)
  271. print()
  272. except KeyboardInterrupt:
  273. print("\n\n👋 已退出")
  274. # 最后列出所有关键词
  275. if not args.list and (args.add or args.batch or args.file):
  276. generator.list_keywords()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()