#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Wake-word (keyword) generation tool.

Features:
1. Convert Chinese input into tone-marked pinyin
2. Split each pinyin syllable into initial and final
3. Verify that every token exists in tokens.txt
4. Generate entries in keywords.txt format
"""
import argparse
import sys
from pathlib import Path

try:
    from pypinyin import Style, lazy_pinyin
except ImportError:
    print("❌ Missing dependency: pypinyin")
    print("Install it with: pip install pypinyin")
    sys.exit(1)
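
# Each generated keywords.txt line is the space-separated pinyin tokens followed
# by "@" and the original text, e.g. (taken from the conversion method below):
#   x iǎo m ǐ x iǎo m ǐ @小米小米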


class KeywordGenerator:
    def __init__(self, model_dir: Path):
        """Initialize the keyword generator.

        Args:
            model_dir: Model directory path (contains tokens.txt and keywords.txt)
        """
        self.model_dir = Path(model_dir)
        self.tokens_file = self.model_dir / "tokens.txt"
        self.keywords_file = self.model_dir / "keywords.txt"
        # Load the set of tokens the model knows
        self.available_tokens = self._load_tokens()
        # Initials (consonants) that need to be split off the syllable
        self.initials = [
            "b", "p", "m", "f",
            "d", "t", "n", "l",
            "g", "k", "h",
            "j", "q", "x",
            "zh", "ch", "sh", "r",
            "z", "c", "s",
            "y", "w",
        ]

    def _load_tokens(self) -> set:
        """Load all available tokens from tokens.txt."""
        if not self.tokens_file.exists():
            print(f"⚠️ Warning: tokens file not found: {self.tokens_file}")
            return set()
        tokens = set()
        with open(self.tokens_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith("#"):
                    # Line format: "token id" or just "token"
                    parts = line.split()
                    if parts:
                        tokens.add(parts[0])
        print(f"✅ Loaded {len(tokens)} available tokens")
        return tokens
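
    # Illustrative tokens.txt excerpt (token IDs here are hypothetical; the real
    # file ships with the model). Only the first column is used:
    #   x 10
    #   iǎo 123
    #   m 15
    #   ǐ 87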

    def _split_pinyin(self, pinyin: str) -> list:
        """Split a pinyin syllable into initial and final.

        Examples: "xiǎo" -> ["x", "iǎo"], "mǐ" -> ["m", "ǐ"],
        "ài" -> ["ài"] (zero initial)
        """
        if not pinyin:
            return []
        # Try longer initials first so "zh"/"ch"/"sh" match before "z"/"c"/"s"
        for initial in sorted(self.initials, key=len, reverse=True):
            if pinyin.startswith(initial):
                final = pinyin[len(initial):]
                if final:
                    return [initial, final]
                return [initial]
        # No initial found: zero-initial syllable
        return [pinyin]

    def chinese_to_keyword_format(self, chinese_text: str) -> str:
        """Convert Chinese text into a keyword line.

        Args:
            chinese_text: Chinese text, e.g. "小米小米"

        Returns:
            Keyword line, e.g. "x iǎo m ǐ x iǎo m ǐ @小米小米"
        """
        # Convert to tone-marked pinyin
        pinyin_list = lazy_pinyin(chinese_text, style=Style.TONE)
        # Split every syllable and check each token against tokens.txt
        split_parts = []
        missing_tokens = []
        for pinyin in pinyin_list:
            parts = self._split_pinyin(pinyin)
            for part in parts:
                if part not in self.available_tokens:
                    missing_tokens.append(part)
                split_parts.append(part)
        # Assemble the keyword line
        pinyin_str = " ".join(split_parts)
        keyword_line = f"{pinyin_str} @{chinese_text}"
        # Warn if any token is missing
        if missing_tokens:
            print(
                f"⚠️ Warning: tokens not found in tokens.txt: "
                f"{', '.join(set(missing_tokens))}"
            )
            print("   The generated keyword may not work correctly")
        return keyword_line
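
    # Worked example for "小米小米" (matches the docstring above):
    #   lazy_pinyin("小米小米", style=Style.TONE) -> ["xiǎo", "mǐ", "xiǎo", "mǐ"]
    #   _split_pinyin per syllable -> x / iǎo / m / ǐ / x / iǎo / m / ǐ
    #   resulting line             -> "x iǎo m ǐ x iǎo m ǐ @小米小米"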

    def add_keyword(self, chinese_text: str, append: bool = True) -> bool:
        """Add a wake word to keywords.txt.

        Args:
            chinese_text: Chinese wake word
            append: Append (True) or overwrite (False)

        Returns:
            Whether the keyword was added
        """
        try:
            # Generate the keyword line
            keyword_line = self.chinese_to_keyword_format(chinese_text)
            # Skip duplicates (only relevant when appending; an overwrite
            # replaces the file anyway)
            if append and self.keywords_file.exists():
                with open(self.keywords_file, "r", encoding="utf-8") as f:
                    content = f.read()
                if f"@{chinese_text}" in content:
                    print(f"⚠️ Keyword '{chinese_text}' already exists")
                    return False
            # Write to the file
            mode = "a" if append else "w"
            with open(self.keywords_file, mode, encoding="utf-8") as f:
                f.write(keyword_line + "\n")
            print(f"✅ Added: {keyword_line}")
            return True
        except Exception as e:
            print(f"❌ Failed to add keyword: {e}")
            return False

    def batch_add_keywords(self, chinese_texts: list, overwrite: bool = False):
        """Add multiple wake words.

        Args:
            chinese_texts: List of Chinese wake words
            overwrite: Overwrite the existing file
        """
        if overwrite:
            print("⚠️ Overwriting existing keywords.txt")
        success_count = 0
        for text in chinese_texts:
            text = text.strip()
            if not text:
                continue
            if self.add_keyword(text, append=not overwrite):
                success_count += 1
            # Only the first write may overwrite; every later one appends
            overwrite = False
        print(f"\n📊 Done: added {success_count}/{len(chinese_texts)} keywords")

    def list_keywords(self):
        """List all current keywords."""
        if not self.keywords_file.exists():
            print("⚠️ keywords.txt does not exist")
            return
        print(f"\n📄 Current keywords ({self.keywords_file}):")
        print("-" * 60)
        with open(self.keywords_file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, 1):
                line = line.strip()
                if line and not line.startswith("#"):
                    # Show the Chinese text next to its pinyin tokens
                    if "@" in line:
                        pinyin_part, chinese_part = line.split("@", 1)
                        print(
                            f"{i}. {chinese_part.strip():15s} -> {pinyin_part.strip()}"
                        )
                    else:
                        print(f"{i}. {line}")
        print("-" * 60)
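

# A minimal programmatic usage sketch ("models" is an assumed path; point it at
# whatever directory holds your tokens.txt):
#
#   gen = KeywordGenerator(Path("models"))
#   print(gen.chinese_to_keyword_format("小米小米"))
#   gen.add_keyword("你好小智")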


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(
        description="Wake-word generation tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Add a single keyword
  python keyword_generator.py -a "小米小米"

  # Add several keywords at once
  python keyword_generator.py -b "小米小米" "你好小智" "贾维斯"

  # Import keywords from a file (one Chinese phrase per line)
  python keyword_generator.py -f keywords_input.txt

  # List current keywords
  python keyword_generator.py -l

  # Test the conversion (nothing is written)
  python keyword_generator.py -t "小米小米"
""",
    )
    parser.add_argument(
        "-m", "--model-dir", default="models", help="Model directory (default: models)"
    )
    parser.add_argument("-a", "--add", help="Add a single keyword (Chinese)")
    parser.add_argument(
        "-b", "--batch", nargs="+",
        help="Add multiple keywords (Chinese, space-separated)",
    )
    parser.add_argument("-f", "--file", help="Import keywords from a file (one per line)")
    parser.add_argument("-l", "--list", action="store_true", help="List all current keywords")
    parser.add_argument("-t", "--test", help="Test the conversion (nothing is written)")
    parser.add_argument(
        "--overwrite", action="store_true", help="Overwrite mode (clear existing keywords)"
    )
    args = parser.parse_args()

    # Resolve the model directory
    if Path(args.model_dir).is_absolute():
        model_dir = Path(args.model_dir)
    else:
        # Relative paths are resolved against the project root
        script_dir = Path(__file__).parent
        project_root = script_dir.parent
        model_dir = project_root / args.model_dir
    if not model_dir.exists():
        print(f"❌ Model directory not found: {model_dir}")
        sys.exit(1)
    print(f"🔧 Using model directory: {model_dir}")

    # Create the generator
    generator = KeywordGenerator(model_dir)

    # Dispatch the requested action
    if args.test:
        # Test mode: convert without writing
        print("\n🧪 Test conversion:")
        keyword_line = generator.chinese_to_keyword_format(args.test)
        print(f"  Input:  {args.test}")
        print(f"  Output: {keyword_line}")
    elif args.add:
        # Add a single keyword
        generator.add_keyword(args.add)
    elif args.batch:
        # Add multiple keywords
        generator.batch_add_keywords(args.batch, overwrite=args.overwrite)
    elif args.file:
        # Import keywords from a file
        input_file = Path(args.file)
        if not input_file.exists():
            print(f"❌ File not found: {input_file}")
            sys.exit(1)
        with open(input_file, "r", encoding="utf-8") as f:
            keywords = [line.strip() for line in f if line.strip()]
        print(f"📥 Importing {len(keywords)} keywords from file")
        generator.batch_add_keywords(keywords, overwrite=args.overwrite)
    elif args.list:
        # List keywords
        generator.list_keywords()
    else:
        # Interactive mode
        print("\n🎤 Wake-word generator (interactive mode)")
        print("Enter a Chinese wake word; type 'q' or press Ctrl+C to quit\n")
        try:
            while True:
                chinese = input("Chinese wake word: ").strip()
                if not chinese or chinese.lower() == "q":
                    break
                generator.add_keyword(chinese)
                print()
        except KeyboardInterrupt:
            print("\n\n👋 Goodbye")

    # Finally, show the full keyword list after any modification
    if not args.list and (args.add or args.batch or args.file):
        generator.list_keywords()


if __name__ == "__main__":
    main()