词库系统

词库系统是 Annotate Translate 的高级特性，支持根据词库自动标注页面中的重点词汇，适用于英语学习场景。

功能概述

✅ 多个标准词库 - CET-4/6、TOEFL、IELTS、GRE、考研
✅ 批量扫描标注 - 自动扫描页面并标注词库中的单词
✅ 灵活过滤 - 按词库标签、Collins 星级、词频过滤
✅ 中断支持 - 可随时中止扫描任务
✅ 性能优化 - 缓存、批量操作、并发控制

架构

mermaid

graph TB
    User[用户开启词汇模式] --> VocabService[VocabularyService]
    VocabService --> Provider{词库提供商}

    Provider --> CET[CETVocabularyProvider]
    Provider --> Freq[FrequencyVocabularyProvider]
    Provider --> Unified[UnifiedVocabularyProvider]

    Unified --> Data[(vocabulary-core.json<br/>10万+ 单词)]

    User --> Scanner[AnnotationScanner]
    Scanner --> Extract[提取文本节点]
    Extract --> Tokenize[分词]
    Tokenize --> VocabService
    VocabService --> Filter{shouldAnnotate?}

    Filter -->|是| Translate[TranslationService]
    Filter -->|否| Skip[跳过]

    Translate --> Inject[注入 Ruby 标注]
    Inject --> DOM[更新 DOM]

核心组件

1. VocabularyService

词库服务管理器，负责词库查询和批量检查。

javascript

/**
 * Registry and query front-end for vocabulary providers.
 *
 * Keeps the registered providers, remembers which one is active together with
 * its filter options, and memoizes shouldAnnotate() answers in a bounded
 * least-recently-used cache.
 */
class VocabularyService {
  constructor() {
    this.providers = new Map();      // registered providers, keyed by name
    this.activeProvider = null;      // provider currently answering queries
    this.activeOptions = null;       // options forwarded to the active provider
    this.cache = new Map();          // cacheKey -> boolean, least recently used first
    this.maxCacheSize = 1000;
  }

  /**
   * Register a provider so that setActiveProvider() can look it up by name.
   * Registering an already-used name replaces the previous provider and drops
   * cached answers, because they may have been produced by the old provider.
   * The active provider is not changed by this call.
   * @param {string} name - Lookup name for the provider
   * @param {Object} provider - Provider instance (see BaseVocabularyProvider)
   * @throws {TypeError} If provider is null or undefined
   */
  registerProvider(name, provider) {
    if (provider == null) {
      throw new TypeError(`Vocabulary provider must be an object: ${name}`);
    }

    if (this.providers.has(name)) {
      this.clearCache();
    }
    this.providers.set(name, provider);
  }

  /**
   * Select the provider that answers subsequent queries.
   * The provider is initialized first if its `initialized` flag is not set.
   * @param {string} name - Name the provider was registered under
   * @param {Object} [options] - Filter options forwarded to provider.shouldAnnotate()
   * @throws {Error} If no provider is registered under `name`
   */
  async setActiveProvider(name, options) {
    if (!this.providers.has(name)) {
      throw new Error(`Vocabulary provider not found: ${name}`);
    }

    const provider = this.providers.get(name);

    // Lazily load the provider's data the first time it is activated
    if (!provider.initialized) {
      await provider.initialize();
    }

    this.activeProvider = provider;
    this.activeOptions = options;

    console.log(`[VocabularyService] Active provider: ${name}`, options);
  }

  /**
   * Decide whether a word should be annotated under the active provider/options.
   * @param {string} word - Word to check
   * @param {Object} context - Reserved; currently unused and not forwarded
   * @returns {boolean} false when no provider is active
   */
  shouldAnnotate(word, context = {}) {
    if (!this.activeProvider) {
      return false;
    }

    const normalized = this.activeProvider.normalizeWord(word);

    // The key includes provider name and serialized options, so switching
    // either one can never return an answer computed for another configuration.
    const providerName = this.activeProvider.name;
    const optionsHash = JSON.stringify(this.activeOptions);
    const cacheKey = `${normalized}:${providerName}:${optionsHash}`;

    if (this.cache.has(cacheKey)) {
      const cached = this.cache.get(cacheKey);
      // Re-insert on hit: Map iterates in insertion order, so this moves the
      // entry to the "most recently used" end and keeps eviction truly LRU.
      this.cache.delete(cacheKey);
      this.cache.set(cacheKey, cached);
      return cached;
    }

    const result = this.activeProvider.shouldAnnotate(
      normalized,
      this.activeOptions
    );

    this.addToCache(cacheKey, result);

    return result;
  }

  /**
   * Check several words at once by calling shouldAnnotate() for each.
   * @param {string[]} words - Words to check
   * @returns {Map<string, boolean>} Result per input word (keys are the words as given)
   */
  batchCheck(words) {
    const results = new Map();

    for (const word of words) {
      results.set(word, this.shouldAnnotate(word));
    }

    return results;
  }

  /**
   * Look up metadata for a word through the active provider.
   * @param {string} word
   * @returns {Object|null} null when no provider is active; otherwise whatever
   *   the provider's getMetadata() returns for the normalized word
   */
  getMetadata(word) {
    if (!this.activeProvider) {
      return null;
    }

    const normalized = this.activeProvider.normalizeWord(word);
    return this.activeProvider.getMetadata(normalized);
  }

  /**
   * Store a value in the cache, evicting the least recently used entry when
   * the cache is full. Overwriting an existing key never evicts another entry.
   * @param {string} key
   * @param {boolean} value
   */
  addToCache(key, value) {
    if (this.cache.has(key)) {
      // Delete first so the following set() moves the key to the newest slot
      this.cache.delete(key);
    } else if (this.cache.size >= this.maxCacheSize) {
      const oldestKey = this.cache.keys().next().value;
      this.cache.delete(oldestKey);
    }
    this.cache.set(key, value);
  }

  /**
   * Drop every cached answer.
   */
  clearCache() {
    this.cache.clear();
  }
}

// Shared instance used by the examples in this document
const vocabularyService = new VocabularyService();

2. VocabularyProvider (抽象基类)

javascript

/**
 * Common base for vocabulary providers.
 *
 * Subclasses are expected to override initialize() and shouldAnnotate();
 * the versions defined here only signal that an override is missing.
 */
class BaseVocabularyProvider {
  /**
   * @param {string} name - Provider name, stored on the instance
   */
  constructor(name) {
    this.name = name;
    this.initialized = false;
    this.vocabulary = new Map();
  }

  /**
   * Load the provider's data. Must be overridden.
   * @returns {Promise<void>} Always rejects in the base class
   */
  async initialize() {
    const notImplemented = new Error('Must implement initialize()');
    throw notImplemented;
  }

  /**
   * Decide whether a word should be annotated. Must be overridden.
   * @param {string} word - Word to check
   * @param {Object} options - Filter options
   * @throws {Error} Always, in the base class
   */
  shouldAnnotate(word, options) {
    const notImplemented = new Error('Must implement shouldAnnotate()');
    throw notImplemented;
  }

  /**
   * Canonical lookup form of a word: lower-cased with surrounding whitespace removed.
   * @param {string} word
   * @returns {string}
   */
  normalizeWord(word) {
    const lowered = word.toLowerCase();
    return lowered.trim();
  }

  /**
   * Fetch the stored entry for a word.
   * @param {string} word - Key into the vocabulary map
   * @returns {*} The stored entry, or null when it is missing or falsy
   */
  getMetadata(word) {
    const entry = this.vocabulary.get(word);
    return entry ? entry : null;
  }
}

3. UnifiedVocabularyProvider

推荐使用的提供商，整合了所有词库数据。

javascript

/**
 * Provider backed by the bundled vocabulary-core.json data file.
 *
 * Each vocabulary entry may carry `tags`, `collins` and `frequency`;
 * shouldAnnotate() filters on those three fields.
 */
class UnifiedVocabularyProvider extends BaseVocabularyProvider {
  constructor() {
    super('unified');
  }

  /**
   * Load the vocabulary data file into `this.vocabulary`. Does nothing when
   * the provider is already initialized.
   * @throws {Error} When the file cannot be fetched or parsed (logged, then rethrown)
   */
  async initialize() {
    if (this.initialized) return;

    try {
      const url = chrome.runtime.getURL('src/data/vocabularies/vocabulary-core.json');
      const response = await fetch(url);

      // fetch() resolves for HTTP error statuses too, so check explicitly
      // instead of failing later with an opaque JSON parse error.
      if (!response.ok) {
        throw new Error(`Failed to fetch vocabulary data: HTTP ${response.status}`);
      }

      const data = await response.json();

      // Accept both { meta, words: {...} } and a bare { word: entry } object
      this.vocabulary = new Map(Object.entries(data.words || data));
      this.meta = data.meta || {};

      this.initialized = true;
      console.log(`[UnifiedVocabularyProvider] Loaded ${this.vocabulary.size} words`);

    } catch (error) {
      console.error('[UnifiedVocabularyProvider] Failed to load:', error);
      throw error;
    }
  }

  /**
   * Decide whether a word passes the configured filters.
   *
   * @param {string} word - Word to look up (expected to be normalized already)
   * @param {Object} [options] - Filter options
   * @param {string[]} [options.targetTags=[]] - Tags to match, e.g. ['cet4', 'cet6'];
   *   an empty list disables tag filtering
   * @param {string} [options.mode='any'] - 'any': entry has at least one target tag;
   *   'all': entry has every target tag; 'exact': entry tags equal the target
   *   tags as a set. Unknown values behave like 'any'.
   * @param {number} [options.minCollins=0] - Minimum Collins star rating
   * @param {number} [options.minFrequency=0] - Minimum value of the entry's `frequency` field
   * @returns {boolean} false when the word is unknown or fails any filter
   *
   * NOTE(review): `options.includeBase` is accepted but has no effect here —
   * its intended semantics are not defined in this document.
   */
  shouldAnnotate(word, options = {}) {
    const entry = this.vocabulary.get(word);
    if (!entry) return false;

    const {
      targetTags = [],
      mode = 'any',
      minCollins = 0,
      minFrequency = 0
    } = options ?? {};

    // 1. Tag filter. Entries without a tags array are treated as untagged
    //    rather than crashing the whole scan.
    if (targetTags.length > 0) {
      const entryTags = Array.isArray(entry.tags) ? entry.tags : [];
      if (!this.matchesTags(entryTags, targetTags, mode)) return false;
    }

    // 2. Collins star rating (missing rating counts as 0)
    if ((entry.collins || 0) < minCollins) {
      return false;
    }

    // 3. Frequency threshold (missing frequency counts as 0)
    if ((entry.frequency || 0) < minFrequency) {
      return false;
    }

    return true;
  }

  /**
   * Compare an entry's tags against the requested tags under a matching mode.
   * @param {string[]} entryTags - Tags carried by the vocabulary entry
   * @param {string[]} targetTags - Tags requested by the caller (non-empty)
   * @param {string} mode - 'any' | 'all' | 'exact'
   * @returns {boolean}
   */
  matchesTags(entryTags, targetTags, mode) {
    const owned = new Set(entryTags);
    const wanted = [...new Set(targetTags)];

    switch (mode) {
      case 'all':
        return wanted.every((tag) => owned.has(tag));
      case 'exact':
        return owned.size === wanted.length && wanted.every((tag) => owned.has(tag));
      default:
        return wanted.some((tag) => owned.has(tag));
    }
  }

  /**
   * Build a metadata record for a word.
   * @param {string} word
   * @returns {{word: string, tags: string[], collins: number, frequency: number, level: string}|null}
   *   null when the word is not in the vocabulary
   */
  getMetadata(word) {
    const entry = this.vocabulary.get(word);
    if (!entry) return null;

    return {
      word,
      tags: entry.tags || [],
      collins: entry.collins || 0,
      frequency: entry.frequency || 0,
      level: this.getLevel(entry)
    };
  }

  /**
   * Map an entry to a display level: the first tag found, checked in the
   * order cet4, cet6, toefl, ielts, gre.
   * @param {Object} entry - Vocabulary entry
   * @returns {string} Level label, or 'Unknown' when no known tag is present
   */
  getLevel(entry) {
    if (entry.tags?.includes('cet4')) return 'CET-4';
    if (entry.tags?.includes('cet6')) return 'CET-6';
    if (entry.tags?.includes('toefl')) return 'TOEFL';
    if (entry.tags?.includes('ielts')) return 'IELTS';
    if (entry.tags?.includes('gre')) return 'GRE';
    return 'Unknown';
  }
}

4. AnnotationScanner

页面扫描器，负责提取文本、分词、标注。

javascript

/**
 * Page scanner: collects text nodes under a root element, tokenizes them into
 * English words, asks the vocabulary service which words to annotate,
 * translates those words in batches, and wraps the first usable occurrence of
 * each word in a <ruby> element.
 */
class AnnotationScanner {
  /**
   * @param {Object} vocabularyService - Used via shouldAnnotate(word)
   * @param {Object} translationService - Used via translate(word, 'zh-CN', 'en'),
   *   which must return a promise; the resolved value's `annotationText` is
   *   shown in the <rt> element
   */
  constructor(vocabularyService, translationService) {
    this.vocabularyService = vocabularyService;
    this.translationService = translationService;
    this.abortController = null;
    this.isScanning = false;
    // Scanned Text node -> [{ node, base }]: the pieces that node has been split
    // into so far and the offset (in the originally scanned text) where each starts.
    this.nodeSegments = new WeakMap();
  }

  /**
   * Scan a subtree and annotate vocabulary words in it.
   * Errors (including an abort) are logged, not thrown. A call made while
   * another scan is running logs a warning and returns immediately.
   * @param {Element} rootElement - Root of the subtree to scan
   * @param {Object} [options]
   * @param {number} [options.concurrency=3] - Words translated in parallel per batch
   * @param {number} [options.delay=500] - Pause between batches, in ms
   * @param {number} [options.maxWords=100] - Upper bound on annotated words
   * @returns {Promise<void>}
   */
  async scanAndAnnotate(rootElement, options = {}) {
    if (this.isScanning) {
      console.warn('[AnnotationScanner] Already scanning');
      return;
    }

    this.isScanning = true;
    this.abortController = new AbortController();
    this.nodeSegments = new WeakMap(); // offsets from a previous scan are meaningless now

    const {
      concurrency = 3,
      delay = 500,
      maxWords = 100
    } = options;

    try {
      // 1. Collect candidate text nodes
      console.log('[AnnotationScanner] Extracting text nodes...');
      const textNodes = this.extractTextNodes(rootElement);
      console.log(`[AnnotationScanner] Found ${textNodes.length} text nodes`);

      // 2. Tokenize
      console.log('[AnnotationScanner] Tokenizing...');
      const words = this.extractWords(textNodes);
      console.log(`[AnnotationScanner] Extracted ${words.size} unique words`);

      // 3. Keep only words the vocabulary service wants, up to maxWords
      console.log('[AnnotationScanner] Filtering with vocabulary...');
      const toAnnotate = new Map();

      for (const [word, positions] of words.entries()) {
        if (toAnnotate.size >= maxWords) break;

        if (this.vocabularyService.shouldAnnotate(word)) {
          toAnnotate.set(word, positions);
        }
      }

      console.log(`[AnnotationScanner] ${toAnnotate.size} words to annotate`);

      // 4. Translate and inject annotations
      await this.batchTranslateAndAnnotate(toAnnotate, {
        concurrency,
        delay
      });

      console.log('[AnnotationScanner] Scan complete');

    } catch (error) {
      if (error.name === 'AbortError') {
        console.log('[AnnotationScanner] Scan aborted');
      } else {
        console.error('[AnnotationScanner] Scan failed:', error);
      }
    } finally {
      this.isScanning = false;
      this.abortController = null;
    }
  }

  /**
   * Collect the text nodes under rootElement that are eligible for annotation.
   * Skipped: text inside script/style/noscript/iframe parents, text inside an
   * existing <ruby>, text without a parent element, and whitespace-only text.
   * @param {Element} rootElement
   * @returns {Text[]}
   */
  extractTextNodes(rootElement) {
    const skippedParents = ['script', 'style', 'noscript', 'iframe'];

    const walker = document.createTreeWalker(
      rootElement,
      NodeFilter.SHOW_TEXT,
      {
        acceptNode: (node) => {
          const parent = node.parentElement;
          if (!parent) return NodeFilter.FILTER_REJECT;

          if (skippedParents.includes(parent.tagName.toLowerCase())) {
            return NodeFilter.FILTER_REJECT;
          }

          // closest() also matches the parent itself, so this covers text
          // directly inside <ruby> as well as deeper descendants (e.g. <rt>).
          if (parent.closest('ruby')) {
            return NodeFilter.FILTER_REJECT;
          }

          if (node.textContent.trim().length === 0) {
            return NodeFilter.FILTER_REJECT;
          }

          return NodeFilter.FILTER_ACCEPT;
        }
      }
    );

    const textNodes = [];
    for (let node = walker.nextNode(); node; node = walker.nextNode()) {
      textNodes.push(node);
    }

    return textNodes;
  }

  /**
   * Tokenize text nodes into English words (runs of 2+ ASCII letters).
   * @param {Text[]} textNodes
   * @returns {Map<string, Array<{node: Text, startOffset: number, endOffset: number, originalWord: string}>>}
   *   Lower-cased word -> every occurrence, with offsets into the node's text
   *   as it was at scan time
   */
  extractWords(textNodes) {
    const words = new Map();

    for (const node of textNodes) {
      const text = node.textContent;

      for (const match of text.matchAll(/\b[a-zA-Z]{2,}\b/g)) {
        const originalWord = match[0];
        const word = originalWord.toLowerCase();

        if (!words.has(word)) {
          words.set(word, []);
        }

        words.get(word).push({
          node,
          startOffset: match.index,
          endOffset: match.index + originalWord.length,
          originalWord
        });
      }
    }

    return words;
  }

  /**
   * Translate words in batches and annotate each successfully translated word.
   * A failed translation is logged and that word is skipped.
   * @param {Map<string, Object[]>} words - Word -> positions, as built by extractWords()
   * @param {Object} [options]
   * @param {number} [options.concurrency=3] - Batch size; values below 1 or
   *   non-numeric values are treated as 1
   * @param {number} [options.delay=500] - Pause between batches, in ms
   * @throws {DOMException} AbortError when abort() was called
   */
  async batchTranslateAndAnnotate(words, options = {}) {
    const { concurrency = 3, delay = 500 } = options;
    // A step of 0 (or NaN / negative) would never advance the loop below
    const batchSize = Math.max(1, Math.floor(Number(concurrency)) || 1);
    const wordList = Array.from(words.keys());

    for (let i = 0; i < wordList.length; i += batchSize) {
      this.throwIfAborted();

      const batch = wordList.slice(i, i + batchSize);

      const results = await Promise.all(
        batch.map(word =>
          this.translationService.translate(word, 'zh-CN', 'en')
            .catch(error => {
              console.error(`[AnnotationScanner] Failed to translate "${word}":`, error);
              return null;
            })
        )
      );

      // abort() may have been called while the translations were in flight;
      // do not touch the DOM after the user asked to stop.
      this.throwIfAborted();

      batch.forEach((word, index) => {
        const result = results[index];
        if (result) {
          this.annotateWord(word, words.get(word), result);
        }
      });

      // Pause between batches to stay under translation rate limits
      if (i + batchSize < wordList.length) {
        await new Promise(r => setTimeout(r, delay));
      }
    }
  }

  /**
   * Throw an AbortError if the current scan has been aborted. Does nothing
   * when no scan-level AbortController exists.
   * @throws {DOMException} AbortError
   */
  throwIfAborted() {
    if (this.abortController?.signal.aborted) {
      throw new DOMException('Scan aborted', 'AbortError');
    }
  }

  /**
   * Wrap one occurrence of a word in <ruby class="vocab-annotation"> with the
   * translation in <rt>. Positions are tried in order and the first one that
   * can still be located is annotated; the remaining ones are left alone.
   *
   * The scanned text node is split in place instead of being replaced, and
   * the resulting pieces are tracked in `nodeSegments`, so positions recorded
   * for other words in the same text node stay resolvable.
   *
   * @param {string} word - Lower-cased word (not read here; positions carry the original casing)
   * @param {Object[]} positions - Occurrences from extractWords()
   * @param {Object} translationResult - Its `annotationText` becomes the <rt> text
   */
  annotateWord(word, positions, translationResult) {
    for (const pos of positions) {
      try {
        const { node, startOffset, endOffset, originalWord } = pos;

        // Find the piece of the scanned node that still contains this range
        const segments = this.nodeSegments.get(node) ?? [{ node, base: 0 }];
        const segment = segments.find(
          (piece) =>
            startOffset >= piece.base &&
            endOffset <= piece.base + piece.node.data.length
        );
        if (!segment) continue; // range already annotated, or text shrank

        const target = segment.node;
        const parent = target.parentNode;
        const localStart = startOffset - segment.base;
        const localEnd = endOffset - segment.base;

        // Skip if the node left the DOM or its text no longer matches the scan
        if (!parent || target.data.slice(localStart, localEnd) !== originalWord) {
          continue;
        }

        const ruby = document.createElement('ruby');
        ruby.textContent = originalWord;
        ruby.className = 'vocab-annotation';

        const rt = document.createElement('rt');
        rt.textContent = translationResult.annotationText;
        ruby.appendChild(rt);

        // Split off the tail first so localStart still refers to `target`
        const tail = target.splitText(localEnd);
        const wordNode = target.splitText(localStart);
        parent.replaceChild(ruby, wordNode);

        // `target` now holds only the text before the word; `tail` holds the
        // text after it, which began at endOffset in the scanned text.
        segments.push({ node: tail, base: endOffset });
        this.nodeSegments.set(node, segments);

        // Only the first usable occurrence is annotated
        break;

      } catch (error) {
        console.error('[AnnotationScanner] Failed to annotate:', error);
      }
    }
  }

  /**
   * Request that the running scan stop.
   * @returns {boolean} true if a scan was running and was signalled, false otherwise
   */
  abort() {
    if (this.abortController && this.isScanning) {
      this.abortController.abort();
      return true;
    }
    return false;
  }

  /**
   * @returns {boolean} Whether a scan is currently in progress
   */
  isScanningNow() {
    return this.isScanning;
  }
}


数据结构

词库数据格式

json

{
  "meta": {
    "version": "1.0.0",
    "totalWords": 100000,
    "source": "ECDICT",
    "license": "MIT"
  },
  "words": {
    "abandon": {
      "tags": ["cet4", "cet6", "toefl", "ielts"],
      "collins": 4,
      "frequency": 5234,
      "bnc": 3456,
      "frq": 2345
    },
    "chamber": {
      "tags": ["cet6", "toefl", "ielts", "gre"],
      "collins": 5,
      "frequency": 8901
    }
  }
}

字段说明:

tags - 词库标签数组
collins - Collins 词典星级 (1-5)
frequency - 词频（数字越大越常用）
bnc - BNC 语料库词频
frq - 综合词频

使用示例

基本用法

javascript

// Step 1: create the provider and register it under the name 'unified'
const unifiedProvider = new UnifiedVocabularyProvider();
await vocabularyService.registerProvider('unified', unifiedProvider);

// Step 2: activate it together with the filter options
const filterOptions = {
  targetTags: ['cet4', 'cet6'],  // CET-4/6 words only
  minCollins: 2,                  // Collins rating >= 2
  minFrequency: 1000              // frequency >= 1000
};
await vocabularyService.setActiveProvider('unified', filterOptions);

// Step 3: build a scanner on top of the two services
const scanner = new AnnotationScanner(vocabularyService, translationService);

// Step 4: scan the whole page
const scanOptions = {
  concurrency: 3,    // translate 3 words in parallel
  delay: 500,        // wait 500ms between batches
  maxWords: 50       // annotate at most 50 words
};
await scanner.scanAndAnnotate(document.body, scanOptions);

检查单个单词

javascript

// Activate the provider, restricted to TOEFL / IELTS words
const examTags = ['toefl', 'ielts'];
await vocabularyService.setActiveProvider('unified', { targetTags: examTags });

// Ask whether a single word should be annotated
const shouldAnnotate = vocabularyService.shouldAnnotate('abandon');
console.log(shouldAnnotate); // true (when the word carries one of the target tags)

// Read the word's metadata
const metadata = vocabularyService.getMetadata('abandon');
console.log(metadata);
// {
//   word: 'abandon',
//   tags: ['cet4', 'cet6', 'toefl', 'ielts'],
//   collins: 4,
//   frequency: 5234,
//   level: 'CET-4'
// }

批量检查

javascript

// Check a list of words in one call and print each verdict
const words = ['hello', 'abandon', 'chamber', 'world'];
const results = vocabularyService.batchCheck(words);

results.forEach((shouldAnnotate, word) => {
  console.log(`${word}: ${shouldAnnotate}`);
});

中断扫描

javascript

// Start a scan. The returned promise is not awaited here, so execution
// continues to the listener registration below.
scanner.scanAndAnnotate(document.body);

// Wire the "stop" button to abort(), which reports whether a scan was signalled
const stopButton = document.getElementById('stop-scan');
stopButton.addEventListener('click', () => {
  const wasStopped = scanner.abort();
  if (wasStopped) {
    console.log('Scan aborted');
  }
});

性能优化

1. 缓存策略

javascript

// VocabularyService 使用带容量上限（maxCacheSize = 1000）的查询缓存
// 缓存键: `${normalizedWord}:${providerName}:${JSON.stringify(options)}`

// 示例（当前提供商为 unified）
"abandon:unified:{\"targetTags\":[\"cet4\"]}" -> true
"hello:unified:{\"targetTags\":[\"cet6\"]}" -> false

2. 批量操作

javascript

// Check several words with a single call; the result is a Map<word, boolean>
const words = ['word1', 'word2', 'word3'];
const results = vocabularyService.batchCheck(words);

// Equivalent manual loop (same cost — batchCheck() calls shouldAnnotate() once per word;
// it only saves collecting the results yourself):
// for (const word of words) {
//   vocabularyService.shouldAnnotate(word);
// }

3. 并发控制

javascript

// Bound the number of parallel translation requests to stay under rate limits
const throttling = {
  concurrency: 3,    // at most 3 requests per batch
  delay: 500         // 500ms pause between batches
};
await scanner.scanAndAnnotate(document.body, throttling);

4. 限制标注数量

javascript

// Cap the number of annotations so the page stays readable
const annotationLimit = { maxWords: 50 };  // annotate at most 50 words
await scanner.scanAndAnnotate(document.body, annotationLimit);

扩展性

添加自定义词库

javascript

/**
 * Example provider backed by a custom word list.
 */
class CustomVocabularyProvider extends BaseVocabularyProvider {
  constructor() {
    super('custom');
  }

  /**
   * Load the custom data into the vocabulary map and mark the provider ready.
   * The value returned by loadCustomData() is passed straight to `new Map()`.
   */
  async initialize() {
    const entries = await loadCustomData();
    this.vocabulary = new Map(entries);
    this.initialized = true;
  }

  /**
   * Annotate every word present in the custom vocabulary.
   * @param {string} word - Word to look up
   * @param {Object} options - Not used by this provider
   * @returns {boolean}
   */
  shouldAnnotate(word, options) {
    const isKnown = this.vocabulary.has(word);
    return isKnown;
  }
}

// Make the provider available under the name 'custom'
const customProvider = new CustomVocabularyProvider();
vocabularyService.registerProvider('custom', customProvider);

完整教程 →

总结

词库系统的特点：

灵活过滤 - 多维度条件（标签、星级、词频）
高性能 - 缓存、批量操作、并发控制
可中断 - AbortController 支持
易扩展 - 自定义词库提供商

词库系统 ​

功能概述 ​

架构 ​

核心组件 ​

1. VocabularyService ​

2. VocabularyProvider (抽象基类) ​

3. UnifiedVocabularyProvider ​

4. AnnotationScanner ​

数据结构 ​

词库数据格式 ​

使用示例 ​

基本用法 ​

检查单个单词 ​

批量检查 ​

中断扫描 ​

性能优化 ​

1. 缓存策略 ​

2. 批量操作 ​

3. 并发控制 ​

4. 限制标注数量 ​

扩展性 ​

添加自定义词库 ​

总结 ​

相关文档 ​

词库系统

功能概述

架构

核心组件

1. VocabularyService

2. VocabularyProvider (抽象基类)

3. UnifiedVocabularyProvider

4. AnnotationScanner

数据结构

词库数据格式

使用示例

基本用法

检查单个单词

批量检查

中断扫描

性能优化

1. 缓存策略

2. 批量操作

3. 并发控制

4. 限制标注数量

扩展性

添加自定义词库

总结

相关文档