classBPE_Tokenizer: def__init__(self,vocab: dict[int, bytes], merges: list[tuple[bytes, bytes]], special_tokens: list[str] | None = None): *"""* * Construct a tokenizer from a given vocabulary, list of merges, and (optionally) a list of special tokens.* * Args:* * vocab: dict[int, bytes]* * merges: list[tuple[bytes, bytes]]* * special_tokens: list[str] | None = None* * """* * *pass
@classmethod deffrom_files(cls, vocab_filepath: str, merges_filepath: str, special_tokens=None): *"""* * Class method that constructs and return a Tokenizer from a serialized vocabulary and list of merges (in the same format that your BPE training code output) and (optionally) a list of special tokens.* * Args:* * vocab_filepath: str* * merges_filepath: str* * special_tokens: list[str] | None = None* * """* * *pass
@staticmethod defencode(self, text: str) -> list[int]: *"""* * Encode an input text into a sequence of token IDs.* * """* * *pass
defencode_iterable(self, iterable: Iterable[str]) -> Iterator[int]: *"""* * Given an iterable of strings (e.g., a Python file handle), return a generator that lazily yields token IDs. This is required for memory-efficient tokenization of large files that we cannot directly load into memory.* * """* * *pass
defdecode(self, ids: list[int]) -> str: *"""* * Decode a sequence of token IDs into text.* * """*
@classmethod
def from_files(cls, vocab_filepath: str, merges_filepath: str, special_tokens=None):
    """
    Build a tokenizer from a pickled vocabulary and a pickled list of merges.

    Special tokens that are missing from the loaded vocabulary are appended
    to it with fresh ids before the tokenizer is constructed.

    Args:
        vocab_filepath: path to a pickle file holding a dict[int, bytes].
        merges_filepath: path to a pickle file holding a
            list[tuple[bytes, bytes]].
        special_tokens: optional list[str] of special tokens; None (the
            default) means no special tokens are added.

    Returns:
        The result of cls(vocab, merges, special_tokens=special_tokens).
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load vocab/merges files produced by your own training run.
    with open(vocab_filepath, "rb") as vf, open(merges_filepath, "rb") as mf:
        vocab: dict[int, bytes] = pickle.load(vf)
        merges: list[tuple[bytes, bytes]] = pickle.load(mf)

    # Build the membership set once instead of scanning vocab.values() for
    # every special token.
    known_tokens = set(vocab.values())
    # max(id) + 1 equals len(vocab) for the usual contiguous 0..n-1 ids, but
    # never overwrites an existing entry when the ids have gaps.
    next_id = max(vocab, default=-1) + 1
    # `or ()` keeps the default special_tokens=None from raising TypeError.
    for token in special_tokens or ():
        token_bytes = token.encode("utf-8")
        if token_bytes not in known_tokens:
            vocab[next_id] = token_bytes
            known_tokens.add(token_bytes)
            next_id += 1

    return cls(vocab, merges, special_tokens=special_tokens)
def encode_iterable(self, iterable: Iterable[str]) -> Iterator[int]:
    """
    Lazily encode an iterable of strings (e.g. a Python file handle).

    Each chunk is passed to self.encode only when the consumer asks for more
    ids, so the whole input never has to be held in memory at once.

    Args:
        iterable: source of text chunks, consumed one item at a time.

    Yields:
        The token ids of each chunk, in input order.
    """
    for chunk in iterable:
        yield from self.encode(chunk)
init()
先来考虑一下编码的全过程:假设现在输入文本是the cat ate,词表vocab是{0: b' ', 1: b'a', 2: b'c', 3: b'e', 4: b'h', 5: b't', 6: b'th', 7: b' c', 8: b' a', 9: b'the', 10: b' at'},合并项列表merges是[(b't', b'h'), (b' ', b'c'), (b' ', b'a'), (b'th', b'e'), (b' a', b't')]。
def encode(self, text: str) -> list[int]:
    """
    Encode an input text into a sequence of token IDs.

    The text is first split around special tokens. A special token is looked
    up directly in self.bytes_to_id; every other block is pre-tokenized with
    a regex, each pre-token is broken into single bytes, merged by
    self.get_best_merge, and the resulting byte pieces are mapped to ids.

    Args:
        text: the string to encode.

    Returns:
        The token ids in text order; an empty list for empty input.

    Raises:
        KeyError: if a produced byte sequence is missing from
            self.bytes_to_id.
    """
    if not text:
        return []

    # Pre-tokenization pattern: contractions, letter runs, digit runs,
    # punctuation runs and whitespace.
    # NOTE(review): \p{L} / \p{N} are not supported by the standard-library
    # `re`; this assumes `re` is bound to the third-party `regex` module
    # (e.g. `import regex as re`) -- confirm against the file's imports.
    pattern = re.compile(r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
    token_ids: list[int] = []

    # Split first so that special tokens survive as standalone blocks.
    blocks = BPE_Tokenizer.split_by_special(text, special_tokens=self.special_tokens, drop_special=False)
    for block in blocks:
        if self.special_tokens and block in self.special_tokens:
            # Special token: map its UTF-8 bytes straight to an id.
            token_ids.append(self.bytes_to_id[block.encode("utf-8")])
            continue

        # Ordinary text: pre-tokenize, then merge each pre-token separately.
        for match in pattern.finditer(block):
            token_bytes = [bytes([b]) for b in match.group(0).encode("utf-8")]
            token_ids.extend(self.bytes_to_id[piece] for piece in self.get_best_merge(token_bytes))

    return token_ids
还是以过程讲方法:假设现在输入的text是Hello<|endoftext|>How are you
通过调用split_by_special可以将special_token从text中单独区分出来。现在的blocks就是["Hello","<|endoftext|>","How are you"]。
def decode(self, ids: list[int]) -> str:
    """
    Decode a sequence of token IDs into text.

    The byte strings stored in self.vocab for each id are concatenated and
    decoded as UTF-8. Bytes that do not form valid UTF-8 are replaced with
    U+FFFD because of errors="replace", so decoding never raises on a
    sequence that stops in the middle of a multi-byte character.

    Args:
        ids: token ids to decode.

    Returns:
        The decoded string; an empty string for an empty id list.

    Raises:
        KeyError: if an id is not present in self.vocab.
    """
    raw = b"".join(self.vocab[token_id] for token_id in ids)
    return raw.decode("utf-8", errors="replace")
可以看到,这里顺带满足了文档中对非法字节的替换(replace)要求:只需在调用 bytes.decode 时传入 errors='replace',无法按 UTF-8 解码的字节就会被替换为 U+FFFD。
test_tokenizer
在终端启动测试
1
uv run pytest tests/test_tokenizer.py
结果如下:
test_encode_iterable_memory_usage 这个测试耗时较长(大约 8s),整个测试过程的大部分时间都花在它上面。另外,test_encode_memory_usage 的结果为 XFAIL 是正常的,因为要求里就写着“Tokenizer.encode is expected to take more memory than allotted (1MB).”,这恰恰说明你没有作弊。