from tokenizers import Tokenizer

# Load a pretrained BERT tokenizer (the file must be downloaded in advance, e.g. for bert-base-uncased)
tokenizer = Tokenizer.from_file("bert-base-uncased-tokenizer.json")

# Encode a piece of text
output = tokenizer.encode("Hello, I love studying AI with BERT!")

print("Tokens:", output.tokens)  # the resulting tokens
print("IDs:", output.ids)        # the corresponding token ids
🌟 Case 2: Train a small tokenizer yourself
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# Use WordPiece as the tokenization model (this is what BERT uses)
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Set the pre-tokenizer (split on whitespace and punctuation)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Trainer
trainer = trainers.WordPieceTrainer(
    vocab_size=1000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

# Train on some texts (just a few throwaway examples here)
texts = [
    "I love natural language processing.",
    "BERT is a transformer model.",
    "Deep learning is fun!",
]
tokenizer.train_from_iterator(texts, trainer)

# Save the tokenizer
tokenizer.save("my-tokenizer.json")

# Use the trained tokenizer
output = tokenizer.encode("I love BERT!")
print("Tokens:", output.tokens)
print("IDs:", output.ids)
🌟 Case 3: Decoding (recovering text from IDs)
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("my-tokenizer.json")

output = tokenizer.encode("BERT makes NLP easier.")
print("IDs:", output.ids)

# Decode back into text
decoded = tokenizer.decode(output.ids)
print("Decoded:", decoded)
🌟 Case 4: Batch processing
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("my-tokenizer.json")

batch = tokenizer.encode_batch([
    "I like AI.",
    "Transformers are powerful models.",
])

for out in batch:
    print(out.tokens, out.ids)
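When feeding a batch to a model you usually also want all sequences padded to the same length. A minimal sketch, assuming the "[PAD]" token from Case 2 is in the vocabulary:

# Pad every sequence in the batch with "[PAD]" and cap the length at 16 tokens
pad_id = tokenizer.token_to_id("[PAD]")
tokenizer.enable_padding(pad_id=pad_id, pad_token="[PAD]")
tokenizer.enable_truncation(max_length=16)

batch = tokenizer.encode_batch(["I like AI.", "Transformers are powerful models."])
for out in batch:
    print(out.ids, out.attention_mask)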