Disabling subword tokenization

To bypass BERT's WordPiece (subword) step, tokenize with the tokenizer's built-in basic_tokenizer, which splits only on whitespace and punctuation, then convert the resulting whole-word tokens to IDs yourself:
from transformers import BertTokenizer
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Input text
text = "I love natural language processing."
# Tokenize manually with the basic tokenizer, which splits only on
# whitespace and punctuation, skipping the WordPiece (subword) step
tokens = ['[CLS]'] + tokenizer.basic_tokenizer.tokenize(text) + ['[SEP]']
# Convert tokens to IDs; any token missing from the vocabulary maps to [UNK]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Print the encoded token IDs
print(token_ids)
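
For comparison, here is a minimal sketch of how this differs from the default pipeline, where WordPiece is still active. The example sentence is an illustrative assumption; the exact splits depend on the bert-base-uncased vocabulary:

# Default pipeline: tokenize() applies WordPiece after basic tokenization,
# so rare words may split into pieces prefixed with '##'
text = "I love tokenization internals."
print(tokenizer.tokenize(text))
# Basic tokenizer only: whole words, never '##' pieces; words absent from
# the vocabulary become [UNK] once converted to IDs
basic_tokens = tokenizer.basic_tokenizer.tokenize(text)
print(basic_tokens)
print(tokenizer.convert_tokens_to_ids(basic_tokens))

The trade-off is that every out-of-vocabulary word collapses to a single [UNK] ID, losing the partial information that subword pieces would preserve, which is exactly the problem WordPiece was designed to solve.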