1. Basic Usage
Importing the package
from transformers import AutoTokenizer
Loading a tokenizer
# Load directly from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# Load from a local directory
tokenizer = AutoTokenizer.from_pretrained("./bert_chinese_tokenizer")
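The local directory used in the second call has to exist first; a minimal sketch of producing it is to save a tokenizer that was loaded from the Hub with save_pretrained (the path ./bert_chinese_tokenizer is simply the one reused above):

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# Writes vocab.txt, tokenizer_config.json, etc. into the directory,
# so later runs can load the tokenizer without hitting the Hub
tokenizer.save_pretrained("./bert_chinese_tokenizer")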
Using the tokenizer
sen = "小鱼吃猫博客网站"
tokens = tokenizer.tokenize(sen)
print(tokens)
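Because bert-base-chinese uses a character-level vocabulary for Chinese text, tokens comes back as roughly one token per character of sen.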
2. Advanced Usage
Index conversion
# tokens -> ids
ids = tokenizer.convert_tokens_to_ids(tokens)

# ids -> tokens
tokens = tokenizer.convert_ids_to_tokens(ids)

# tokens -> string
str_sen = tokenizer.convert_tokens_to_string(tokens)
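Putting the three conversions together, a quick round trip (reusing sen and tokenizer from above) reconstructs a readable sentence; note that for Chinese BERT the tokens are joined with spaces, so the result may not match the original spacing exactly:

tokens = tokenizer.tokenize(sen)
ids = tokenizer.convert_tokens_to_ids(tokens)
tokens_back = tokenizer.convert_ids_to_tokens(ids)
str_sen = tokenizer.convert_tokens_to_string(tokens_back)
print(ids)
print(str_sen)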
Convenience methods
# encode: tokenize + convert_tokens_to_ids + add the special tokens, in one call
ids = tokenizer.encode(sen, add_special_tokens=True)

# decode: map ids back to a string; special tokens such as [CLS]/[SEP] are kept here
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
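As a sanity check that encode really is just tokenize plus convert_tokens_to_ids (apart from the special tokens), the two routes can be compared directly; without special tokens they should give identical ids:

assert tokenizer.encode(sen, add_special_tokens=False) == \
       tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sen))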
Padding and truncation
# Pad to a fixed length of 15 with the [PAD] token
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
# Truncate to at most 5 tokens ([CLS] and [SEP] count toward the limit)
ids = tokenizer.encode(sen, max_length=5, truncation=True)
str_sen = tokenizer.decode(ids, skip_special_tokens=True)
Other model inputs
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids
[101, 2207, 7824, 1391, 4344, 1300, 2145, 5381, 4991, 102, 0, 0, 0, 0, 0]
# 0 is BERT's [PAD] id, so mask out the padded positions
attention_mask = [1 if idx != 0 else 0 for idx in ids]
# A single sentence belongs entirely to segment 0
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids
([101, 2207, 7824, 1391, 4344, 1300, 2145, 5381, 4991, 102, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Quick calls
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs
{'input_ids': [101, 2207, 7824, 1391, 4344, 1300, 2145, 5381, 4991, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs
{'input_ids': [101, 2207, 7824, 1391, 4344, 1300, 2145, 5381, 4991, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}
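To feed these inputs straight into a PyTorch model, the same call can return tensors instead of Python lists; a minimal sketch, assuming torch is installed:

inputs = tokenizer(sen, padding="max_length", max_length=15, return_tensors="pt")
# Each value is now a torch.Tensor with a leading batch dimension of 1
print(inputs["input_ids"].shape)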
Batch processing
sens = [
    "弱小的我也有大梦想",
    "有梦想谁都了不起",
    "追逐梦想的心,比梦想本身,更可贵",
]
res = tokenizer(sens)
res
{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
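In the output above the three sequences still have different lengths; to get a rectangular batch for a model, padding can be requested in the same call (the max_length of 20 below is arbitrary):

res = tokenizer(sens, padding=True)  # pad to the longest sequence in the batch
res = tokenizer(sens, padding="max_length", max_length=20, truncation=True)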
Fast / Slow Tokenizer
fast_tokenizer = AutoTokenizer.from_pretrained("./bert_chinese_tokenizer")
fast_tokenizer
slow_tokenizer = AutoTokenizer.from_pretrained("./bert_chinese_tokenizer", use_fast=False)
slow_tokenizer
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs
{'input_ids': [101, 2207, 7824, 1391, 4344, 1300, 2145, 5381, 4991, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (0, 0)]}
# Offset mapping is only implemented for fast tokenizers,
# so it must be left off (or set to False) here
inputs = slow_tokenizer(sen, return_offsets_mapping=False)
inputs
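The practical difference is speed: the fast tokenizer is backed by the Rust tokenizers library, while the slow one is pure Python. A rough micro-benchmark sketch (the sentence and repeat count are arbitrary):

import time

texts = ["弱小的我也有大梦想"] * 10000

start = time.time()
for t in texts:
    fast_tokenizer(t)
print("fast:", time.time() - start)

start = time.time()
for t in texts:
    slow_tokenizer(t)
print("slow:", time.time() - start)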
Loading special tokenizers
# Tokenizers whose implementation lives in the model repo need trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer.decode(tokenizer.encode(sen))
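Such a tokenizer can also be saved and reloaded locally; a sketch under the assumption that trust_remote_code=True is still required when loading it back (the local path below is just an example):

tokenizer.save_pretrained("./chatglm_tokenizer")  # hypothetical local path
tokenizer = AutoTokenizer.from_pretrained("./chatglm_tokenizer", trust_remote_code=True)
tokenizer.decode(tokenizer.encode(sen))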