文本摘要之前缀语言模型-GLM

文本摘要的相关背景内容,请参阅《预训练模型实战之文本摘要》。

1 导入相关包

1
2
3
4
5
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

2 加载数据集

1
2
3
4
5
6
7
8
9
import json
from itertools import islice

# Per the original author's note: train.json holds ~1.4M samples and dev.json
# only ~10k. This demo trains on the first 10,000 lines of train.json, so the
# file is read lazily and reading stops after 10,000 lines instead of loading
# the whole file with readlines().
# The encoding is given explicitly so decoding does not depend on the
# platform default (assumes the dump is UTF-8 -- TODO confirm).
with open('./LCSTS_new/train.json', 'r', encoding='utf-8') as f:
    # One JSON object per line.
    train_data = [json.loads(line) for line in islice(f, 10000)]
ds = Dataset.from_list(train_data)
# Hold out 10% of the samples as the "test" split; the fixed seed keeps the
# split reproducible.
ds = ds.train_test_split(0.1, seed=42)
ds

3 数据处理

1
2
3
4
5
6
7
# NOTE: the name `tokenzier` (sic) is kept because the rest of the file
# refers to the tokenizer by this spelling.
tokenzier = AutoTokenizer.from_pretrained(
    "/data1/model/glm-large-chinese",
    trust_remote_code=True,
)


def process_func(exmaples):
    """Convert a batch of raw samples into GLM training features.

    Args:
        exmaples: A batch (dict of lists) with "content" and "summary"
            columns, as passed by ``Dataset.map(batched=True)``.

    Returns:
        The output of ``tokenzier.build_inputs_for_generation`` for the batch.
    """
    # Prompt layout: task prefix + source text + the tokenizer's mask token.
    prompts = []
    for content in exmaples["content"]:
        prompts.append("文本摘要: \n" + content + tokenzier.mask_token)

    # Every prompt is truncated/padded to exactly 384 tokens.
    encoded = tokenzier(
        prompts,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # build_inputs_for_generation comes from the checkpoint's own tokenizer
    # code (loaded via trust_remote_code); here it receives the summaries as
    # targets with a generation budget of 64 tokens.
    return tokenzier.build_inputs_for_generation(
        encoded,
        targets=exmaples['summary'],
        padding=True,
        max_gen_length=64,
    )


# Tokenize both splits and drop the raw text columns.
tokenized_ds = ds.map(
    process_func,
    batched=True,
    remove_columns=ds["train"].column_names,
)

4 创建模型

1
# Load the GLM checkpoint from the same local path as the tokenizer.
# trust_remote_code=True lets transformers run the modelling code that ships
# with the checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "/data1/model/glm-large-chinese",
    trust_remote_code=True,
)

5 配置训练参数

1
2
3
4
5
6
7
8
# Training configuration.
# Fix: the original keyword was garbled ("gradient_accumulation_文本摘要s");
# the valid argument name is `gradient_accumulation_steps`.
args = Seq2SeqTrainingArguments(
    output_dir="./summary_glm",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # 4 samples per step x 8 accumulated steps = 32 samples per optimizer
    # update on each device.
    gradient_accumulation_steps=8,
    logging_steps=8,
    num_train_epochs=1,
)

6 创建训练器

1
2
3
4
5
6
# Build the trainer. Only a training set is supplied; no evaluation dataset
# is passed here.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    tokenizer=tokenzier,
)

7 模型训练

1
# Start fine-tuning with the trainer configured above.
trainer.train()

8 模型推理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Quick check on a single held-out sample (the last item of the test split).
# Fix: switch to evaluation mode before generating. This cell runs right after
# training, and the file only calls model.eval() in a later cell, so without
# this dropout would still be active while sampling.
model.eval()
input_text = ds["test"][-1]["content"]
# Same prompt layout as in training: task prefix + text + mask token.
inputs = tokenzier("文本摘要: \n" + input_text + tokenzier.mask_token, return_tensors="pt")
inputs = tokenzier.build_inputs_for_generation(inputs, max_gen_length=64)
inputs = inputs.to("cuda")
# Generation stops at the tokenizer's eop token or after 64 new tokens.
# do_sample=True means the output is sampled and may differ between runs.
output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenzier.eop_token_id, do_sample=True)
tokenzier.decode(output[0].tolist())

import torch
# Evaluation mode for the batch of predictions below.
model = model.eval()


def predict_test():
    """Generate one summary per sample in ``ds["test"]``.

    Returns:
        A list of summary strings, in the order of the test split.
    """
    summaries = []
    with torch.inference_mode():
        for sample in ds["test"]:
            # Same prompt layout as in training.
            prompt = "文本摘要: \n" + sample["content"] + tokenzier.mask_token
            features = tokenzier(prompt, return_tensors="pt")
            features = tokenzier.build_inputs_for_generation(features, max_gen_length=64)
            features = features.to("cuda")
            generated = model.generate(
                **features,
                max_new_tokens=64,
                eos_token_id=tokenzier.eop_token_id,
                do_sample=True,
            )
            decoded = tokenzier.decode(generated[0].tolist())
            # Keep only the text after the start-of-piece marker, then remove
            # the end-of-piece marker and surrounding whitespace.
            summary = decoded.split("<|startofpiece|>")[1]
            summary = summary.replace("<|endofpiece|>", "").strip()
            summaries.append(summary)
    return summaries


result = predict_test()

9 模型评估

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from rouge_chinese import Rouge

rouge = Rouge()
# " ".join(text) puts a space between every character, so predictions and
# references are both handed to the scorer as space-separated characters.
docode_preds = list(map(" ".join, result))
decode_labels = list(map(" ".join, ds["test"]["summary"]))
scores = rouge.get_scores(docode_preds, decode_labels, avg=True)
# Keep only the F-score of each metric.
# NOTE: this rebinds `result` from the list of predictions to the score dict,
# as the original code does.
result = {
    metric: scores[metric]["f"]
    for metric in ("rouge-1", "rouge-2", "rouge-l")
}
print(result)
#{'rouge-1': 0.2986737414721831,
#'rouge-2': 0.1592513373476627,
#'rouge-l': 0.255562709540291}