文本摘要之序列到序列模型-t5

文本摘要的相关背景知识,请查阅《预训练模型之文本摘要》一文。

1 导入相关包

1
2
3
4
5
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

2 加载数据集

1
2
3
4
5
6
7
import json
from itertools import islice

# train.json holds about 1.4M examples while dev.json holds only about 10k.
# Only the first 100,000 training lines are used, so the file is consumed
# lazily with islice instead of reading every line into memory first.
# The encoding is given explicitly because the data is Chinese text and the
# platform default encoding is not guaranteed to be UTF-8.
with open('./LCSTS_new/train.json', 'r', encoding='utf-8') as f:
    # Each line is parsed as one standalone JSON document.
    train_data = [json.loads(line) for line in islice(f, 100000)]
ds = Dataset.from_list(train_data)
# Hold out 10% of the loaded examples as the "test" split; the fixed seed
# keeps the split reproducible across runs.
ds = ds.train_test_split(0.1, seed=42)

3 数据处理

1
2
3
4
5
6
7
8
# Load the tokenizer from the local mengzi-t5-base checkpoint directory.
# NOTE: the variable name is spelled "tokenzier" throughout this file.
tokenzier = AutoTokenizer.from_pretrained("/data1/model/mengzi-t5-base")
def process_func(exmaples):
    """Tokenize one batch of examples for seq2seq training.

    Every entry of ``exmaples["content"]`` gets the task prefix prepended and
    is tokenized with truncation at ``max_length=384``. The entries of
    ``exmaples["summary"]`` are tokenized as targets with truncation at
    ``max_length=64``, and their ``input_ids`` are stored under ``"labels"``.

    Returns the tokenizer output for the inputs with the ``"labels"`` entry
    added.
    """
    prompts = []
    for text in exmaples["content"]:
        prompts.append("文本摘要: \n" + text)

    model_inputs = tokenzier(prompts, truncation=True, max_length=384)
    targets = tokenzier(
        text_target=exmaples["summary"],
        truncation=True,
        max_length=64,
    )
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs
# Apply process_func to the split dataset; batched=True hands process_func
# a batch of examples per call rather than a single example.
tokenized_ds = ds.map(process_func, batched=True)

4 创建模型

1
# Load the seq2seq model from the same local checkpoint directory that the
# tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained("/data1/model/mengzi-t5-base")

5 创建评估函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import numpy as np
from rouge_chinese import Rouge

rouge = Rouge()


def compute_metric(evalPred):
    """Compute ROUGE-1/2/L F-scores for generated summaries.

    ``evalPred`` unpacks into ``(predictions, labels)`` arrays of token ids.
    Both are decoded to text, each decoded string is split into
    space-separated characters, and the averaged ROUGE F-scores are returned
    under the keys ``"rouge-1"``, ``"rouge-2"`` and ``"rouge-l"``.
    """
    predictions, labels = evalPred
    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenzier.pad_token_id
    # -100 is not a valid token id and cannot be decoded. It marks ignored
    # label positions and can also appear in the predictions when generated
    # sequences of different lengths are padded together, so both arrays are
    # mapped back to the pad token before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_preds = tokenzier.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenzier.batch_decode(labels, skip_special_tokens=True)

    # Chinese text has no whitespace word boundaries; joining the characters
    # of each string with spaces makes every character its own token.
    decode_preds = [" ".join(p) for p in decode_preds]
    decode_labels = [" ".join(l) for l in decode_labels]

    # NOTE(review): an empty decoded prediction may make get_scores raise;
    # confirm against the rouge_chinese version in use.
    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"],
    }

6 配置训练参数

1
2
3
4
5
6
7
8
9
10
11
# Training configuration for the seq2seq trainer.
args = Seq2SeqTrainingArguments(
    output_dir="./t5-base",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    # Gradients are accumulated over 8 steps before each optimizer update.
    gradient_accumulation_steps=8,
    logging_steps=8,
    # Evaluate and save once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="rouge-l",
    # metric_for_best_model only takes effect together with this flag: the
    # checkpoint with the best ROUGE-L is reloaded when training finishes.
    load_best_model_at_end=True,
    # Run generation during evaluation so the metric function receives
    # generated token ids.
    predict_with_generate=True,
)

7 创建训练器

1
2
3
4
5
6
7
8
9
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenzier),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenzier,
    compute_metrics=compute_metric,
)

8 模型训练

1
# Start training with the trainer configured above.
trainer.train()

9 模型推理

1
2
3
4
5
from transformers import pipeline

# Wrap the trained model and tokenizer in a text2text-generation pipeline.
# device=0 is the first GPU visible to this process.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenzier, device=0)
# The prompt prefix must be byte-identical to the one used when the training
# data was tokenized ("文本摘要: \n", ASCII colon followed by a space).
# A different prefix gives the model an input format it was never trained on.
# do_sample=True makes the output vary from run to run.
pipe("文本摘要: \n" + ds["test"][-1]["content"], max_length=64, do_sample=True)

10 模型评测

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from rouge_chinese import Rouge
def gen_result(arg):
    """Generate a summary for one source text with the module-level ``pipe``.

    ``arg`` is the raw article text. The task prefix is prepended, the
    pipeline is called with ``max_length=64`` and sampling enabled, and the
    ``'generated_text'`` field of the first result is returned.
    """
    # The prefix must be byte-identical to the one used when the training
    # data was tokenized ("文本摘要: \n", ASCII colon followed by a space);
    # otherwise the model is evaluated on an input format it never saw.
    # NOTE(review): do_sample=True makes the generated text, and any score
    # computed from it, vary between runs.
    return pipe("文本摘要: \n" + arg, max_length=64, do_sample=True)[0]['generated_text']

from tqdm import tqdm

# Read the dev file and parse the first 100 lines, one JSON document each.
with open('./LCSTS_new/dev.json', 'r', encoding='utf-8') as f:
    lines = f.readlines()
dev_data = list(map(json.loads, lines[:100]))

# Both references and predictions are split into space-separated characters
# before they are scored.
true_list = [' '.join(sample['summary']) for sample in dev_data]
# tqdm shows progress while a summary is generated for each example.
pred_list = [' '.join(gen_result(sample['content'])) for sample in tqdm(dev_data)]

rouge = Rouge()
scores = rouge.get_scores(pred_list, true_list, avg=True)
# Keep only the F-score of each ROUGE variant.
result = {name: scores[name]["f"] for name in ("rouge-1", "rouge-2", "rouge-l")}
print(result)