# File viewer header: 164 lines | 6.0 KiB | Python
import pandas as pd
|
|
import torch
|
|
import numpy as np
|
|
import evaluate
|
|
from datasets import Dataset
|
|
from transformers import (
|
|
ByT5Tokenizer,
|
|
T5ForConditionalGeneration,
|
|
Seq2SeqTrainer,
|
|
Seq2SeqTrainingArguments,
|
|
DataCollatorForSeq2Seq,
|
|
EarlyStoppingCallback
|
|
)
|
|
|
|
# ==========================================
# 1. Hardware Check
# ==========================================
# Pick the compute device once; everything below reads `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    # The run continues on CPU; the message points at the SLURM GPU request.
    print("WARNING: No GPU found. Check your SLURM --gres configuration.")
|
|
|
|
# ==========================================
# 2. Data Loading & Mapping (UPDATED FOR TRUE BENCHMARK)
# ==========================================
train_file_path = "new_dataset.csv"
test_file_path = "benchmark_dataset.csv"  # The new hidden test set!

# Map integer labels to text strings for ByT5 text-to-text generation
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}


def _load_labelled_csv(csv_path, split_name):
    """Read one CSV split and attach its textual ``label_text`` column.

    Args:
        csv_path: Path of the CSV file to read.
        split_name: Human-readable split name used in log and error messages.

    Returns:
        The loaded DataFrame with an extra ``label_text`` column holding the
        ``label_map`` string for every row.

    Raises:
        ValueError: If an ``s1``/``s2``/``label`` column is missing, or if any
            ``label`` value is not a key of ``label_map``.
    """
    print(f"Loading {split_name} data from {csv_path}...")
    frame = pd.read_csv(csv_path)

    missing_columns = [col for col in ("s1", "s2", "label") if col not in frame.columns]
    if missing_columns:
        raise ValueError(
            f"{csv_path} ({split_name}) is missing required column(s): {missing_columns}"
        )

    frame["label_text"] = frame["label"].map(label_map)

    # Series.map yields NaN for keys absent from label_map. Fail here, with
    # the offending values, instead of handing NaN targets to the tokenizer.
    unmapped = frame["label_text"].isna()
    if unmapped.any():
        bad_values = sorted(frame.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"{csv_path} ({split_name}) has {int(unmapped.sum())} row(s) with labels "
            f"outside {sorted(label_map)}: {bad_values}"
        )
    return frame


df_train = _load_labelled_csv(train_file_path, "training")
df_test = _load_labelled_csv(test_file_path, "benchmark")

# Load into datasets WITHOUT randomly splitting them
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)
|
|
|
|
# ==========================================
# 3. Model & Tokenizer Initialization
# ==========================================
# Both the tokenizer and the weights are read from the same local directory.
model_path = "./byt5_base_local_weights"

print(f"Loading tokenizer and model from local path: {model_path}...")

tokenizer = ByT5Tokenizer.from_pretrained(model_path)

# Load the weights, then move the module onto the device chosen earlier.
model = T5ForConditionalGeneration.from_pretrained(model_path)
model.to(device)
|
|
|
|
# ==========================================
|
|
# 4. Preprocessing Function
|
|
# ==========================================
|
|
def preprocess_function(examples):
    """Convert a batch of raw rows into padded model features.

    Args:
        examples: Batch mapping with the columns ``s1``, ``s2`` and
            ``label_text`` (one list entry per row).

    Returns:
        The tokenizer output for the prompts, with an added ``labels`` entry
        holding the tokenized label strings where pad ids are replaced by -100.
    """
    # One prompt per (s1, s2) pair, in row order.
    prompts = []
    for context, statement in zip(examples['s1'], examples['s2']):
        prompts.append(
            f"Context: {context} Statement: {statement} Question: Does the context entail, contradict, or remain neutral to the statement? Answer:"
        )

    # Inputs are padded/truncated to exactly 512 ids.
    features = tokenizer(prompts, max_length=512, padding="max_length", truncation=True)

    # Targets are padded/truncated to exactly 16 ids.
    targets = tokenizer(text_target=examples['label_text'], max_length=16, padding="max_length", truncation=True)

    # CRITICAL: padding positions become -100 so the loss function ignores them.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in targets["input_ids"]
    ]
    return features
|
|
|
|
print("Tokenizing datasets...")

# Run the same map + column-strip pipeline over each split independently
# (train first, then benchmark); the two splits are never mixed.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True, num_proc=4).remove_columns(
        ["s1", "s2", "label", "label_text"]
    )
    for split in (dataset_train, dataset_test)
)

# Collator used by the trainer to assemble batches.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)
|
|
|
|
# ==========================================
# 5. Evaluation Metrics
# ==========================================
# Load both metric objects once at module level so they are reused across
# evaluation calls.
metric_acc, metric_f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))
|
|
|
|
def compute_metrics(eval_preds):
    """Decode generated label words and score them as a 3-class problem.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        Dict with ``"accuracy"`` and ``"f1_macro"`` computed over the integer
        class ids obtained from the decoded strings.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore marker, not a token id, so it cannot be decoded.
    # Labels carry it by construction. Predictions may carry it as well when
    # generated batches are padded together (the official Hugging Face seq2seq
    # examples apply this same replacement), so both are sanitised here.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Normalise whitespace and case before the dictionary lookup.
    decoded_preds = [pred.strip().lower() for pred in decoded_preds]
    decoded_labels = [label.strip().lower() for label in decoded_labels]

    reverse_label_map = {"entailment": 0, "neutral": 1, "contradiction": 2}

    # Default to neutral (1) if the model hallucinates a random word.
    # NOTE(review): this counts any unparseable output as a "neutral"
    # prediction, which can flatter the neutral class. Kept as authored so
    # scores stay comparable with earlier runs.
    int_preds = [reverse_label_map.get(text, 1) for text in decoded_preds]
    int_labels = [reverse_label_map.get(text, 1) for text in decoded_labels]

    acc = metric_acc.compute(predictions=int_preds, references=int_labels)["accuracy"]
    f1 = metric_f1.compute(predictions=int_preds, references=int_labels, average="macro")["f1"]

    return {"accuracy": acc, "f1_macro": f1}
|
|
|
|
# ==========================================
# 6. Training Configuration & Execution
# ==========================================
training_args = Seq2SeqTrainingArguments(
    output_dir="./byt5-taglish-nli-v3",
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # below relies on the two strategies matching.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,

    # Scheduler settings: cosine decay after a 10% warmup.
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=30,
    logging_steps=10,
    # Key returned by compute_metrics that ranks checkpoints.
    metric_for_best_model="f1_macro",

    dataloader_num_workers=4,
    weight_decay=0.01,
    # Evaluation uses generate() so compute_metrics receives token ids.
    predict_with_generate=True,
    # Only query bf16 support when CUDA is actually present: this script has
    # an explicit CPU fallback, and on some PyTorch versions
    # is_bf16_supported() touches the CUDA device and raises without one.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    load_best_model_at_end=True,

    # Upper bound on generated length during evaluation.
    generation_max_length=25,
)
|
|
|
|
# Wire the model, data, metrics and stopping rule into the trainer.
# NOTE(review): eval_dataset is the benchmark split, so the early-stopping and
# best-checkpoint decisions are made on it. Confirm this is intended, or hold
# out a separate validation split.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,  # <--- FED EXACTLY
    eval_dataset=tokenized_test,  # <--- FED EXACTLY
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 10.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

print("Starting training...")
trainer.train()
|
|
|
|
# ==========================================
# 7. Save the Final Model
# ==========================================
# Write the model and the tokenizer files into the same directory.
final_save_path = "./byt5-taglish-nli-final-v3"
trainer.save_model(output_dir=final_save_path)
tokenizer.save_pretrained(save_directory=final_save_path)
print(f"Training complete! Model saved to {final_save_path}")
|
|
|