import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    ByT5Tokenizer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
)

# ==========================================
# 1. Hardware Check
# ==========================================
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: No GPU found. Check your SLURM --gres configuration.")

# ==========================================
# 2. Data Loading & Mapping (UPDATED FOR TRUE BENCHMARK)
# ==========================================
train_file_path = "new_dataset.csv"
test_file_path = "benchmark_dataset.csv"  # The new hidden test set!

print(f"Loading training data from {train_file_path}...")
print(f"Loading benchmark data from {test_file_path}...")
df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)

# Map integer labels to text strings for ByT5 text-to-text generation
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}


def _labels_to_text(labels: pd.Series, source: str) -> pd.Series:
    """Translate integer class ids into their label strings via ``label_map``.

    ``Series.map`` turns every value missing from the mapping into NaN without
    complaint, so the result is checked here and the run is aborted early
    instead of failing later during tokenization.

    Args:
        labels: The ``label`` column of one split.
        source: Path of the CSV the column came from (used in the error text).

    Returns:
        A Series of label strings aligned with ``labels``.

    Raises:
        ValueError: If any value in ``labels`` is not a key of ``label_map``.
    """
    label_text = labels.map(label_map)
    unmapped = label_text.isna()
    if unmapped.any():
        bad_values = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(
            f"{source}: 'label' column contains values outside "
            f"{sorted(label_map)}: {bad_values}"
        )
    return label_text


df_train["label_text"] = _labels_to_text(df_train["label"], train_file_path)
df_test["label_text"] = _labels_to_text(df_test["label"], test_file_path)

# Load into datasets WITHOUT randomly splitting them
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

# ==========================================
# 3. Model & Tokenizer Initialization
# ==========================================
model_path = "./byt5_local_weights"
print(f"Loading tokenizer and model from local path: {model_path}...")
tokenizer = ByT5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
model.to(device)
# ==========================================
# 4. Preprocessing Function
# ==========================================
def preprocess_function(examples):
    """Tokenize one batch of sentence pairs together with their label strings.

    Args:
        examples: Batch mapping that provides the ``s1``, ``s2`` and
            ``label_text`` columns as parallel sequences.

    Returns:
        The tokenizer output for the prompts, extended with a ``labels`` entry
        holding the target token ids (pad positions replaced by -100).
    """
    prompts = [
        f"nli premise: {premise} hypothesis: {hypothesis}"
        for premise, hypothesis in zip(examples['s1'], examples['s2'])
    ]

    # Inputs are padded/truncated to a fixed length of 512 ids.
    model_inputs = tokenizer(
        prompts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

    # Targets are padded/truncated to a fixed length of 16 ids.
    encoded_targets = tokenizer(
        text_target=examples['label_text'],
        max_length=16,
        padding="max_length",
        truncation=True,
    )

    # CRITICAL: pad positions become -100 so the loss function ignores them.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in target_ids]
        for target_ids in encoded_targets["input_ids"]
    ]
    return model_inputs


print("Tokenizing datasets...")
# Both splits are tokenized separately, then stripped of their raw columns.
tokenized_train = (
    dataset_train
    .map(preprocess_function, batched=True, num_proc=4)
    .remove_columns(["s1", "s2", "label", "label_text"])
)
tokenized_test = (
    dataset_test
    .map(preprocess_function, batched=True, num_proc=4)
    .remove_columns(["s1", "s2", "label", "label_text"])
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# ==========================================
# 5. Evaluation Metrics
# ==========================================
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them as 3-way labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        A dict with ``"accuracy"`` and ``"f1_macro"`` computed over the integer
        class ids recovered from the decoded strings.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore/padding sentinel, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in BOTH arrays before decoding
    # (the pad token is then dropped by skip_special_tokens=True).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip().lower() for pred in decoded_preds]
    decoded_labels = [label.strip().lower() for label in decoded_labels]

    reverse_label_map = {"entailment": 0, "neutral": 1, "contradiction": 2}

    # Default to neutral (1) if the model hallucinates a random word.
    # NOTE: as a consequence, an unparseable prediction is scored as correct
    # whenever the reference label is "neutral".
    int_preds = [reverse_label_map.get(p, 1) for p in decoded_preds]
    int_labels = [reverse_label_map.get(l, 1) for l in decoded_labels]

    acc = metric_acc.compute(predictions=int_preds, references=int_labels)["accuracy"]
    f1 = metric_f1.compute(
        predictions=int_preds, references=int_labels, average="macro"
    )["f1"]

    return {"accuracy": acc, "f1_macro": f1}
# ==========================================
# 6. Training Configuration & Execution
# ==========================================
training_args = Seq2SeqTrainingArguments(
    output_dir="./byt5-taglish-nli-v2",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=30,
    logging_steps=10,
    metric_for_best_model="f1_macro",
    dataloader_num_workers=4,
    weight_decay=0.01,
    predict_with_generate=True,
    # Only query bf16 support when a CUDA device exists: the script allows a
    # CPU fallback, and on some PyTorch versions the bf16 query initializes
    # CUDA and raises when no GPU is present.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    load_best_model_at_end=True,
    generation_max_length=25,
)

# NOTE(review): tokenized_test (the benchmark split) is the eval_dataset, so it
# drives both early stopping and best-checkpoint selection on f1_macro.
# Confirm this is intended for a held-out benchmark.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,  # training split, passed as-is (no re-splitting)
    eval_dataset=tokenized_test,  # benchmark split, passed as-is (no re-splitting)
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

print("Starting training...")
trainer.train()

# ==========================================
# 7. Save the Final Model
# ==========================================
final_save_path = "./byt5-taglish-nli-final-v2"
trainer.save_model(final_save_path)
tokenizer.save_pretrained(final_save_path)
print(f"Training complete! Model saved to {final_save_path}")