first commit

2026-05-21 01:21:51 +08:00
commit 699a04a3b2
26 changed files with 36741 additions and 0 deletions
@@ -0,0 +1,163 @@
+import pandas as pd
+import torch
+import numpy as np
+import evaluate
+from datasets import Dataset
+from transformers import (
+    ByT5Tokenizer, 
+    T5ForConditionalGeneration, 
+    Seq2SeqTrainer,                  
+    Seq2SeqTrainingArguments,        
+    DataCollatorForSeq2Seq,
+    EarlyStoppingCallback
+)
+
+# ==========================================
+# 1. Hardware Check
+# ==========================================
+device = "cpu"
+if torch.cuda.is_available():
+    device = "cuda"
+    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
+else:
+    print("WARNING: No GPU found. Check your SLURM --gres configuration.")
+
+# ==========================================
+# 2. Data Loading & Mapping (UPDATED FOR TRUE BENCHMARK)
+# ==========================================
+train_file_path = "new_dataset.csv" 
+test_file_path = "benchmark_dataset.csv" # The new hidden test set!
+
+print(f"Loading training data from {train_file_path}...")
+print(f"Loading benchmark data from {test_file_path}...")
+
+df_train = pd.read_csv(train_file_path)
+df_test = pd.read_csv(test_file_path)
+
+# Map integer labels to text strings for ByT5 text-to-text generation
+label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
+df_train['label_text'] = df_train['label'].map(label_map)
+df_test['label_text'] = df_test['label'].map(label_map)
+
+# Load into datasets WITHOUT randomly splitting them
+dataset_train = Dataset.from_pandas(df_train)
+dataset_test = Dataset.from_pandas(df_test)
+
+# ==========================================
+# 3. Model & Tokenizer Initialization
+# ==========================================
+model_path = "./byt5_base_local_weights"
+
+print(f"Loading tokenizer and model from local path: {model_path}...")
+
+tokenizer = ByT5Tokenizer.from_pretrained(model_path)
+model = T5ForConditionalGeneration.from_pretrained(model_path) 
+model.to(device)
+
+# ==========================================
+# 4. Preprocessing Function 
+# ==========================================
+def preprocess_function(examples):
+    inputs = [f"Context: {s1} Statement: {s2} Question: Does the context entail, contradict, or remain neutral to the statement? Answer:" for s1, s2 in zip(examples['s1'], examples['s2'])]
+    
+    # Explicitly enforce padding so all inputs are a perfectly uniform 512-length tensor
+    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
+    
+    # Explicitly enforce padding so all labels are a perfectly uniform 16-length tensor
+    labels = tokenizer(text_target=examples['label_text'], max_length=16, padding="max_length", truncation=True)
+    
+    # CRITICAL: Replace the padding blanks with -100 so the loss function ignores them
+    labels_with_ignore_index = []
+    for label in labels["input_ids"]:
+        labels_with_ignore_index.append([l if l != tokenizer.pad_token_id else -100 for l in label])
+        
+    model_inputs["labels"] = labels_with_ignore_index
+    return model_inputs
+
+print("Tokenizing datasets...")
+# Process and strip columns from BOTH datasets separately
+tokenized_train = dataset_train.map(preprocess_function, batched=True, num_proc=4).remove_columns(["s1", "s2", "label", "label_text"])
+tokenized_test = dataset_test.map(preprocess_function, batched=True, num_proc=4).remove_columns(["s1", "s2", "label", "label_text"])
+
+data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+
+# ==========================================
+# 5. Evaluation Metrics
+# ==========================================
+# Metric objects are created once here and reused by compute_metrics on every evaluation.
+# NOTE(review): evaluate.load() may need to fetch the metric definition - confirm it is
+# available on the compute node.
+metric_acc = evaluate.load("accuracy")
+metric_f1 = evaluate.load("f1")
+
+def compute_metrics(eval_preds):
+    preds, labels = eval_preds
+    if isinstance(preds, tuple):
+        preds = preds[0]
+    
+    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+    
+    decoded_preds = [pred.strip().lower() for pred in decoded_preds]
+    decoded_labels = [label.strip().lower() for label in decoded_labels]
+    
+    reverse_label_map = {"entailment": 0, "neutral": 1, "contradiction": 2}
+    
+    # Default to neutral (1) if the model hallucinates a random word
+    int_preds = [reverse_label_map.get(p, 1) for p in decoded_preds]
+    int_labels = [reverse_label_map.get(l, 1) for l in decoded_labels]
+    
+    acc = metric_acc.compute(predictions=int_preds, references=int_labels)["accuracy"]
+    f1 = metric_f1.compute(predictions=int_preds, references=int_labels, average="macro")["f1"]
+    
+    return {"accuracy": acc, "f1_macro": f1}
+
+# ==========================================
+# 6. Training Configuration & Execution
+# ==========================================
+training_args = Seq2SeqTrainingArguments(
+    output_dir="./byt5-taglish-nli-v3",
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    learning_rate=1e-4,
+    
+    # ADD THE NEW SCHEDULER SETTINGS HERE:
+    lr_scheduler_type="cosine",
+    warmup_ratio=0.1,
+
+    per_device_train_batch_size=8,   
+    per_device_eval_batch_size=8,
+    gradient_accumulation_steps=4,   
+    num_train_epochs=30,             
+    logging_steps=10,                
+    metric_for_best_model="f1_macro",
+    
+    dataloader_num_workers=4,        
+    weight_decay=0.01,
+    predict_with_generate=True,          
+    bf16=torch.cuda.is_bf16_supported(), 
+    load_best_model_at_end=True,
+
+    generation_max_length=25,
+)
+
+# NOTE(review): eval_dataset is the benchmark split, and the same split drives
+# early stopping and best-checkpoint selection (metric_for_best_model="f1_macro",
+# load_best_model_at_end=True). Scores reported on it are therefore tuned on it;
+# consider a separate validation split if an unbiased benchmark number is needed.
+trainer = Seq2SeqTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_train, # training split, passed as-is (no re-splitting)
+    eval_dataset=tokenized_test,   # benchmark split, evaluated once per epoch
+    processing_class=tokenizer,      
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+    # Early stopping is configured with a patience of 10
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]  
+)
+
+print("Starting training...")
+trainer.train()
+
+# ==========================================
+# 7. Save the Final Model
+# ==========================================
+# Write the model held by the trainer once train() has returned, together with the
+# tokenizer files, into a single directory.
+# NOTE(review): with load_best_model_at_end=True this is presumably the best
+# checkpoint by f1_macro rather than the last epoch - confirm.
+final_save_path = "./byt5-taglish-nli-final-v3"
+trainer.save_model(final_save_path)
+tokenizer.save_pretrained(final_save_path)
+print(f"Training complete! Model saved to {final_save_path}")
+