Files
dsp15/logs/train_9966.out
2026-05-21 01:21:51 +08:00

313 lines
29 KiB
Plaintext

Using GPU: Tesla V100-SXM2-32GB
Loading training data from new_dataset.csv...
Loading benchmark data from benchmark_dataset.csv...
Loading tokenizer and model from local path: ./byt5_base_local_weights...
Tokenizing datasets...
Starting training...
{'loss': '17.49', 'grad_norm': '126.5', 'learning_rate': '1.667e-06', 'epoch': '0.05556'}
{'loss': '17.71', 'grad_norm': '394.1', 'learning_rate': '3.519e-06', 'epoch': '0.1111'}
{'loss': '17.12', 'grad_norm': '84.43', 'learning_rate': '5.37e-06', 'epoch': '0.1667'}
{'loss': '16.68', 'grad_norm': '75.35', 'learning_rate': '7.222e-06', 'epoch': '0.2222'}
{'loss': '16.43', 'grad_norm': '73.7', 'learning_rate': '9.074e-06', 'epoch': '0.2778'}
{'loss': '14.46', 'grad_norm': '50.42', 'learning_rate': '1.093e-05', 'epoch': '0.3333'}
{'loss': '13.43', 'grad_norm': '56.6', 'learning_rate': '1.278e-05', 'epoch': '0.3889'}
{'loss': '11.79', 'grad_norm': '39.79', 'learning_rate': '1.463e-05', 'epoch': '0.4444'}
{'loss': '10.1', 'grad_norm': '37.04', 'learning_rate': '1.648e-05', 'epoch': '0.5'}
{'loss': '7.905', 'grad_norm': '44.11', 'learning_rate': '1.833e-05', 'epoch': '0.5556'}
{'loss': '5.206', 'grad_norm': '43.47', 'learning_rate': '2.019e-05', 'epoch': '0.6111'}
{'loss': '4.517', 'grad_norm': '43.53', 'learning_rate': '2.204e-05', 'epoch': '0.6667'}
{'loss': '3.873', 'grad_norm': '21.54', 'learning_rate': '2.389e-05', 'epoch': '0.7222'}
{'loss': '2.97', 'grad_norm': '17.54', 'learning_rate': '2.574e-05', 'epoch': '0.7778'}
{'loss': '2.359', 'grad_norm': '81.92', 'learning_rate': '2.759e-05', 'epoch': '0.8333'}
{'loss': '1.847', 'grad_norm': '12.21', 'learning_rate': '2.944e-05', 'epoch': '0.8889'}
{'loss': '1.58', 'grad_norm': '20.55', 'learning_rate': '3.13e-05', 'epoch': '0.9444'}
{'loss': '1.506', 'grad_norm': '16.61', 'learning_rate': '3.315e-05', 'epoch': '1'}
{'eval_loss': '0.1121', 'eval_accuracy': '0.3333', 'eval_f1_macro': '0.1667', 'eval_runtime': '106.5', 'eval_samples_per_second': '2.057', 'eval_steps_per_second': '0.263', 'epoch': '1'}
{'loss': '1.54', 'grad_norm': '8.567', 'learning_rate': '3.5e-05', 'epoch': '1.056'}
{'loss': '0.91', 'grad_norm': '39.48', 'learning_rate': '3.685e-05', 'epoch': '1.111'}
{'loss': '0.9203', 'grad_norm': '5.683', 'learning_rate': '3.87e-05', 'epoch': '1.167'}
{'loss': '0.7447', 'grad_norm': '4.14', 'learning_rate': '4.056e-05', 'epoch': '1.222'}
{'loss': '0.6591', 'grad_norm': '11.54', 'learning_rate': '4.241e-05', 'epoch': '1.278'}
{'loss': '0.6295', 'grad_norm': '6.007', 'learning_rate': '4.426e-05', 'epoch': '1.333'}
{'loss': '0.5455', 'grad_norm': '4.925', 'learning_rate': '4.611e-05', 'epoch': '1.389'}
{'loss': '0.5387', 'grad_norm': '1.691', 'learning_rate': '4.796e-05', 'epoch': '1.444'}
{'loss': '0.5033', 'grad_norm': '3.911', 'learning_rate': '4.981e-05', 'epoch': '1.5'}
{'loss': '0.5167', 'grad_norm': '6.595', 'learning_rate': '5.167e-05', 'epoch': '1.556'}
{'loss': '0.4878', 'grad_norm': '7.394', 'learning_rate': '5.352e-05', 'epoch': '1.611'}
{'loss': '0.5138', 'grad_norm': '5.293', 'learning_rate': '5.537e-05', 'epoch': '1.667'}
{'loss': '0.4671', 'grad_norm': '1.434', 'learning_rate': '5.722e-05', 'epoch': '1.722'}
{'loss': '0.4832', 'grad_norm': '1.653', 'learning_rate': '5.907e-05', 'epoch': '1.778'}
{'loss': '0.5022', 'grad_norm': '12.34', 'learning_rate': '6.093e-05', 'epoch': '1.833'}
{'loss': '0.451', 'grad_norm': '3.589', 'learning_rate': '6.278e-05', 'epoch': '1.889'}
{'loss': '0.4325', 'grad_norm': '1.188', 'learning_rate': '6.463e-05', 'epoch': '1.944'}
{'loss': '0.3984', 'grad_norm': '1.646', 'learning_rate': '6.648e-05', 'epoch': '2'}
{'eval_loss': '0.09331', 'eval_accuracy': '0.4749', 'eval_f1_macro': '0.4241', 'eval_runtime': '105.7', 'eval_samples_per_second': '2.071', 'eval_steps_per_second': '0.265', 'epoch': '2'}
{'loss': '0.4126', 'grad_norm': '1.936', 'learning_rate': '6.833e-05', 'epoch': '2.056'}
{'loss': '0.4013', 'grad_norm': '1.64', 'learning_rate': '7.019e-05', 'epoch': '2.111'}
{'loss': '0.3856', 'grad_norm': '1.888', 'learning_rate': '7.204e-05', 'epoch': '2.167'}
{'loss': '0.3317', 'grad_norm': '1.572', 'learning_rate': '7.389e-05', 'epoch': '2.222'}
{'loss': '0.3377', 'grad_norm': '1.841', 'learning_rate': '7.574e-05', 'epoch': '2.278'}
{'loss': '0.3252', 'grad_norm': '1.3', 'learning_rate': '7.759e-05', 'epoch': '2.333'}
{'loss': '0.3498', 'grad_norm': '1.531', 'learning_rate': '7.944e-05', 'epoch': '2.389'}
{'loss': '0.3116', 'grad_norm': '1.637', 'learning_rate': '8.13e-05', 'epoch': '2.444'}
{'loss': '0.3697', 'grad_norm': '2.368', 'learning_rate': '8.315e-05', 'epoch': '2.5'}
{'loss': '0.3553', 'grad_norm': '1.5', 'learning_rate': '8.5e-05', 'epoch': '2.556'}
{'loss': '0.3292', 'grad_norm': '1.308', 'learning_rate': '8.685e-05', 'epoch': '2.611'}
{'loss': '0.3383', 'grad_norm': '1.561', 'learning_rate': '8.87e-05', 'epoch': '2.667'}
{'loss': '0.3353', 'grad_norm': '1.995', 'learning_rate': '9.056e-05', 'epoch': '2.722'}
{'loss': '0.2964', 'grad_norm': '3.824', 'learning_rate': '9.241e-05', 'epoch': '2.778'}
{'loss': '0.2852', 'grad_norm': '1.144', 'learning_rate': '9.426e-05', 'epoch': '2.833'}
{'loss': '0.2988', 'grad_norm': '3.979', 'learning_rate': '9.611e-05', 'epoch': '2.889'}
{'loss': '0.2903', 'grad_norm': '11.14', 'learning_rate': '9.796e-05', 'epoch': '2.944'}
{'loss': '0.2688', 'grad_norm': '0.9756', 'learning_rate': '9.981e-05', 'epoch': '3'}
{'eval_loss': '0.08739', 'eval_accuracy': '0.5297', 'eval_f1_macro': '0.4854', 'eval_runtime': '106.1', 'eval_samples_per_second': '2.063', 'eval_steps_per_second': '0.264', 'epoch': '3'}
{'loss': '0.2418', 'grad_norm': '1.327', 'learning_rate': '0.0001', 'epoch': '3.056'}
{'loss': '0.2638', 'grad_norm': '1.858', 'learning_rate': '0.0001', 'epoch': '3.111'}
{'loss': '0.2647', 'grad_norm': '1.894', 'learning_rate': '9.999e-05', 'epoch': '3.167'}
{'loss': '0.252', 'grad_norm': '1.613', 'learning_rate': '9.998e-05', 'epoch': '3.222'}
{'loss': '0.2114', 'grad_norm': '1.535', 'learning_rate': '9.997e-05', 'epoch': '3.278'}
{'loss': '0.1979', 'grad_norm': '1.696', 'learning_rate': '9.996e-05', 'epoch': '3.333'}
{'loss': '0.2036', 'grad_norm': '1.235', 'learning_rate': '9.995e-05', 'epoch': '3.389'}
{'loss': '0.2308', 'grad_norm': '1.665', 'learning_rate': '9.993e-05', 'epoch': '3.444'}
{'loss': '0.2274', 'grad_norm': '1.34', 'learning_rate': '9.992e-05', 'epoch': '3.5'}
{'loss': '0.191', 'grad_norm': '1.447', 'learning_rate': '9.99e-05', 'epoch': '3.556'}
{'loss': '0.2044', 'grad_norm': '2.155', 'learning_rate': '9.988e-05', 'epoch': '3.611'}
{'loss': '0.1755', 'grad_norm': '1.544', 'learning_rate': '9.985e-05', 'epoch': '3.667'}
{'loss': '0.1824', 'grad_norm': '2.093', 'learning_rate': '9.983e-05', 'epoch': '3.722'}
{'loss': '0.1727', 'grad_norm': '2.961', 'learning_rate': '9.98e-05', 'epoch': '3.778'}
{'loss': '0.2239', 'grad_norm': '1.734', 'learning_rate': '9.977e-05', 'epoch': '3.833'}
{'loss': '0.1899', 'grad_norm': '1.448', 'learning_rate': '9.974e-05', 'epoch': '3.889'}
{'loss': '0.1814', 'grad_norm': '2.32', 'learning_rate': '9.97e-05', 'epoch': '3.944'}
{'loss': '0.1755', 'grad_norm': '1.86', 'learning_rate': '9.967e-05', 'epoch': '4'}
{'eval_loss': '0.07903', 'eval_accuracy': '0.6895', 'eval_f1_macro': '0.6899', 'eval_runtime': '106.6', 'eval_samples_per_second': '2.054', 'eval_steps_per_second': '0.263', 'epoch': '4'}
{'loss': '0.1275', 'grad_norm': '3.106', 'learning_rate': '9.963e-05', 'epoch': '4.056'}
{'loss': '0.1281', 'grad_norm': '2.33', 'learning_rate': '9.959e-05', 'epoch': '4.111'}
{'loss': '0.1452', 'grad_norm': '1.859', 'learning_rate': '9.954e-05', 'epoch': '4.167'}
{'loss': '0.1269', 'grad_norm': '1.896', 'learning_rate': '9.95e-05', 'epoch': '4.222'}
{'loss': '0.1233', 'grad_norm': '2.57', 'learning_rate': '9.945e-05', 'epoch': '4.278'}
{'loss': '0.1465', 'grad_norm': '2.685', 'learning_rate': '9.94e-05', 'epoch': '4.333'}
{'loss': '0.1661', 'grad_norm': '0.9907', 'learning_rate': '9.935e-05', 'epoch': '4.389'}
{'loss': '0.1102', 'grad_norm': '2.938', 'learning_rate': '9.93e-05', 'epoch': '4.444'}
{'loss': '0.1182', 'grad_norm': '1.862', 'learning_rate': '9.925e-05', 'epoch': '4.5'}
{'loss': '0.1721', 'grad_norm': '2.176', 'learning_rate': '9.919e-05', 'epoch': '4.556'}
{'loss': '0.1368', 'grad_norm': '1.267', 'learning_rate': '9.913e-05', 'epoch': '4.611'}
{'loss': '0.1245', 'grad_norm': '3.203', 'learning_rate': '9.907e-05', 'epoch': '4.667'}
{'loss': '0.1536', 'grad_norm': '2.007', 'learning_rate': '9.901e-05', 'epoch': '4.722'}
{'loss': '0.1365', 'grad_norm': '2.343', 'learning_rate': '9.894e-05', 'epoch': '4.778'}
{'loss': '0.1671', 'grad_norm': '3.646', 'learning_rate': '9.887e-05', 'epoch': '4.833'}
{'loss': '0.1417', 'grad_norm': '1.09', 'learning_rate': '9.88e-05', 'epoch': '4.889'}
{'loss': '0.1002', 'grad_norm': '1.696', 'learning_rate': '9.873e-05', 'epoch': '4.944'}
{'loss': '0.1144', 'grad_norm': '1.926', 'learning_rate': '9.866e-05', 'epoch': '5'}
{'eval_loss': '0.06116', 'eval_accuracy': '0.7808', 'eval_f1_macro': '0.7831', 'eval_runtime': '106.6', 'eval_samples_per_second': '2.055', 'eval_steps_per_second': '0.263', 'epoch': '5'}
{'loss': '0.1054', 'grad_norm': '1.36', 'learning_rate': '9.858e-05', 'epoch': '5.056'}
{'loss': '0.08459', 'grad_norm': '3.187', 'learning_rate': '9.851e-05', 'epoch': '5.111'}
{'loss': '0.1275', 'grad_norm': '2.495', 'learning_rate': '9.843e-05', 'epoch': '5.167'}
{'loss': '0.08002', 'grad_norm': '1.458', 'learning_rate': '9.835e-05', 'epoch': '5.222'}
{'loss': '0.1002', 'grad_norm': '2.299', 'learning_rate': '9.826e-05', 'epoch': '5.278'}
{'loss': '0.1129', 'grad_norm': '0.8684', 'learning_rate': '9.818e-05', 'epoch': '5.333'}
{'loss': '0.07822', 'grad_norm': '0.9398', 'learning_rate': '9.809e-05', 'epoch': '5.389'}
{'loss': '0.09044', 'grad_norm': '2.049', 'learning_rate': '9.8e-05', 'epoch': '5.444'}
{'loss': '0.09677', 'grad_norm': '8.718', 'learning_rate': '9.791e-05', 'epoch': '5.5'}
{'loss': '0.1067', 'grad_norm': '2.777', 'learning_rate': '9.782e-05', 'epoch': '5.556'}
{'loss': '0.08454', 'grad_norm': '1.695', 'learning_rate': '9.772e-05', 'epoch': '5.611'}
{'loss': '0.07226', 'grad_norm': '0.8294', 'learning_rate': '9.762e-05', 'epoch': '5.667'}
{'loss': '0.07516', 'grad_norm': '0.9004', 'learning_rate': '9.752e-05', 'epoch': '5.722'}
{'loss': '0.1167', 'grad_norm': '1.561', 'learning_rate': '9.742e-05', 'epoch': '5.778'}
{'loss': '0.07728', 'grad_norm': '3.634', 'learning_rate': '9.732e-05', 'epoch': '5.833'}
{'loss': '0.1273', 'grad_norm': '2.117', 'learning_rate': '9.721e-05', 'epoch': '5.889'}
{'loss': '0.05944', 'grad_norm': '1.621', 'learning_rate': '9.711e-05', 'epoch': '5.944'}
{'loss': '0.06912', 'grad_norm': '1.946', 'learning_rate': '9.7e-05', 'epoch': '6'}
{'eval_loss': '0.05491', 'eval_accuracy': '0.7991', 'eval_f1_macro': '0.801', 'eval_runtime': '105.8', 'eval_samples_per_second': '2.071', 'eval_steps_per_second': '0.265', 'epoch': '6'}
{'loss': '0.0387', 'grad_norm': '1.792', 'learning_rate': '9.688e-05', 'epoch': '6.056'}
{'loss': '0.07151', 'grad_norm': '3.826', 'learning_rate': '9.677e-05', 'epoch': '6.111'}
{'loss': '0.06049', 'grad_norm': '3.699', 'learning_rate': '9.666e-05', 'epoch': '6.167'}
{'loss': '0.08918', 'grad_norm': '2.199', 'learning_rate': '9.654e-05', 'epoch': '6.222'}
{'loss': '0.06295', 'grad_norm': '0.2884', 'learning_rate': '9.642e-05', 'epoch': '6.278'}
{'loss': '0.0692', 'grad_norm': '2.308', 'learning_rate': '9.63e-05', 'epoch': '6.333'}
{'loss': '0.06504', 'grad_norm': '2.358', 'learning_rate': '9.618e-05', 'epoch': '6.389'}
{'loss': '0.08678', 'grad_norm': '1.519', 'learning_rate': '9.605e-05', 'epoch': '6.444'}
{'loss': '0.05805', 'grad_norm': '1.092', 'learning_rate': '9.592e-05', 'epoch': '6.5'}
{'loss': '0.04817', 'grad_norm': '2.172', 'learning_rate': '9.579e-05', 'epoch': '6.556'}
{'loss': '0.08549', 'grad_norm': '3.618', 'learning_rate': '9.566e-05', 'epoch': '6.611'}
{'loss': '0.08363', 'grad_norm': '2.229', 'learning_rate': '9.553e-05', 'epoch': '6.667'}
{'loss': '0.06852', 'grad_norm': '2.268', 'learning_rate': '9.54e-05', 'epoch': '6.722'}
{'loss': '0.06173', 'grad_norm': '1.336', 'learning_rate': '9.526e-05', 'epoch': '6.778'}
{'loss': '0.05815', 'grad_norm': '1.254', 'learning_rate': '9.512e-05', 'epoch': '6.833'}
{'loss': '0.0697', 'grad_norm': '0.8746', 'learning_rate': '9.498e-05', 'epoch': '6.889'}
{'loss': '0.04526', 'grad_norm': '1.095', 'learning_rate': '9.484e-05', 'epoch': '6.944'}
{'loss': '0.06447', 'grad_norm': '1.257', 'learning_rate': '9.47e-05', 'epoch': '7'}
{'eval_loss': '0.08354', 'eval_accuracy': '0.7717', 'eval_f1_macro': '0.774', 'eval_runtime': '105.7', 'eval_samples_per_second': '2.071', 'eval_steps_per_second': '0.265', 'epoch': '7'}
{'loss': '0.03113', 'grad_norm': '0.6693', 'learning_rate': '9.455e-05', 'epoch': '7.056'}
{'loss': '0.03981', 'grad_norm': '2.623', 'learning_rate': '9.44e-05', 'epoch': '7.111'}
{'loss': '0.0439', 'grad_norm': '1.121', 'learning_rate': '9.425e-05', 'epoch': '7.167'}
{'loss': '0.05636', 'grad_norm': '0.9213', 'learning_rate': '9.41e-05', 'epoch': '7.222'}
{'loss': '0.03435', 'grad_norm': '0.2191', 'learning_rate': '9.395e-05', 'epoch': '7.278'}
{'loss': '0.05258', 'grad_norm': '1.381', 'learning_rate': '9.379e-05', 'epoch': '7.333'}
{'loss': '0.03393', 'grad_norm': '0.4537', 'learning_rate': '9.364e-05', 'epoch': '7.389'}
{'loss': '0.02731', 'grad_norm': '1.041', 'learning_rate': '9.348e-05', 'epoch': '7.444'}
{'loss': '0.01783', 'grad_norm': '0.5819', 'learning_rate': '9.332e-05', 'epoch': '7.5'}
{'loss': '0.0405', 'grad_norm': '1.067', 'learning_rate': '9.316e-05', 'epoch': '7.556'}
{'loss': '0.07658', 'grad_norm': '2.287', 'learning_rate': '9.299e-05', 'epoch': '7.611'}
{'loss': '0.05072', 'grad_norm': '1.161', 'learning_rate': '9.283e-05', 'epoch': '7.667'}
{'loss': '0.04726', 'grad_norm': '0.1099', 'learning_rate': '9.266e-05', 'epoch': '7.722'}
{'loss': '0.06298', 'grad_norm': '1.294', 'learning_rate': '9.249e-05', 'epoch': '7.778'}
{'loss': '0.05601', 'grad_norm': '0.8206', 'learning_rate': '9.232e-05', 'epoch': '7.833'}
{'loss': '0.05385', 'grad_norm': '0.9627', 'learning_rate': '9.214e-05', 'epoch': '7.889'}
{'loss': '0.0451', 'grad_norm': '1.334', 'learning_rate': '9.197e-05', 'epoch': '7.944'}
{'loss': '0.05816', 'grad_norm': '3.167', 'learning_rate': '9.179e-05', 'epoch': '8'}
{'eval_loss': '0.08189', 'eval_accuracy': '0.7763', 'eval_f1_macro': '0.7742', 'eval_runtime': '106.1', 'eval_samples_per_second': '2.065', 'eval_steps_per_second': '0.264', 'epoch': '8'}
{'loss': '0.04599', 'grad_norm': '2.56', 'learning_rate': '9.161e-05', 'epoch': '8.056'}
{'loss': '0.04268', 'grad_norm': '0.6989', 'learning_rate': '9.143e-05', 'epoch': '8.111'}
{'loss': '0.02385', 'grad_norm': '0.1719', 'learning_rate': '9.125e-05', 'epoch': '8.167'}
{'loss': '0.04146', 'grad_norm': '0.6262', 'learning_rate': '9.107e-05', 'epoch': '8.222'}
{'loss': '0.02073', 'grad_norm': '1.106', 'learning_rate': '9.088e-05', 'epoch': '8.278'}
{'loss': '0.01266', 'grad_norm': '0.05547', 'learning_rate': '9.07e-05', 'epoch': '8.333'}
{'loss': '0.02974', 'grad_norm': '1.085', 'learning_rate': '9.051e-05', 'epoch': '8.389'}
{'loss': '0.02639', 'grad_norm': '2.068', 'learning_rate': '9.032e-05', 'epoch': '8.444'}
{'loss': '0.0302', 'grad_norm': '0.1163', 'learning_rate': '9.013e-05', 'epoch': '8.5'}
{'loss': '0.02345', 'grad_norm': '1.685', 'learning_rate': '8.993e-05', 'epoch': '8.556'}
{'loss': '0.02137', 'grad_norm': '0.4356', 'learning_rate': '8.974e-05', 'epoch': '8.611'}
{'loss': '0.02701', 'grad_norm': '0.4995', 'learning_rate': '8.954e-05', 'epoch': '8.667'}
{'loss': '0.04726', 'grad_norm': '2.221', 'learning_rate': '8.934e-05', 'epoch': '8.722'}
{'loss': '0.04754', 'grad_norm': '1.238', 'learning_rate': '8.914e-05', 'epoch': '8.778'}
{'loss': '0.04136', 'grad_norm': '0.9662', 'learning_rate': '8.894e-05', 'epoch': '8.833'}
{'loss': '0.05404', 'grad_norm': '1.516', 'learning_rate': '8.873e-05', 'epoch': '8.889'}
{'loss': '0.0614', 'grad_norm': '0.7286', 'learning_rate': '8.853e-05', 'epoch': '8.944'}
{'loss': '0.03767', 'grad_norm': '0.5761', 'learning_rate': '8.832e-05', 'epoch': '9'}
{'eval_loss': '0.09824', 'eval_accuracy': '0.7717', 'eval_f1_macro': '0.7688', 'eval_runtime': '105.9', 'eval_samples_per_second': '2.068', 'eval_steps_per_second': '0.264', 'epoch': '9'}
{'loss': '0.01035', 'grad_norm': '0.272', 'learning_rate': '8.811e-05', 'epoch': '9.056'}
{'loss': '0.01853', 'grad_norm': '0.4025', 'learning_rate': '8.79e-05', 'epoch': '9.111'}
{'loss': '0.03058', 'grad_norm': '0.4254', 'learning_rate': '8.769e-05', 'epoch': '9.167'}
{'loss': '0.01423', 'grad_norm': '0.04442', 'learning_rate': '8.748e-05', 'epoch': '9.222'}
{'loss': '0.02278', 'grad_norm': '1.237', 'learning_rate': '8.727e-05', 'epoch': '9.278'}
{'loss': '0.03703', 'grad_norm': '0.5286', 'learning_rate': '8.705e-05', 'epoch': '9.333'}
{'loss': '0.02075', 'grad_norm': '0.7312', 'learning_rate': '8.683e-05', 'epoch': '9.389'}
{'loss': '0.01099', 'grad_norm': '0.933', 'learning_rate': '8.661e-05', 'epoch': '9.444'}
{'loss': '0.01046', 'grad_norm': '0.2793', 'learning_rate': '8.639e-05', 'epoch': '9.5'}
{'loss': '0.02538', 'grad_norm': '1.843', 'learning_rate': '8.617e-05', 'epoch': '9.556'}
{'loss': '0.02904', 'grad_norm': '2.633', 'learning_rate': '8.594e-05', 'epoch': '9.611'}
{'loss': '0.03254', 'grad_norm': '0.04969', 'learning_rate': '8.572e-05', 'epoch': '9.667'}
{'loss': '0.0275', 'grad_norm': '2.421', 'learning_rate': '8.549e-05', 'epoch': '9.722'}
{'loss': '0.01437', 'grad_norm': '0.4819', 'learning_rate': '8.526e-05', 'epoch': '9.778'}
{'loss': '0.006621', 'grad_norm': '0.5603', 'learning_rate': '8.503e-05', 'epoch': '9.833'}
{'loss': '0.01574', 'grad_norm': '0.1494', 'learning_rate': '8.48e-05', 'epoch': '9.889'}
{'loss': '0.01828', 'grad_norm': '1.458', 'learning_rate': '8.457e-05', 'epoch': '9.944'}
{'loss': '0.03175', 'grad_norm': '0.0801', 'learning_rate': '8.434e-05', 'epoch': '10'}
{'eval_loss': '0.1252', 'eval_accuracy': '0.758', 'eval_f1_macro': '0.7554', 'eval_runtime': '106.1', 'eval_samples_per_second': '2.065', 'eval_steps_per_second': '0.264', 'epoch': '10'}
{'loss': '0.01547', 'grad_norm': '0.05208', 'learning_rate': '8.41e-05', 'epoch': '10.06'}
{'loss': '0.02259', 'grad_norm': '0.06858', 'learning_rate': '8.386e-05', 'epoch': '10.11'}
{'loss': '0.02182', 'grad_norm': '1.297', 'learning_rate': '8.362e-05', 'epoch': '10.17'}
{'loss': '0.01751', 'grad_norm': '0.511', 'learning_rate': '8.338e-05', 'epoch': '10.22'}
{'loss': '0.02615', 'grad_norm': '0.2301', 'learning_rate': '8.314e-05', 'epoch': '10.28'}
{'loss': '0.01886', 'grad_norm': '1.289', 'learning_rate': '8.29e-05', 'epoch': '10.33'}
{'loss': '0.00789', 'grad_norm': '0.07528', 'learning_rate': '8.266e-05', 'epoch': '10.39'}
{'loss': '0.00409', 'grad_norm': '0.9508', 'learning_rate': '8.241e-05', 'epoch': '10.44'}
{'loss': '0.01602', 'grad_norm': '3.103', 'learning_rate': '8.216e-05', 'epoch': '10.5'}
{'loss': '0.02292', 'grad_norm': '0.9874', 'learning_rate': '8.192e-05', 'epoch': '10.56'}
{'loss': '0.01431', 'grad_norm': '2.612', 'learning_rate': '8.167e-05', 'epoch': '10.61'}
{'loss': '0.02169', 'grad_norm': '0.1059', 'learning_rate': '8.142e-05', 'epoch': '10.67'}
{'loss': '0.01996', 'grad_norm': '0.09859', 'learning_rate': '8.116e-05', 'epoch': '10.72'}
{'loss': '0.02897', 'grad_norm': '0.04338', 'learning_rate': '8.091e-05', 'epoch': '10.78'}
{'loss': '0.02495', 'grad_norm': '0.1942', 'learning_rate': '8.066e-05', 'epoch': '10.83'}
{'loss': '0.009281', 'grad_norm': '1.979', 'learning_rate': '8.04e-05', 'epoch': '10.89'}
{'loss': '0.01778', 'grad_norm': '0.05166', 'learning_rate': '8.014e-05', 'epoch': '10.94'}
{'loss': '0.005439', 'grad_norm': '0.0107', 'learning_rate': '7.988e-05', 'epoch': '11'}
{'eval_loss': '0.1359', 'eval_accuracy': '0.7717', 'eval_f1_macro': '0.7718', 'eval_runtime': '105.9', 'eval_samples_per_second': '2.067', 'eval_steps_per_second': '0.264', 'epoch': '11'}
{'loss': '0.01016', 'grad_norm': '0.005502', 'learning_rate': '7.962e-05', 'epoch': '11.06'}
{'loss': '0.01504', 'grad_norm': '5.932', 'learning_rate': '7.936e-05', 'epoch': '11.11'}
{'loss': '0.01613', 'grad_norm': '4.961', 'learning_rate': '7.91e-05', 'epoch': '11.17'}
{'loss': '0.03184', 'grad_norm': '2.615', 'learning_rate': '7.884e-05', 'epoch': '11.22'}
{'loss': '0.01815', 'grad_norm': '3.035', 'learning_rate': '7.857e-05', 'epoch': '11.28'}
{'loss': '0.02053', 'grad_norm': '0.8841', 'learning_rate': '7.831e-05', 'epoch': '11.33'}
{'loss': '0.01506', 'grad_norm': '0.4513', 'learning_rate': '7.804e-05', 'epoch': '11.39'}
{'loss': '0.001491', 'grad_norm': '0.324', 'learning_rate': '7.777e-05', 'epoch': '11.44'}
{'loss': '0.00905', 'grad_norm': '0.02892', 'learning_rate': '7.75e-05', 'epoch': '11.5'}
{'loss': '0.01546', 'grad_norm': '0.1166', 'learning_rate': '7.723e-05', 'epoch': '11.56'}
{'loss': '0.01899', 'grad_norm': '0.02845', 'learning_rate': '7.696e-05', 'epoch': '11.61'}
{'loss': '0.01518', 'grad_norm': '0.1284', 'learning_rate': '7.669e-05', 'epoch': '11.67'}
{'loss': '0.01559', 'grad_norm': '0.5628', 'learning_rate': '7.641e-05', 'epoch': '11.72'}
{'loss': '0.01559', 'grad_norm': '1.986', 'learning_rate': '7.614e-05', 'epoch': '11.78'}
{'loss': '0.007582', 'grad_norm': '0.18', 'learning_rate': '7.586e-05', 'epoch': '11.83'}
{'loss': '0.01779', 'grad_norm': '0.09533', 'learning_rate': '7.559e-05', 'epoch': '11.89'}
{'loss': '0.02005', 'grad_norm': '0.02764', 'learning_rate': '7.531e-05', 'epoch': '11.94'}
{'loss': '0.0105', 'grad_norm': '0.2502', 'learning_rate': '7.503e-05', 'epoch': '12'}
{'eval_loss': '0.1306', 'eval_accuracy': '0.7671', 'eval_f1_macro': '0.7666', 'eval_runtime': '106', 'eval_samples_per_second': '2.065', 'eval_steps_per_second': '0.264', 'epoch': '12'}
{'loss': '0.02529', 'grad_norm': '1.387', 'learning_rate': '7.475e-05', 'epoch': '12.06'}
{'loss': '0.005053', 'grad_norm': '0.03735', 'learning_rate': '7.447e-05', 'epoch': '12.11'}
{'loss': '0.006847', 'grad_norm': '2.675', 'learning_rate': '7.418e-05', 'epoch': '12.17'}
{'loss': '0.01695', 'grad_norm': '1.011', 'learning_rate': '7.39e-05', 'epoch': '12.22'}
{'loss': '0.02034', 'grad_norm': '0.2724', 'learning_rate': '7.362e-05', 'epoch': '12.28'}
{'loss': '0.005324', 'grad_norm': '2.115', 'learning_rate': '7.333e-05', 'epoch': '12.33'}
{'loss': '0.002621', 'grad_norm': '0.01744', 'learning_rate': '7.304e-05', 'epoch': '12.39'}
{'loss': '0.0177', 'grad_norm': '0.6538', 'learning_rate': '7.276e-05', 'epoch': '12.44'}
{'loss': '0.006162', 'grad_norm': '0.1963', 'learning_rate': '7.247e-05', 'epoch': '12.5'}
{'loss': '0.01943', 'grad_norm': '4.31', 'learning_rate': '7.218e-05', 'epoch': '12.56'}
{'loss': '0.01834', 'grad_norm': '0.009528', 'learning_rate': '7.189e-05', 'epoch': '12.61'}
{'loss': '0.009164', 'grad_norm': '0.02772', 'learning_rate': '7.16e-05', 'epoch': '12.67'}
{'loss': '0.009574', 'grad_norm': '4.845', 'learning_rate': '7.131e-05', 'epoch': '12.72'}
{'loss': '0.02951', 'grad_norm': '0.0584', 'learning_rate': '7.101e-05', 'epoch': '12.78'}
{'loss': '0.0177', 'grad_norm': '0.1512', 'learning_rate': '7.072e-05', 'epoch': '12.83'}
{'loss': '0.01241', 'grad_norm': '0.3222', 'learning_rate': '7.043e-05', 'epoch': '12.89'}
{'loss': '0.0135', 'grad_norm': '0.02367', 'learning_rate': '7.013e-05', 'epoch': '12.94'}
{'loss': '0.01088', 'grad_norm': '1.636', 'learning_rate': '6.983e-05', 'epoch': '13'}
{'eval_loss': '0.1271', 'eval_accuracy': '0.7854', 'eval_f1_macro': '0.7845', 'eval_runtime': '106.7', 'eval_samples_per_second': '2.052', 'eval_steps_per_second': '0.262', 'epoch': '13'}
{'loss': '0.01551', 'grad_norm': '0.3399', 'learning_rate': '6.954e-05', 'epoch': '13.06'}
{'loss': '0.01869', 'grad_norm': '0.04705', 'learning_rate': '6.924e-05', 'epoch': '13.11'}
{'loss': '0.01117', 'grad_norm': '0.03859', 'learning_rate': '6.894e-05', 'epoch': '13.17'}
{'loss': '0.02566', 'grad_norm': '0.1825', 'learning_rate': '6.864e-05', 'epoch': '13.22'}
{'loss': '0.007216', 'grad_norm': '0.9025', 'learning_rate': '6.834e-05', 'epoch': '13.28'}
{'loss': '0.005052', 'grad_norm': '0.2419', 'learning_rate': '6.804e-05', 'epoch': '13.33'}
{'loss': '0.02049', 'grad_norm': '0.2686', 'learning_rate': '6.774e-05', 'epoch': '13.39'}
{'loss': '0.01055', 'grad_norm': '0.05016', 'learning_rate': '6.743e-05', 'epoch': '13.44'}
{'loss': '0.0143', 'grad_norm': '0.05017', 'learning_rate': '6.713e-05', 'epoch': '13.5'}
{'loss': '0.01368', 'grad_norm': '2.059', 'learning_rate': '6.683e-05', 'epoch': '13.56'}
{'loss': '0.005512', 'grad_norm': '0.05989', 'learning_rate': '6.652e-05', 'epoch': '13.61'}
{'loss': '0.006226', 'grad_norm': '0.7136', 'learning_rate': '6.622e-05', 'epoch': '13.67'}
{'loss': '0.006618', 'grad_norm': '0.007432', 'learning_rate': '6.591e-05', 'epoch': '13.72'}
{'loss': '0.01074', 'grad_norm': '0.1246', 'learning_rate': '6.56e-05', 'epoch': '13.78'}
{'loss': '0.003471', 'grad_norm': '0.001922', 'learning_rate': '6.53e-05', 'epoch': '13.83'}
{'loss': '0.0003102', 'grad_norm': '0.04363', 'learning_rate': '6.499e-05', 'epoch': '13.89'}
{'loss': '0.0006633', 'grad_norm': '0.03808', 'learning_rate': '6.468e-05', 'epoch': '13.94'}
{'loss': '0.0119', 'grad_norm': '2.759', 'learning_rate': '6.437e-05', 'epoch': '14'}
{'eval_loss': '0.1682', 'eval_accuracy': '0.7534', 'eval_f1_macro': '0.7541', 'eval_runtime': '106.6', 'eval_samples_per_second': '2.055', 'eval_steps_per_second': '0.263', 'epoch': '14'}
{'loss': '0.0133', 'grad_norm': '1.193', 'learning_rate': '6.406e-05', 'epoch': '14.06'}
{'loss': '0.01642', 'grad_norm': '0.2833', 'learning_rate': '6.375e-05', 'epoch': '14.11'}
{'loss': '0.00481', 'grad_norm': '0.02297', 'learning_rate': '6.344e-05', 'epoch': '14.17'}
{'loss': '0.002218', 'grad_norm': '1.638', 'learning_rate': '6.313e-05', 'epoch': '14.22'}
{'loss': '0.003484', 'grad_norm': '0.008354', 'learning_rate': '6.282e-05', 'epoch': '14.28'}
{'loss': '0.00768', 'grad_norm': '0.014', 'learning_rate': '6.25e-05', 'epoch': '14.33'}
{'loss': '0.002859', 'grad_norm': '0.005389', 'learning_rate': '6.219e-05', 'epoch': '14.39'}
{'loss': '0.00363', 'grad_norm': '0.01703', 'learning_rate': '6.188e-05', 'epoch': '14.44'}
{'loss': '0.004527', 'grad_norm': '4.999', 'learning_rate': '6.156e-05', 'epoch': '14.5'}
{'loss': '0.01228', 'grad_norm': '0.001673', 'learning_rate': '6.125e-05', 'epoch': '14.56'}
{'loss': '0.01024', 'grad_norm': '1.325', 'learning_rate': '6.093e-05', 'epoch': '14.61'}
{'loss': '0.002511', 'grad_norm': '2.234', 'learning_rate': '6.062e-05', 'epoch': '14.67'}
{'loss': '0.002677', 'grad_norm': '0.005474', 'learning_rate': '6.03e-05', 'epoch': '14.72'}
{'loss': '0.008674', 'grad_norm': '0.002243', 'learning_rate': '5.998e-05', 'epoch': '14.78'}
{'loss': '0.01416', 'grad_norm': '0.00276', 'learning_rate': '5.967e-05', 'epoch': '14.83'}
{'loss': '0.001077', 'grad_norm': '0.1665', 'learning_rate': '5.935e-05', 'epoch': '14.89'}
{'loss': '0.004752', 'grad_norm': '0.02118', 'learning_rate': '5.903e-05', 'epoch': '14.94'}
{'loss': '0.006503', 'grad_norm': '1.967', 'learning_rate': '5.871e-05', 'epoch': '15'}
{'eval_loss': '0.1228', 'eval_accuracy': '0.79', 'eval_f1_macro': '0.7896', 'eval_runtime': '105.7', 'eval_samples_per_second': '2.071', 'eval_steps_per_second': '0.265', 'epoch': '15'}
{'loss': '0.00756', 'grad_norm': '0.004638', 'learning_rate': '5.84e-05', 'epoch': '15.06'}
{'loss': '0.002511', 'grad_norm': '0.006597', 'learning_rate': '5.808e-05', 'epoch': '15.11'}
{'loss': '0.002242', 'grad_norm': '0.8027', 'learning_rate': '5.776e-05', 'epoch': '15.17'}
{'loss': '0.0002855', 'grad_norm': '0.02001', 'learning_rate': '5.744e-05', 'epoch': '15.22'}
{'loss': '0.00473', 'grad_norm': '0.01003', 'learning_rate': '5.712e-05', 'epoch': '15.28'}
{'loss': '0.002294', 'grad_norm': '0.00268', 'learning_rate': '5.68e-05', 'epoch': '15.33'}
{'loss': '0.01172', 'grad_norm': '0.002129', 'learning_rate': '5.648e-05', 'epoch': '15.39'}
{'loss': '0.001938', 'grad_norm': '3.331', 'learning_rate': '5.616e-05', 'epoch': '15.44'}
{'loss': '0.0007251', 'grad_norm': '0.01041', 'learning_rate': '5.584e-05', 'epoch': '15.5'}
{'loss': '0.00259', 'grad_norm': '0.005297', 'learning_rate': '5.552e-05', 'epoch': '15.56'}
{'loss': '0.001559', 'grad_norm': '0.005115', 'learning_rate': '5.519e-05', 'epoch': '15.61'}
{'loss': '0.00113', 'grad_norm': '0.03056', 'learning_rate': '5.487e-05', 'epoch': '15.67'}
{'loss': '0.005897', 'grad_norm': '0.003646', 'learning_rate': '5.455e-05', 'epoch': '15.72'}
{'loss': '0.003432', 'grad_norm': '0.006152', 'learning_rate': '5.423e-05', 'epoch': '15.78'}
{'loss': '0.003579', 'grad_norm': '0.001413', 'learning_rate': '5.391e-05', 'epoch': '15.83'}
{'loss': '0.01223', 'grad_norm': '0.05825', 'learning_rate': '5.358e-05', 'epoch': '15.89'}
{'loss': '0.00897', 'grad_norm': '5.674', 'learning_rate': '5.326e-05', 'epoch': '15.94'}
{'loss': '0.00431', 'grad_norm': '3.035', 'learning_rate': '5.294e-05', 'epoch': '16'}
{'eval_loss': '0.1812', 'eval_accuracy': '0.7534', 'eval_f1_macro': '0.7498', 'eval_runtime': '135.6', 'eval_samples_per_second': '1.615', 'eval_steps_per_second': '0.206', 'epoch': '16'}
{'train_runtime': '1.94e+04', 'train_samples_per_second': '8.906', 'train_steps_per_second': '0.278', 'train_loss': '0.6819', 'epoch': '16'}
Training complete! Model saved to ./byt5-taglish-nli-final-v3