first commit

2026-05-21 01:21:51 +08:00
commit 699a04a3b2
26 changed files with 36741 additions and 0 deletions
@@ -0,0 +1,16 @@
+import pandas as pd
+
+# Load both datasets
+train_df = pd.read_csv("new_dataset.csv")
+benchmark_df = pd.read_csv("benchmark_dataset.csv")
+
+# Create a unique string for each row by combining premise and hypothesis
+train_pairs = set(train_df['s1'] + " ||| " + train_df['s2'])
+benchmark_pairs = set(benchmark_df['s1'] + " ||| " + benchmark_df['s2'])
+
+# Find any exact matches that exist in both sets
+leaked_data = train_pairs.intersection(benchmark_pairs)
+
+print(f"Number of leaked samples: {len(leaked_data)}")
+if len(leaked_data) > 0:
+    print("Warning: You have overlapping data!")