import pandas as pd

# Columns that together identify one sample (premise, hypothesis).
PAIR_COLUMNS = ("s1", "s2")


def _pair_set(df: pd.DataFrame, columns: tuple = PAIR_COLUMNS) -> set:
    """Return the distinct ``(s1, s2)`` tuples found in *df*.

    Rows where either column is missing are skipped: a missing value is not
    a sentence, so it must never be able to "match" a missing value in the
    other dataset.

    Raises:
        KeyError: if *df* lacks one of *columns*.
    """
    wanted = list(columns)
    complete_rows = df.dropna(subset=wanted)
    # Tuples keep the two fields separate. Joining them into one string with
    # a delimiter is ambiguous whenever a sentence contains that delimiter
    # ("a ||| b" + "c" would collide with "a" + "b ||| c").
    return set(complete_rows[wanted].itertuples(index=False, name=None))


def find_leaked_pairs(train_df: pd.DataFrame, benchmark_df: pd.DataFrame) -> set:
    """Return the ``(s1, s2)`` pairs that occur in both data frames.

    Matching is exact (no case or whitespace normalisation) and each pair is
    reported once, however many times it is repeated in either frame.

    Args:
        train_df: frame with ``s1`` and ``s2`` columns.
        benchmark_df: frame with ``s1`` and ``s2`` columns.

    Returns:
        A set of ``(s1, s2)`` tuples; empty when there is no overlap.

    Raises:
        KeyError: if either frame lacks an ``s1`` or ``s2`` column.
    """
    return _pair_set(train_df) & _pair_set(benchmark_df)


if __name__ == "__main__":
    # Load both datasets.
    train_df = pd.read_csv("new_dataset.csv")
    benchmark_df = pd.read_csv("benchmark_dataset.csv")

    # Exact (s1, s2) pairs present in both datasets.
    leaked_data = find_leaked_pairs(train_df, benchmark_df)

    print(f"Number of leaked samples: {len(leaked_data)}")
    if leaked_data:
        print("Warning: You have overlapping data!")