first commit
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
import pandas as pd
|
||||
|
||||
# Train/benchmark leakage check: reports how many (s1, s2) sentence pairs
# occur verbatim in both CSV files.


def build_pair_set(df, first_col="s1", second_col="s2"):
    """Return the set of unique ``(first_col, second_col)`` pairs in *df*.

    Pairs are kept as tuples rather than joined with a separator string, so
    text that happens to contain the separator cannot make two different
    pairs look identical.

    Rows where either column is missing are skipped: a missing value cannot
    be an exact match for anything and would otherwise end up in the set as
    NaN. Remaining values are converted to ``str`` so non-text columns are
    compared by their textual form instead of raising on concatenation.

    Args:
        df: DataFrame containing both columns.
        first_col: Name of the first sentence column.
        second_col: Name of the second sentence column.

    Returns:
        A set of ``(str, str)`` tuples; empty when *df* has no complete rows.

    Raises:
        KeyError: If either column is absent from *df*.
    """
    complete_rows = df[[first_col, second_col]].dropna()
    return set(
        zip(
            complete_rows[first_col].astype(str),
            complete_rows[second_col].astype(str),
        )
    )


def find_leaked_pairs(train_df, benchmark_df, first_col="s1", second_col="s2"):
    """Return the pairs that appear exactly in both *train_df* and *benchmark_df*.

    Args:
        train_df: Training DataFrame.
        benchmark_df: Benchmark DataFrame.
        first_col: Name of the first sentence column in both frames.
        second_col: Name of the second sentence column in both frames.

    Returns:
        A set of ``(str, str)`` tuples present in both frames.
    """
    train_pairs = build_pair_set(train_df, first_col, second_col)
    benchmark_pairs = build_pair_set(benchmark_df, first_col, second_col)
    return train_pairs & benchmark_pairs


def main(train_path="new_dataset.csv", benchmark_path="benchmark_dataset.csv"):
    """Load both CSV files, print the overlap report, and return the overlap.

    Args:
        train_path: Path of the training CSV.
        benchmark_path: Path of the benchmark CSV.

    Returns:
        The set of pairs found in both files.
    """
    # Load both datasets
    train_df = pd.read_csv(train_path)
    benchmark_df = pd.read_csv(benchmark_path)

    # Find any exact matches that exist in both sets
    leaked_data = find_leaked_pairs(train_df, benchmark_df)

    # The count is over unique pairs, not rows: duplicates within a file
    # collapse to a single entry.
    print(f"Number of leaked samples: {len(leaked_data)}")
    if leaked_data:
        print("Warning: You have overlapping data!")
    return leaked_data


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user