pythonpandascsv

Am I correctly generating a list of randomly assigned pairs with exclusions in python?


I have an array of names and roles of people within a company: Example array:

names_and_titles = [
    ("Samantha Reyes", "Innovation", "Product Owner"),
    ("Ethan McAllister", "Data Scientist"),
    ("Priya Deshmukh", "Data Architect", "SMT"),
    ("Marcus Liu", "Stream 3"),
    ("Elena Petrova", "SMT", "Stream 3"),
]

I also have a csv file with all of the previous pairs that have been generated.

I want to create a list of paired individuals that have not been paired before (unique pair) which also do not share the same role e.g. "Marcus Liu" and "Elena Petrova" cannot be paired together as they share the same role "Stream 3".

This is my code for generating unique pairs and saving the generated pairs back into the CSV file.

import random
import pandas as pd

# An array of all names and titles
names_and_titles = [
    ("Samantha Reyes", "Innovation", "Product Owner"),
    ("Ethan McAllister", "Data Scientist"),
    ("Priya Deshmukh", "Data Architect", "SMT"),
    ("Marcus Liu", "Stream 3"),
    ("Elena Petrova", "SMT", "Stream 3"),
]

# Load previously generated pairs from the CSV file
def load_seen_pairs_from_csv(prev_pairs2):
    df = pd.read_csv(prev_pairs2)
    seen_pairs = set()
    for _, row in df.iterrows():
        pair = (row['name1'], row['name2'])
        reverse_pair = (row['name2'], row['name1'])
        seen_pairs.add(pair)
        seen_pairs.add(reverse_pair)
    return seen_pairs

# Save new pairs to the CSV file
def save_pairs_to_csv(prev_pairs2, pairs):
    df = pd.DataFrame(pairs, columns=['name1', 'name2'])
    df.to_csv(prev_pairs2, index=False, mode='a', header=False)

# Path to the CSV file containing previously generated pairs
csv_file_path = 'prev_pairs2.csv'

# Initialize the seen_pairs set with previously created pairs
seen_pairs = load_seen_pairs_from_csv(csv_file_path)

# Excluded pairs for pairing logic
excluded_pairs = []

def create_unique_pairs_with_debugging_and_fallback(names_and_titles, seen_pairs, excluded_pairs):
    excluded_set = set(excluded_pairs) | set((pair[1], pair[0]) for pair in excluded_pairs)  # Include reverse pairs
    max_retries = 10000  # Increase the number of retries to improve pairing chances
    retries = 0

    while retries < max_retries:
        random.shuffle(names_and_titles)
        pairs = []
        used_names = set()  # Track names already paired in this run
        skipped = []  # Track skipped individuals for debugging

        for i in range(0, len(names_and_titles) - 1, 2):
            person1, person2 = names_and_titles[i], names_and_titles[i + 1]
            roles1 = set(person1[2:]) if len(person1) > 2 else set()  # Handle blank roles
            roles2 = set(person2[2:]) if len(person2) > 2 else set()  # Handle blank roles
            pair = (person1[0], person2[0])
            reverse_pair = (person2[0], person1[0])

            # Debugging: Log why pairs are skipped
            if pair in excluded_set or reverse_pair in excluded_set:
                print(f"Skipping pair due to exclusion: {person1[0]} - {person2[0]}")
                continue
            if pair in seen_pairs or reverse_pair in seen_pairs:
                print(f"Skipping pair due to seen pair: {person1[0]} - {person2[0]}")
                continue
            if roles1 & roles2:  # Skip if roles overlap
                print(f"Skipping pair due to role overlap: {person1[0]} ({roles1}) - {person2[0]} ({roles2})")
                continue
            if person1[0] in used_names or person2[0] in used_names:  # Avoid duplicate pairings in this run
                print(f"Skipping pair due to duplicate usage: {person1[0]} - {person2[0]}")
                continue

            pairs.append((person1, person2))
            used_names.update([person1[0], person2[0]])  # Mark names as used

        # Add skipped individuals to the next round
        skipped = [person for person in names_and_titles if person[0] not in used_names]
        if not skipped:  # If no one is skipped, pairing is complete
            pairs_set = set((pair[0][0], pair[1][0]) for pair in pairs)
            if not any((pair in seen_pairs or (pair[1], pair[0]) in seen_pairs) for pair in pairs_set):
                seen_pairs.update(pairs_set)
                return pairs

        retries += 1

    # If retries are exhausted, raise an error with debugging information
    print("Unable to generate unique pairs with the given restrictions.")
    print(f"Skipped individuals: {[person[0] for person in skipped]}")
    raise ValueError("Unable to generate unique pairs with the given restrictions.")

# Generate unique pairs with debugging and fallback logic
unique_pairs = create_unique_pairs_with_debugging_and_fallback(names_and_titles, seen_pairs, excluded_pairs)

# Print the unique pairs with names only
for pair in unique_pairs:
    print(f"{pair[0][0]} - {pair[1][0]}")
print("-----")

# Print the total number of pairs
print(f"Total pairs: {len(unique_pairs)}")

# Save the unique pairs to the CSV file
save_pairs_to_csv(csv_file_path, [(pair[0][0], pair[1][0]) for pair in unique_pairs])

I was expecting to be given a list of pairs who are unique and do not share the same role. However I am generating a list of pairs where a few pairings share the same role.

For those wondering this is the format of the prev_pairs2.csv file:

name1,name2
Ava Thompson,Noah Bennett
Liam Carter,Zara Mahmood
Maya Chen,Oliver Grant
Nina Kowalski,Tomás Rivera
Rajiv Mehta,Grace O'Connor
Daniel Okoro,Isla McKenzie

Solution

  • I believe your problem lies in the fact that:

            roles1 = set(person1[2:]) if len(person1) > 2 else set()  # Handle blank roles
            roles2 = set(person2[2:]) if len(person2) > 2 else set()  # Handle blank roles
            
    

    Should be:

            # check here --------v----xxx not needed
            roles1 = set(person1[1:]) # if len(person1) > 1 else set()  # Handle blank roles
            roles2 = set(person2[1:]) # if len(person2) > 1 else set()  # Handle blank roles
            
    

    as python uses 0-base indexing, and your person items look like:

    #[0:     [1:      [2:      [3:
    ('Name', 'Role1', 'Role2', ...)
    

    The way it is currently written, you are omitting the first role for each person. Therefore, it is normal if you observe overlaps ;)

    Edit: As stated per @simon in the comments, the if ... else ... part is not even needed.