python algorithm graph graph-theory network-flow

Finding minimum weighted matching in sink source graph

I have three lists of nodes. sources, sinks, and pipes. there is a directed weighted graph from sources to pipes to sinks. Sources are only connected to pipes and pipes only to sinks. But sources are not directly connected to sinks. Pipes are zero-sum, meaning that the sum of the weights that come to each pipe from sources is equal to the sum of the edges that go from that pipe to sinks.

I would like to add the minimum number of edges to this graph from sinks back to sources so that sinks and sources also become zero-sum. I know this problem is np-complete I'm interested to see if there is any good polynomial approximation to this problem that would work in real life.

In simpler words: I have a list of sinks and sources. Each sink has a negative number and each source has a positive number so that the sum of all the numbers in the nodes of the graph are zero(no edges so far). I would like to add the minimum number of edges to this graph so that the sum of the weights of the edges going out/in to each node becomes equal to the number on that node.

Here is a sample Code for testing if a graph summarizes another graph:

from functools import reduce
from collections import Counter

source_edges = {
    "a0": {"p0": 1, "p2": 5}, 
    "a1": {"p0": 2}, 
    "a2": {"p1": 3}
}
sink_edges = {
    "b0": {"p0": 1},
    "b1": {"p0": 1, "p1": 1},
    "b2": {"p0": 1, "p1": 2, "p2": 5},
}
res = {
    "a0": {"b0": 1, "b2": 5}, 
    "a1": {"b1": 2}, 
    "a2": {"b2": 3}
}

sink_degs1 = {k: sum(v.values()) for k, v in sink_edges.items()} 
sink_degs2 = dict(reduce(lambda x, y: x + y, (Counter(v) for v in res.values())))
source_degs1 ={k: sum(v.values()) for k, v in res.items()} 
source_degs2 ={k: sum(v.values()) for k, v in source_edges.items()}

if sink_degs1 == sink_degs2 and source_degs1 == source_degs2:
    print('res summerizes the graph')
else:
    print('res does not summerize this graph')

And a visualization of this graph:

Solution

This gives a sub-optimum solution with less than n-1 edges.

from numpy.random import randint
from collections import defaultdict
import copy


def create_sample(source_count=5000, sink_count=200):
    diff = -1
    while diff < 0:
        sinks = [["b" + str(i), randint(source_count)] for i in range(sink_count)]
        sources = [["a" + str(i), randint(sink_count)] for i in range(source_count)]
        sink_sum = sum([x[1] for x in sinks])
        source_sum = sum([x[1] for x in sources])
        diff = sink_sum - source_sum
    avg_refill = diff // source_count + 1
    weights_match = False
    while not weights_match:
        for i in range(source_count):
            if not diff:
                break
            rnd = randint(avg_refill * 2.5) if diff > 10 * (avg_refill) else diff
            diff -= rnd
            sources[i][1] += rnd
        weights_match = sum([x[1] for x in sources]) == sum([x[1] for x in sinks])
    return sources, sinks


def solve(sources, sinks):
    src = sorted(copy.deepcopy(sources), key=lambda x: x[1])
    snk = sorted(copy.deepcopy(sinks), key=lambda x: x[1])
    res = []
    while snk:
        if src[0][1] > snk[0][1]:
            edge = (src[0][0], *snk[0])
            src[0][1] -= snk[0][1]
            del snk[0]
        elif src[0][1] < snk[0][1]:
            edge = (src[0][0], snk[0][0], src[0][1])
            snk[0][1] -= src[0][1]
            del src[0]
        else:
            edge = (src[0][0], *snk[0])
            del src[0], snk[0]
        res += [edge]
    return res


def test(sources, sinks):
    res = solve(sources, sinks)
    d_sources = defaultdict(int)
    d_sinks = defaultdict(int)
    w_sources = defaultdict(int)
    w_sinks = defaultdict(int)
    for a, b, c in res:
        d_sources[a] += 1
        d_sinks[b] += 1
        w_sources[a] += c
        w_sinks[b] += c
    print("source " + ("is" if dict(sources) == w_sources else "isn't") + " source")
    print("sink " + ("is" if dict(sinks) == w_sinks else "isn't") + " sink")
    print(
        f"source:\n \tdeg_sum = {sum(d_sources.values())}\n\tmax_deg = {max(d_sources.values())}"
    )
    print(
        f"sink:\n \tdeg_sum = {sum(d_sinks.values())}\n\tmax_deg = {max(d_sinks.values())}"
    )

Here is a sample run:

In [1]: %run solver.py
In [2]: test(*create_sample())
source is source
sink is sink
source:
        deg_sum = 5196
        max_deg = 3
sink:
        deg_sum = 5196
        max_deg = 56

Here is an illustration of how it works:

sources: 4,5,3,2
sinks: 2,7,2,2,1

sorted:
        55555|44|44|33|32|2
        77777|77|22|22|22|1
So we have 6 edges.

Here is a comparison between sorted and unsorted solution with this algorithm:

---------------------------------------------
|                (1000,1000)                |
---------------------------------------------
| criteria          | sorted | random order |
| source degree sum | 1991   | 1999         |
| source max degree | 3      | 7            |
| sink degreee sum  | 1991   | 1999         |
| sink max degree   | 3      | 8            |
---------------------------------------------

---------------------------------------------
|                (200,5000)                 |
---------------------------------------------
| criteria          | sorted | random order |
| source degree sum | 5198   | 5198         |
| source max degree | 2      | 3            |
| sink degreee sum  | 5198   | 5198         |
| sink max degree   | 43     | 54           |
---------------------------------------------