pythonsparqlrdfrdflibblank-nodes

Sparql query returns undesired results when using blank nodes (rdflib)


I use rdflib python library to model a graph of contacts, and perform sparql queries to retrieve who knows who. This works fine when people as added as URIRef, but not when using BNode.

The example graph can be represented as follow:

bob   - knows -> linda
alice - knows -> linda
tom   - knows -> linda
        knows -> bob

Only Tom knows Bob, and no one knows Tom.

I perform the following 2 queries:

  1. The first one to retrieves Tom; it works as expected.
  2. In the second query, I use Tom node id to retrieve who knows him. I expect an empty list. When Tom is added as a URIRef, it works as expected. However, when Tom is added as a BNode, this query returns 3 names!

use_blank_node = True # switch to see the undesired behavior happens only with blank node

pred_knows = URIRef("http://example.org/knows")
pred_named = URIRef("http://example.org/named")

def create_graph() -> Graph:
    graph = Graph()

    bob = URIRef("http://example.org/people/Bob")
    linda = BNode()  # a GUID is generated
    alice = BNode()
    tom = BNode() if use_blank_node else URIRef("http://example.org/people/Tom")
    print(f"{str(tom)=}")
    remy = BNode()

    graph.add((bob, pred_named, Literal("Bob")))
    graph.add((alice, pred_named, Literal("Alice")))
    graph.add((tom, pred_named, Literal("Tom")))
    graph.add((linda, pred_named, Literal("Linda")))
    graph.add((remy, pred_named, Literal("Remy")))

    graph.add((bob, pred_knows, linda))
    graph.add((alice, pred_knows, linda))
    graph.add((tom, pred_knows, linda))
    graph.add((tom, pred_knows, bob))

    return graph


find_tom_who_knows_bob_query = f"""SELECT DISTINCT ?knowsbob ?nameofwhoknowsbob
WHERE 
{{ ?knowsbob <{pred_knows}> <http://example.org/people/Bob> ;
             <{pred_named}> ?nameofwhoknowsbob . 
 }}"""


def find_who_know_tom(tom_id) -> str:
    tom_query = f"_:{tom_id}" if type(tom_id) is BNode else f"<{tom_id}>"

    return f"""SELECT DISTINCT ?nameOfWhoKnowsTom
    WHERE 
    {{ ?iriOfWhoKnowsTom  <{pred_knows}> {tom_query} ;
                          <{pred_named}> ?nameOfWhoKnowsTom}}"""


def main():
    graph = create_graph()
    print("=" * 60, "\n", graph.serialize(), "\n", "=" * 60)

    result = list(graph.query(find_tom_who_knows_bob_query))
    assert len(result) == 1 and len(result[0]) == 2
    tom_id = result[0][0]
    print(f"{str(tom_id)=}")
    assert (type(tom_id) == BNode and use_blank_node) or (type(tom_id) == URIRef and use_blank_node is False)
    assert str(result[0][1]) == "Tom"

    query = find_who_know_tom(tom_id)
    print(query)
    result = list(graph.query(query))
    print(
        "They know Tom:", ", ".join([str(r[0]) for r in result])
    )  # why is it not empty when use_blank_node = True
    # prints: "They know Tom: Bob, Alice, Tom"


if __name__ == "__main__":
    main()


My question: how to correctly use sparql so that the query also works with blank node ?


Solution

  • Blank nodes are similar to free variables. From SPARQL 1.1 Query Language:

    An application writer should not expect blank node labels in a query to refer to a particular blank node in the data.

    Your second query, instead of

    SELECT ?nameOfWhoKnowsTom WHERE {
       ?iriOfWhoKnowsTom ex:knows _:N6fb3b031995c43cfbf3e257ec0c0eac0 ;
                         ex:named ?nameOfWhoKnowsTom .
    }
    

    should be like this one:

    SELECT ?nameOfWhoKnowsTom WHERE {
       ?iriOfWhoKnowsTom ex:knows / ex:named "Tom" ;
                         ex:named ?nameOfWhoKnowsTom .
    }
    

    See also the tag info.