elasticsearch, nlp, sentence-similarity

Boosting documents with term matches in elasticsearch after cosine similarity


I am using text embeddings stored in Elasticsearch to retrieve documents similar to a query. However, I noticed that in some cases documents that don't contain any of the words from the query are returned with a higher score. So I want to boost the score of documents that do contain words from the query. How can I do this in Elasticsearch?

This is my index mapping:

{
  "mappings": {
    "properties": {
      "question_text": { "type": "text" },
      "question_vector": { "type": "dense_vector", "dims": 768 }
    }
  }
}

I tried the following query:

{
  "query": {
    "script_score": {
      "query": {
        "bool": {
          "must": [
            {
              "more_like_this": {
                "fields": ["question_text"],
                "like": query_text,
                "min_term_freq": 1,
                "max_query_terms": 12,
                "minimum_should_match": "3<60%"
              }
            }
          ]
        }
      },
      "script": {
        "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
        "params": { "query_vector": query_vector }
      }
    }
  },
  "fields": ["question_text"],
  "_source": false
}

But now I only get documents that contain the query words. Is there a way to boost documents with term matches while still returning documents that don't contain the words, just with lower scores?


Solution

  • {
        "query": {
            "boosting": {
                "positive": {
                    "function_score": {
                        "query": {
                            "match_all": {}
                        },
                        "script_score": {
                            "script": {
                                "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                                "params": {"query_vector": embedding}
                            }
                        }
                    }
                },
                "negative": {
                    "bool": {
                        "must_not": [
                            {
                                "more_like_this": {
                                    "fields": [
                                        "question_text"
                                    ],
                                    "like": text,
                                    "min_doc_freq": 0,
                                    "min_term_freq": 0,
                                    "max_query_terms": 12,
                                    "minimum_should_match": "3<60%"
                                }
                            }
                        ]
                    }
                },
                "negative_boost": 0.8
            }
        },
        "_source": "question_text"
    }
    

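    This query selects all documents and scores them by cosine similarity. It then lowers the scores of documents that do not have matching terms: every document that matches the `negative` clause (that is, one that does not match the `more_like_this` query) has its score multiplied by `negative_boost` (0.8 here), so it is still returned but ranked lower.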