[SOLVED] Boosting documents with term matches in elasticsearch after cosine similarity

Boosting documents with term matches in elasticsearch after cosine similarity

I am using text embeddings stored in Elasticsearch to retrieve documents similar to a query. However, I noticed that in some cases documents that don't contain any of the words from the query receive a higher score than documents that do. So I want to boost the score of documents that contain the words from the query. How do I do this in Elasticsearch?

This is my index mapping:

{
    "mappings": {
        "properties": {
            "question_text": {
                "type": "text"
            },
            "question_vector": {
                "type": "dense_vector",
                "dims": 768
            }
        }
    }
}

I tried the following query:

{
    "query": {
        "script_score": {
            "query": {
                "bool": {
                    "must": [
                        {
                            "more_like_this": {
                                "fields": ["question_text"],
                                "like": query_text,
                                "min_term_freq": 1,
                                "max_query_terms": 12,
                                "minimum_should_match": "3<60%"
                            }
                        }
                    ]
                }
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    },
    "fields": ["question_text"],
    "_source": false
}

But now I only get documents that contain the query words. Is there a way to boost documents that contain the words while still returning documents that don't contain them, just with lower scores?

Solution

{
    "query": {
        "boosting": {
            "positive": {
                "function_score": {
                    "query": {
                        "match_all": {}
                    },
                    "script_score": {
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                            "params": {"query_vector": embedding}
                        }
                    }
                }
            },
            "negative": {
                "bool": {
                    "must_not": [
                        {
                            "more_like_this": {
                                "fields": [
                                    "question_text"
                                ],
                                "like": text,
                                "min_doc_freq": 0,
                                "min_term_freq": 0,
                                "max_query_terms": 12,
                                "minimum_should_match": "3<60%"
                            }
                        }
                    ]
                }
            },
            "negative_boost": 0.8
        }
    },
    "_source": "question_text"
}

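This query selects all the documents and scores each one by cosine similarity (the "positive" part). The "negative" part then matches the documents that do not have matching terms, and their scores are multiplied by `negative_boost` (0.8 here), so they are still returned but ranked lower.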