pythonelasticsearchopensearchamazon-opensearch

Opensearch/Elasticsearch Must_Not Clause not Accepting a List


I'm trying to write an Opensearch query which will exclude any results which match a list of terms. Per what I can tell in Opensearch documentation, this sort of behavior should be supported. However, of the two queries I have below, only the one which doesn't use a list (working_query) works. Unfortunately, googling / searching stackoverflow for the error hasn't yielded any results that seem relevant. Thanks in advance!

# Query that the post reports as working: the excluded field value under
# "must_not" is a single string.
working_query = {
    "bool": {
        "must": [
            {"match_phrase": {"textContent": {"query": "must_phrase", "slop": 10}}},
            {"bool": {"should": [{"match_phrase": {"textContent": "should_phrase"}}]}},
        ],
        "must_not": [
            # One match_phrase clause carrying one plain string value.
            {"match_phrase": {"folderIdentifier.identifierValue": "elem1"}},
        ],
    },
}

# Query that the post reports as failing with
# RequestError(400, 'x_content_parse_exception', ... failed to parse field [must_not]).
# It differs from the working query only in the "must_not" value, which is a list.
broken_query = {
    "bool": {
        "must": [
            {"match_phrase": {"textContent": {"query": "must_phrase", "slop": 10}}},
            {"bool": {"should": [{"match_phrase": {"textContent": "should_phrase"}}]}},
        ],
        "must_not": [
            # Deliberately kept as a list: this is the construct under question.
            {"match_phrase": {"folderIdentifier.identifierValue": ["elem1", "elem2"]}},
        ],
    },
}

Error:

---------------------------------------------------------------------------
RequestError                              Traceback (most recent call last)
Cell In[56], line 25
      1 broken_query =  {
      2     "bool": {
      3         "must": [
   (...)
     23     }
     24 }
---> 25 test=search_docs_full(broken_query)

Cell In[11], line 29, in search_docs_full(text_query)
     25 if last_hit != 0:
     26     query['search_after'] = [last_hit]
---> 29 results = searchwrapper.client.search(index=index_name,body=query, request_timeout=1200)
     30 if len(results['hits']['hits']) != 0:
     31     previous_hit = last_hit

File ~/.local/lib/python3.11/site-packages/opensearchpy/client/utils.py:181, in query_params.<locals>._wrapper.<locals>._wrapped(*args, **kwargs)
    178         if v is not None:
    179             params[p] = _escape(v)
--> 181 return func(*args, params=params, headers=headers, **kwargs)

File ~/.local/lib/python3.11/site-packages/opensearchpy/client/__init__.py:1742, in OpenSearch.search(self, body, index, params, headers)
   1739 if "from_" in params:
   1740     params["from"] = params.pop("from_")
-> 1742 return self.transport.perform_request(
   1743     "POST",
   1744     _make_path(index, "_search"),
   1745     params=params,
   1746     headers=headers,
   1747     body=body,
   1748 )

File ~/.local/lib/python3.11/site-packages/opensearchpy/transport.py:448, in Transport.perform_request(self, method, url, params, body, timeout, ignore, headers)
    446             raise e
    447     else:
--> 448         raise e
    450 else:
    451     # connection didn't fail, confirm its live status
    452     self.connection_pool.mark_live(connection)

File ~/.local/lib/python3.11/site-packages/opensearchpy/transport.py:409, in Transport.perform_request(self, method, url, params, body, timeout, ignore, headers)
    406 connection = self.get_connection()
    408 try:
--> 409     status, headers_response, data = connection.perform_request(
    410         method,
    411         url,
    412         params,
    413         body,
    414         headers=headers,
    415         ignore=ignore,
    416         timeout=timeout,
    417     )
    419     # Lowercase all the header names for consistency in accessing them.
    420     headers_response = {
    421         header.lower(): value for header, value in headers_response.items()
    422     }

File ~/.local/lib/python3.11/site-packages/opensearchpy/connection/http_requests.py:232, in RequestsHttpConnection.perform_request(self, method, url, params, body, timeout, allow_redirects, ignore, headers)
    219 if (
    220     not (200 <= response.status_code < 300)
    221     and response.status_code not in ignore
    222 ):
    223     self.log_request_fail(
    224         method,
    225         url,
   (...)
    230         raw_data,
    231     )
--> 232     self._raise_error(
    233         response.status_code,
    234         raw_data,
    235         response.headers.get("Content-Type"),
    236     )
    238 self.log_request_success(
    239     method,
    240     url,
   (...)
    245     duration,
    246 )
    248 return response.status_code, response.headers, raw_data

File ~/.local/lib/python3.11/site-packages/opensearchpy/connection/base.py:316, in Connection._raise_error(self, status_code, raw_data, content_type)
    313 except (ValueError, TypeError) as err:
    314     logger.warning("Undecodable raw error response from server: %s", err)
--> 316 raise HTTP_EXCEPTIONS.get(status_code, TransportError)(
    317     status_code, error_message, additional_info
    318 )

RequestError: RequestError(400, 'x_content_parse_exception', '[1:325] [bool] failed to parse field [must_not]')

Solution

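  • match_phrase doesn't support an array as its value. Replace the single clause with a bool/should clause (wrapped in its own { } inside must_not) that holds one match_phrase per value. That is, you need to change this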

    {"match_phrase":{"folderIdentifier.identifierValue":['elem1','elem2']}}
    

    to this

    "bool":{
    
        "should":[
        {"match_phrase":{"folderIdentifier.identifierValue":'elem1'}},
        {"match_phrase":{"folderIdentifier.identifierValue":'elem2'}}]}