python lucene pylucene

How can one retrieve a particular field from all the indexed documents using PyLucene?


In Java this can be done using "MatchAllDocsQuery()", but I could not find any PyLucene documentation that explains how to do it.

This is the Python code I use to run individual queries and then extract all the fields from the retrieved documents.

# Filesystem path of the Lucene index; opened via SimpleFSDirectory in the
# __main__ block. Replace the placeholder with the real index location.
INDEX_DIR = "directory/where/the/document/index/is/stored"

import sys, os, lucene

from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher

def run(searcher, analyzer):
    """Interactively run queries and print the "doi" field of each hit.

    Prompts for a query string in a loop, parses it against the "contents"
    field, retrieves at most 50 hits and prints the "doi" value of every
    retrieved document. Entering an empty query ends the loop.

    Note that the "total matching documents" line reports the number of
    hits actually retrieved, which is capped at 50.

    Args:
        searcher: IndexSearcher used to execute queries and load documents.
        analyzer: Analyzer passed to QueryParser for parsing the query text.

    Raises:
        KeyError: If a retrieved document has no "doi" field.
    """
    # The parser does not depend on the query text, so build it once
    # instead of on every loop iteration.
    parser = QueryParser("contents", analyzer)

    while True:
        # A bare `print` is a no-op expression in Python 3; it must be
        # called to actually emit the blank separator line.
        print()
        print("Hit enter with no input to quit.")
        command = input("Query:")
        if command == '':
            return

        print()
        print("Searching for:", command)
        query = parser.parse(command)
        #query = "MatchAllDocsQuery()"
        scoreDocs = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # Map every field name of the document to its string value.
            table = {field.name(): field.stringValue() for field in doc.getFields()}
            print(table['doi'])
            #print('path:', doc.get("path"), 'name:', doc.get("name"), 'title:', doc.get("text"))


if __name__ == '__main__':
    # Start the JVM before creating any Lucene objects.
    lucene.initVM()
    print('lucene', lucene.VERSION)

    directory = SimpleFSDirectory.open(Paths.get(INDEX_DIR))
    try:
        print("Directory name is given below")
        print(directory)

        reader = DirectoryReader.open(directory)
        try:
            searcher = IndexSearcher(reader)
            print(searcher)
            analyzer = StandardAnalyzer()

            # Calling the run function for execution
            run(searcher, analyzer)
        finally:
            # Dropping the Python reference to the searcher does not
            # release the index files; the reader must be closed explicitly.
            reader.close()
    finally:
        directory.close()

Solution

  • A minor change to the query makes Lucene retrieve all the indexed documents: simply set the command variable to a wildcard pattern (command = ".*."). The .*. pattern searches all the fields and field values in all the documents (using the asterisk wildcard).

    # Filesystem path of the Lucene index; opened via SimpleFSDirectory in the
    # __main__ block. Replace the placeholder with the real index location.
    INDEX_DIR = "directory/where/the/document/index/is/stored"
    
    import sys, os, lucene
    
    from java.nio.file import Paths
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.store import SimpleFSDirectory
    from org.apache.lucene.search import IndexSearcher
    
    def run(searcher, analyzer):
        """Print the "doi" field of every document in the index.

        Uses MatchAllDocsQuery rather than parsing a wildcard string such as
        ".*.": the classic QueryParser treats that string as a wildcard term
        on the default field, not as a match-all query. The hit limit is
        taken from the reader's live document count so that no document is
        cut off by a fixed cap.

        Args:
            searcher: IndexSearcher used to execute the query and load
                documents.
            analyzer: Unused, since no query text is parsed; kept so that
                existing callers do not need to change.

        Raises:
            KeyError: If a retrieved document has no "doi" field.
        """
        # Imported here so the function brings its own dependency into scope.
        from org.apache.lucene.search import MatchAllDocsQuery

        query = MatchAllDocsQuery()
        print("Searching for:", query)

        # search() rejects a non-positive hit count, so request at least one
        # hit even when the index is empty.
        limit = max(1, searcher.getIndexReader().numDocs())
        scoreDocs = searcher.search(query, limit).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # Map every field name of the document to its string value.
            table = {field.name(): field.stringValue() for field in doc.getFields()}
            print(table['doi'])
            #print('path:', doc.get("path"), 'name:', doc.get("name"), 'title:', doc.get("text"))
    
    
    if __name__ == '__main__':
        # Start the JVM before creating any Lucene objects.
        lucene.initVM()
        print('lucene', lucene.VERSION)

        directory = SimpleFSDirectory.open(Paths.get(INDEX_DIR))
        try:
            print("Directory name is given below")
            print(directory)

            reader = DirectoryReader.open(directory)
            try:
                searcher = IndexSearcher(reader)
                print(searcher)
                analyzer = StandardAnalyzer()

                # Calling the run function for execution
                run(searcher, analyzer)
            finally:
                # Dropping the Python reference to the searcher does not
                # release the index files; the reader must be closed explicitly.
                reader.close()
        finally:
            directory.close()