javamultithreadingjava.util.concurrentpdfclown

How to improve performance for search keywords highlighting on file using pdfclown


I am using pdfclown and below code is taking around 100 seconds to highlighting search keywords in same file.Kindly provide your inputs for improving performance in below code.Please find the jar path in below url to run this code. https://drive.google.com/drive/folders/1nW8bk6bcAG6g7LZYy2YAAMk46hI9IPUh

import java.awt.Color;
import java.awt.Desktop;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.File;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;

import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
import org.pdfclown.tools.TextExtractor;

public class pdfclown2 {
    private static int count;

    public static void main(String[] args) throws IOException {

        highlight("book.pdf","C:\\Users\\\Downloads\\6.pdf");
        System.out.println("OK");
    }
    private static void highlight(String inputPath, String outputPath) throws IOException {

        URL url = new URL(inputPath);
        InputStream in = url.openStream();
        org.pdfclown.files.File file = null;
        //"C:\\Users\\Desktop\\pdf\\80743064.pdf"
        try {
            file = new org.pdfclown.files.File("C:\\Users\\uc23\\Desktop\\pdf\\80743064.pdf);

        Map<String, String> m = new HashMap<String, String>();
    for(int i=0;i<3500;i++){

        if(i<=2){
        m.put("The","hi");
        m.put("know","hello");
        m.put("is","Welcome");
        }else{
            m.put(""+i,"hi");
        }
    }

        System.out.println("map size"+m.size());
         long startTime = System.currentTimeMillis();

        for (Map.Entry<String, String> entry : m.entrySet()) {

            Pattern pattern;
            String serachKey =  entry.getKey().toLowerCase();
            final String translationKeyword = entry.getValue();

                if ((serachKey.contains(")") && serachKey.contains("("))
                        || (serachKey.contains("(") && !serachKey.contains(")"))
                        || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                        || serachKey.contains("*") || serachKey.contains("+")) {
                    pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
                }
                else
                     pattern = Pattern.compile( "\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);


            // 2. Iterating through the document pages...
            TextExtractor textExtractor = new TextExtractor(true, true);
            for (final Page page : file.getDocument().getPages()) {
                // 2.1. Extract the page text!
                Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
            //System.out.println(textStrings.toString().indexOf(entry.getKey()));

                // 2.2. Find the text pattern matches!
                final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings).toLowerCase());
                // 2.3. Highlight the text pattern matches!
                textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
                    public boolean hasNext() {
                        // System.out.println(matcher.find());
                        // if(key.getMatchCriteria() == 1){
                        if (matcher.find()) {
                            return true;
                        }
                        /*
                         * } else if(key.getMatchCriteria() == 2) { if
                         * (matcher.hitEnd()) { count++; return true; } }
                         */
                        return false;

                    }

                    public Interval<Integer> next() {
                        return new Interval<Integer>(matcher.start(), matcher.end());
                    }

                    public void process(Interval<Integer> interval, ITextString match) {
                        // Defining the highlight box of the text pattern
                        // match...
                        System.out.println(match);
                        List<Quad> highlightQuads = new ArrayList<Quad>();
                        {
                            Rectangle2D textBox = null;
                            for (TextChar textChar : match.getTextChars()) {
                                Rectangle2D textCharBox = textChar.getBox();
                                if (textBox == null) {
                                    textBox = (Rectangle2D) textCharBox.clone();
                                } else {
                                    if (textCharBox.getY() > textBox.getMaxY()) {
                                        highlightQuads.add(Quad.get(textBox));
                                        textBox = (Rectangle2D) textCharBox.clone();
                                    } else {
                                        textBox.add(textCharBox);
                                    }
                                }
                            }
                            textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
                            highlightQuads.add(Quad.get(textBox));
                        }

                        new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);

                    }

                    public void remove() {
                        throw new UnsupportedOperationException();
                    }

                });
            }

        }

        SerializationModeEnum serializationMode = SerializationModeEnum.Incremental;

            file.save(new java.io.File(outputPath), serializationMode);

            System.out.println("file created");
            long endTime = System.currentTimeMillis();

             System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);

        } catch (Exception e) {
               e.printStackTrace();
        }
        finally{
            in.close();
        }


    }
}

Solution

  • My guess is that process is the bottle neck, which can be easily tested (comment the code out). Measure times. A good time for profiling the application.

    A simple heuristic optimisation: taking the first and last TextChar rectangles for one liners, and considering font ascenders and descenders, one could create ab entire rectangle. That would already speed things up.

    Alternatives probably exist. Place a more specific question.

    Further improvements:

        InputStream in = url.openStream();
    

    should be

        InputStream in = new BufferedInputStream(url.openStream());
    

    And the multiply searchKey.contains might possibly be a Pattern declared before the loop.

    The same technique might be done for the original highlighting code, but then multi-line support should be added, a Quad for every line.

    The textExtractor is reused for every page which seems the fastest way, but try declare it in the page loop.

    I hope you get a more concrete answer, though I doubt it, hence this one. Better would have been to isolate the slow code from the entirety. But I understand the wish for overall performance gain.


    A less precise, maybe faster highlight code:

                        List<TextChar> textChars = match.getTextChars();
                        Rectangle2D firstRect = textChars.get(0).getBox();
                        Rectangle2D lastRect = textChars.get(textChars.size() - 1).getBox();
                        Rectangle2D rect = firstRect.createUnion(lastRect);
                        highlightQuads.add(Quad.get(rect));
    

    After other comment

    It seems that the bottle neck lies elsewhere. My guess is the text extraction then: so invert the two loops:

    TextExtractor textExtractor = new TextExtractor(true, true);
    for (final Page page : file.getDocument().getPages()) {
    
        for (Map.Entry<String, String> entry : m.entrySet()) {
            Pattern pattern;
            String serachKey =  entry.getKey().toLowerCase();
            final String translationKeyword = entry.getValue();
    
            if ((serachKey.contains(")") && serachKey.contains("("))
                        || (serachKey.contains("(") && !serachKey.contains(")"))
                        || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                        || serachKey.contains("*") || serachKey.contains("+")) {
                    pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
            }
            else
                 pattern = Pattern.compile( "\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);
    

    It probably makes sense to have a map of Pattern as Pattern.compile is slow.

    And then I am out of ideas / have other things to do.