pdfbox

Identifying the text based on the output in PDF using PDFBOX


I am using PDFBox to get the color information of the text in a PDF. I was able to get the output below by using the following code, but I am unsure what the stroking color represents and what the non-stroking color represents, and how to decide from them which text has which color. Can anyone advise? My current output looks like this: DeviceRGB DeviceCMYK java.awt.Color[r=63,g=240,b=0] java.awt.Color[r=35,g=31,b=32] 34.934998 31.11 31.875

// Loads a PDF, runs one page's content stream through a PDFStreamEngine and
// prints the stroking / non-stroking colour information held by the engine's
// graphics state afterwards.
PDDocument doc = null;
        try {
            doc = PDDocument.load(strFilepath);
            // Engine configured from the PageDrawer operator properties resource.
            PDFStreamEngine engine = new PDFStreamEngine(ResourceLoader.loadProperties("org/apache/pdfbox/resources/PageDrawer.properties"));
            // Index 1 of the page list, i.e. its second entry.
            PDPage page = (PDPage)doc.getDocumentCatalog().getAllPages().get(1);
            engine.processStream(page, page.findResources(), page.getContents().getStream());
            // NOTE(review): the graphics state is read once, after processStream has
            // returned, so it presumably reflects only the colours in effect at the end
            // of the page rather than those of any particular piece of text - confirm.
            PDGraphicsState graphicState = engine.getGraphicsState();
            // Colour space names: stroking first, then non-stroking.
            System.out.println(graphicState.getStrokingColor().getColorSpace().getName());
            System.out.println(graphicState.getNonStrokingColor().getColorSpace().getName());
            // java.awt.Color values: non-stroking first, then stroking.
            System.out.println(graphicState.getNonStrokingColor().getJavaColor()); 
            System.out.println(graphicState.getStrokingColor().getJavaColor());
            // Raw stroking colour components, each printed multiplied by 255.
            float colorSpaceValues[] = graphicState.getStrokingColor().getColorSpaceValue();
            for (float c : colorSpaceValues) {
                System.out.println(c * 255);
            }
        }
        finally {
            // Always release the document, even if processing failed.
            if (doc != null) {
                doc.close();
            }
        }

Solution

  • According to the clarifications in comments the OP wants to

    compare the font colors of one pdf page to another pdf page [...] if there is a text "Sample" in black color and some other text "sample1" in grey color....i need to know that sample--> black color, sample1-->grey color like this..i want the full text and its color

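    PDFBox has a text extraction engine, the PDFTextStripper. There are some challenges in using it for the task at hand, though: the color-setting operators have to be registered with the stripper so that the graphics state carries the current colors, and the rendering mode and colors in effect for each character have to be remembered when the character is processed so that they are still available when the text is written out.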

    For PDFBox 1.8.x

    As indicated, we extend the PDFTextStripper like this:

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.pdfbox.util.PDFTextStripper;
    import org.apache.pdfbox.util.TextPosition;
    
    /**
     * {@link PDFTextStripper} variant (PDFBox 1.8.x API) that writes every extracted
     * character followed by a brace-enclosed annotation of the colours that applied
     * to it, for example <code>P{FILL:RGB 0.803 0.076 0.086;}</code>.
     *
     * <p>While content is processed, the text rendering mode and both current
     * colours are remembered per {@link TextPosition}; {@link #writeString(String, List)}
     * looks them up again when the text is written.
     *
     * <p>NOTE(review): the three lookup maps are only ever added to in this class;
     * confirm that this is acceptable for the size of documents being processed.
     */
    public class ColorTextStripper extends PDFTextStripper
    {
        /** Rendering modes for which the annotation reports the non-stroking (fill) colour. */
        final static List<Integer> FILLING_MODES = Arrays.asList(0, 2, 4, 6);
        /** Rendering modes for which the annotation reports the stroking colour. */
        final static List<Integer> STROKING_MODES = Arrays.asList(1, 2, 5, 6);
        /** Rendering modes for which the annotation carries the CLIP marker. */
        final static List<Integer> CLIPPING_MODES = Arrays.asList(4, 5, 6, 7);

        /** Rendering mode recorded for each processed text position. */
        Map<TextPosition, Integer> renderingMode = new HashMap<TextPosition, Integer>();
        /** Stroking colour components recorded for each processed text position. */
        Map<TextPosition, float[]> strokingColor = new HashMap<TextPosition, float[]>();
        /** Non-stroking colour components recorded for each processed text position. */
        Map<TextPosition, float[]> nonStrokingColor = new HashMap<TextPosition, float[]>();

        /**
         * Creates the stripper, disables suppression of duplicate overlapping text
         * and registers processors for the colour-related operators.
         *
         * @throws IOException if the superclass constructor fails
         */
        public ColorTextStripper() throws IOException
        {
            super();
            setSuppressDuplicateOverlappingText(false);

            // Operators that affect the stroking colour.
            registerOperatorProcessor("CS", new org.apache.pdfbox.util.operator.SetStrokingColorSpace());
            registerOperatorProcessor("SC", new org.apache.pdfbox.util.operator.SetStrokingColor());
            registerOperatorProcessor("SCN", new org.apache.pdfbox.util.operator.SetStrokingColor());
            registerOperatorProcessor("G", new org.apache.pdfbox.util.operator.SetStrokingGrayColor());
            registerOperatorProcessor("RG", new org.apache.pdfbox.util.operator.SetStrokingRGBColor());
            registerOperatorProcessor("K", new org.apache.pdfbox.util.operator.SetStrokingCMYKColor());

            // Operators that affect the non-stroking colour.
            registerOperatorProcessor("cs", new org.apache.pdfbox.util.operator.SetNonStrokingColorSpace());
            registerOperatorProcessor("sc", new org.apache.pdfbox.util.operator.SetNonStrokingColor());
            registerOperatorProcessor("scn", new org.apache.pdfbox.util.operator.SetNonStrokingColor());
            registerOperatorProcessor("g", new org.apache.pdfbox.util.operator.SetNonStrokingGrayColor());
            registerOperatorProcessor("rg", new org.apache.pdfbox.util.operator.SetNonStrokingRGBColor());
            registerOperatorProcessor("k", new org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor());
        }

        /**
         * Records the current rendering mode and colours for the given text position
         * and then hands it on to the superclass.
         *
         * @param text the text position being processed
         */
        @Override
        protected void processTextPosition(TextPosition text)
        {
            // Snapshot the state now; writeString reads it back from the maps.
            nonStrokingColor.put(text, getGraphicsState().getNonStrokingColor().getColorSpaceValue());
            strokingColor.put(text, getGraphicsState().getStrokingColor().getColorSpaceValue());
            renderingMode.put(text, getGraphicsState().getTextState().getRenderingMode());

            super.processTextPosition(text);
        }

        /**
         * Writes one annotated entry per text position; the {@code text} argument
         * itself is not used.
         *
         * @param text the string the positions belong to (ignored)
         * @param textPositions the positions to write out
         * @throws IOException if writing fails
         */
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            for (TextPosition position : textPositions)
            {
                writeString(annotate(position));
            }
        }

        /** Builds the "character{...}" entry for a single text position. */
        private String annotate(TextPosition position)
        {
            final Integer mode = renderingMode.get(position);

            final StringBuilder entry = new StringBuilder();
            entry.append(position.getCharacter());
            entry.append("{");

            if (FILLING_MODES.contains(mode))
            {
                entry.append("FILL:");
                entry.append(toString(nonStrokingColor.get(position)));
                entry.append(';');
            }
            if (STROKING_MODES.contains(mode))
            {
                entry.append("STROKE:");
                entry.append(toString(strokingColor.get(position)));
                entry.append(';');
            }
            if (CLIPPING_MODES.contains(mode))
            {
                entry.append("CLIP;");
            }

            entry.append("}");
            return entry.toString();
        }

        /**
         * Formats colour components as a label followed by the space-separated values.
         * The label is chosen from the number of components only: 1 gives GRAY,
         * 3 gives RGB, 4 gives CMYK and anything else UNKNOWN.
         *
         * @param values the colour components, may be {@code null}
         * @return the formatted colour, or the string "null" for a {@code null} array
         */
        String toString(float[] values)
        {
            if (values == null)
            {
                return "null";
            }

            final String label;
            if (values.length == 1)
            {
                label = "GRAY";
            }
            else if (values.length == 3)
            {
                label = "RGB";
            }
            else if (values.length == 4)
            {
                label = "CMYK";
            }
            else
            {
                label = "UNKNOWN";
            }

            final StringBuilder formatted = new StringBuilder(label);
            for (int i = 0; i < values.length; i++)
            {
                formatted.append(' ').append(values[i]);
            }
            return formatted.toString();
        }
    }
    

    You can call it like this:

    PDFTextStripper stripper = new ColorTextStripper();
    
    PDDocument document = PDDocument.load(SOURCE_FILE);
    
    String text = stripper.getText(document);
    

    The resulting text contains something like this:

    P{FILL:RGB 0.803 0.076 0.086;}e{FILL:RGB 0.803 0.076 0.086;}l{FILL:RGB 0.803 0.076 0.086;}l{FILL:RGB 0.803 0.076 0.086;}e{FILL:RGB 0.803 0.076 0.086;}
    

    and

    G{FILL:RGB 0.102 0.101 0.095;}r{FILL:RGB 0.102 0.101 0.095;}a{FILL:RGB 0.102 0.101 0.095;}z{FILL:RGB 0.102 0.101 0.095;}i{FILL:RGB 0.102 0.101 0.095;}e{FILL:RGB 0.102 0.101 0.095;}
    

    for the Pelle and Grazie from this

    Pelle and Grazie

    or

    K{FILL:RGB 0.0 0.322 0.573;}E{FILL:RGB 0.0 0.322 0.573;}Y{FILL:RGB 0.0 0.322 0.573;}
    

    and

    C{FILL:GRAY 0.0;}o{FILL:GRAY 0.0;}m{FILL:GRAY 0.0;}b{FILL:GRAY 0.0;}i{FILL:GRAY 0.0;}n{FILL:GRAY 0.0;}e{FILL:GRAY 0.0;}d{FILL:GRAY 0.0;}
    

    for KEY and Combined from this:

    KEY and Combined

    Instead of serializing all the information into a String result, you can of course also create some class containing both the color and the character information in a structured way. Just like now the String result is created in writeString, you can change this method to add instances of such a class to some list in it.

    Requirements

    At least PDFBox version 1.8.4 is required to make this work. I tested it using 2.0.0-SNAPSHOT but 1.8.4 should suffice. 1.8.3, on the other hand, has a bug which sometimes forwards the wrong TextPosition objects to writeString, cf. PDFBOX-1804, and earlier versions don't provide a TextPosition collection to writeString at all.

    For PDFBox 2.x

    There were multiple refactorings and other changes in PDFBox 2.x which also concern the code above.

    Ported to PDFBox 2.x it may look like this:

    public class ColorTextStripper extends PDFTextStripper {
        public ColorTextStripper() throws IOException {
            super();
            setSuppressDuplicateOverlappingText(false);
    
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor());
        }
    
        @Override
        protected void processTextPosition(TextPosition text) {
            renderingMode.put(text, getGraphicsState().getTextState().getRenderingMode());
            strokingColor.put(text, getGraphicsState().getStrokingColor().getComponents());
            nonStrokingColor.put(text, getGraphicsState().getNonStrokingColor().getComponents());
    
            super.processTextPosition(text);
        }
    
        Map<TextPosition, RenderingMode> renderingMode = new HashMap<TextPosition, RenderingMode>();
        Map<TextPosition, float[]> strokingColor = new HashMap<TextPosition, float[]>();
        Map<TextPosition, float[]> nonStrokingColor = new HashMap<TextPosition, float[]>();
    
        final static List<RenderingMode> FILLING_MODES = Arrays.asList(RenderingMode.FILL, RenderingMode.FILL_STROKE, RenderingMode.FILL_CLIP, RenderingMode.FILL_STROKE_CLIP);
        final static List<RenderingMode> STROKING_MODES = Arrays.asList(RenderingMode.STROKE, RenderingMode.FILL_STROKE, RenderingMode.STROKE_CLIP, RenderingMode.FILL_STROKE_CLIP);
        final static List<RenderingMode> CLIPPING_MODES = Arrays.asList(RenderingMode.FILL_CLIP, RenderingMode.STROKE_CLIP, RenderingMode.FILL_STROKE_CLIP, RenderingMode.NEITHER_CLIP);
    
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
            for (TextPosition textPosition: textPositions) {
                RenderingMode charRenderingMode = renderingMode.get(textPosition);
                float[] charStrokingColor = strokingColor.get(textPosition);
                float[] charNonStrokingColor = nonStrokingColor.get(textPosition);
    
                StringBuilder textBuilder = new StringBuilder();
                textBuilder.append(textPosition.getUnicode()).append("{");
    
                if (FILLING_MODES.contains(charRenderingMode)) {
                    textBuilder.append("FILL:").append(toString(charNonStrokingColor)).append(';');
                }
    
                if (STROKING_MODES.contains(charRenderingMode)) {
                    textBuilder.append("STROKE:").append(toString(charStrokingColor)).append(';');
                }
    
                if (CLIPPING_MODES.contains(charRenderingMode)) {
                    textBuilder.append("CLIP;");
                }
    
                textBuilder.append("}");
                writeString(textBuilder.toString());
            }
        }
    
        String toString(float[] values)
        {
            if (values == null)
                return "null";
            StringBuilder builder = new StringBuilder();
            switch(values.length) {
            case 1:
                builder.append("GRAY"); break;
            case 3:
                builder.append("RGB"); break;
            case 4:
                builder.append("CMYK"); break;
            default:
                builder.append("UNKNOWN");
            }
            for (float f: values) {
                builder.append(' ')
                       .append(f);
            }
    
            return builder.toString();
        }
    }
    

    (ColorTextStripper)