java pdfbox

How to differentiate between background color and text color?


Can anyone help me differentiate between the background color and the text color of a PDF document when the background color and the text color are the same?

What I actually need is to set a fixed color on invisible text using PDFBox, so that text which is currently invisible becomes visible.

TextObjectInfo contains all the information about a text object, collected using PDFStreamEngine.

/**
 * Re-emits the text objects of a PDF document.
 *
 * <p>The work is done in two passes per page. First, for every recorded
 * {@code TextObjectInfo} an (empty) text object is appended to the page that
 * carries the recorded colors, rendering mode, font and text matrix, plus a
 * marked-content placeholder tagged {@code TextObjectIndex-<n>}. Second, the
 * page content is re-tokenized and each placeholder is replaced by the
 * recorded string followed by a {@code Tj} operator.
 */
public class SimplePdfRegeneretor {
    /** Prefix of the marked-content tag used as a placeholder for one text object. */
    private static final String TEXT_OBJECT_TAG_PREFIX = "TextObjectIndex-";

    private PDDocument _document;
    private PDResources _pageResource;
    private PDFTextObjectInfoExtraction _PDFTextObjectInfoExtraction;
    // One list of text objects per page, indexed by page number (0-based).
    private List<List<TextObjectInfo>> _documentTextObjectInfo;

    /**
     * Runs both passes over every page of the document: appends the
     * placeholder text objects, then rewrites the page content with the
     * placeholders replaced by the recorded strings.
     *
     * @throws IOException if a content stream cannot be written or parsed
     */
    private void RecreatePDF() throws IOException {
        // Pass 1: append one placeholder text object per recorded text object.
        int pageNo = 0;
        for (PDPage page : _document.getPages()) {
            List<TextObjectInfo> pageTextObjectInfo = this._documentTextObjectInfo.get(pageNo);
            try (PDPageContentStream contentStream = new PDPageContentStream(_document,
                    page, AppendMode.APPEND, false, true)) {
                int textObjIndex = 0;
                for (TextObjectInfo textObjInfo : pageTextObjectInfo) {
                    Float xmin = textObjInfo.get_xyminmax().get(0);
                    Float ymin = textObjInfo.get_xyminmax().get(1);
                    putTextOnDocument(contentStream, textObjInfo, textObjInfo.TextFontObject,
                            xmin, ymin, textObjIndex);
                    textObjIndex++;
                }
            }
            pageNo++;
        }

        // Pass 2: re-tokenize each page and swap the placeholders for "<string> Tj".
        pageNo = 0;
        for (PDPage page : _document.getPages()) {
            List<Object> newTokens = addTjStringtoContenStream(page, pageNo);
            PDStream newContents = new PDStream(_document);
            writeTokensToStream(newContents, newTokens);
            page.setContents(newContents);
            System.out.println("Page TextObject Writting Completed.." + pageNo);
            pageNo++;
        }
    }

    /**
     * Writes one text object that sets up the recorded graphics/text state and
     * contains only a marked-content placeholder named
     * {@code TextObjectIndex-<TextObjectIndex>}.
     *
     * @param contentStream   stream the text object is written to
     * @param _textObjInfo    recorded state of the text object
     * @param font            font to select for the text object
     * @param horizontalPixel not used by this method; kept for signature compatibility
     * @param verticalPixel   not used by this method; kept for signature compatibility
     * @param TextObjectIndex index of the text object within its page, encoded in the tag
     * @throws IOException if writing to the content stream fails
     */
    private void putTextOnDocument(PDPageContentStream contentStream, TextObjectInfo _textObjInfo, PDFont font, Float horizontalPixel,
                                                                   Float verticalPixel, int TextObjectIndex) throws IOException {

        String placeholderTag = TEXT_OBJECT_TAG_PREFIX + TextObjectIndex;
        // The last recorded text matrix is the one used for positioning.
        Matrix textMatrix = _textObjInfo.textMatrixs.get(_textObjInfo.textMatrixs.size() - 1);
        // Keep the fractional part: truncating to int changes the rendered size
        // of text set at e.g. 9.5pt.
        float fontSize = _textObjInfo.TextFontSize.floatValue();
        PDGraphicsState graphicsState = _textObjInfo.getGraphicsState();
        PDTextState textState = graphicsState.getTextState();

        contentStream.beginText();
        contentStream.setNonStrokingColor(graphicsState.getNonStrokingColor());
        contentStream.setStrokingColor(graphicsState.getStrokingColor());
        contentStream.setRenderingMode(textState.getRenderingMode());
        contentStream.setFont(font, fontSize);
        contentStream.setTextMatrix(textMatrix);
        contentStream.beginMarkedContent(COSName.getPDFName(placeholderTag));
        contentStream.endMarkedContent();
        contentStream.endText();
    }

    /**
     * Tokenizes the given content and returns a copy of the token list in
     * which every {@code /TextObjectIndex-<n> BMC ... EMC} placeholder is
     * replaced by the recorded string of text object {@code n} followed by
     * {@code Tj}.
     *
     * <p>Marked-content sequences that are not placeholders written by this
     * class are copied through unchanged, so pages that already contain
     * marked content stay balanced.
     *
     * @param contentStream content to tokenize
     * @param _pgInx        0-based page index used to look up the recorded text objects
     * @return the rewritten token list
     * @throws IOException if the content cannot be parsed
     */
    private List<Object> addTjStringtoContenStream(PDContentStream contentStream, int _pgInx) throws IOException {
        PDFStreamParser parser = new PDFStreamParser(contentStream);
        List<Object> newTokens = new ArrayList<>();
        List<TextObjectInfo> pageTextObjInfo = this._documentTextObjectInfo.get(_pgInx);
        System.out.println("Len of _pageTextObjInfo: " + pageTextObjInfo.size());

        // True between a replaced placeholder BMC and its matching EMC; the
        // placeholder is written as BMC immediately followed by EMC.
        boolean skipPlaceholderEnd = false;
        for (Object token = parser.parseNextToken(); token != null; token = parser.parseNextToken()) {
            if (token instanceof Operator) {
                String opName = ((Operator) token).getName();
                if (OperatorName.BEGIN_MARKED_CONTENT.equals(opName)) {
                    int placeholderIndex = placeholderIndexOf(newTokens);
                    if (placeholderIndex >= 0) {
                        // Drop the tag operand and emit "<string> Tj" instead of "/tag BMC".
                        newTokens.remove(newTokens.size() - 1);
                        newTokens.add(pageTextObjInfo.get(placeholderIndex).TjString);
                        newTokens.add(Operator.getOperator("Tj"));
                        skipPlaceholderEnd = true;
                        continue;
                    }
                } else if (skipPlaceholderEnd && OperatorName.END_MARKED_CONTENT.equals(opName)) {
                    // This EMC closes the placeholder that was just replaced.
                    skipPlaceholderEnd = false;
                    continue;
                }
            }
            newTokens.add(token);
        }
        return newTokens;
    }

    /**
     * Returns the text-object index encoded in the last token of {@code tokens}
     * when that token is a placeholder tag ({@code TextObjectIndex-<n>}), or
     * {@code -1} when the list is empty or the last token is anything else.
     */
    private static int placeholderIndexOf(List<Object> tokens) {
        if (tokens.isEmpty()) {
            return -1;
        }
        Object last = tokens.get(tokens.size() - 1);
        if (!(last instanceof COSName)) {
            return -1;
        }
        String tag = ((COSName) last).getName();
        if (!tag.startsWith(TEXT_OBJECT_TAG_PREFIX)) {
            return -1;
        }
        try {
            int index = Integer.parseInt(tag.substring(TEXT_OBJECT_TAG_PREFIX.length()));
            return index >= 0 ? index : -1;
        } catch (NumberFormatException ignored) {
            // A tag that merely shares the prefix is not one of our placeholders.
            return -1;
        }
    }

    /**
     * Serializes the tokens into the given stream, Flate-compressed.
     *
     * @param newContents stream that receives the tokens
     * @param newTokens   tokens to write
     * @throws IOException if writing fails
     */
    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException
    {
        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE))
        {
            ContentStreamWriter writer = new ContentStreamWriter(out);
            writer.writeTokens(newTokens);
        }
    }
}

1. This is the background image: enter image description here

2. This is the text in black, without the background image: enter image description here

3. This is the input document, where the text is visible: enter image description here

4. And this is the output, where the text is invisible (i.e. silver): enter image description here


Solution

  • I am using PDFBox version 2.0+, so I added the following operators in the constructor of my overridden PDFStreamEngine:

    // Register the color operators as stroking / non-stroking pairs so the
    // engine's graphics state carries the colors in effect for each text object.

    // Color-space selection.
    addOperator(new SetStrokingColorSpace());
    addOperator(new SetNonStrokingColorSpace());

    // DeviceCMYK colors.
    addOperator(new SetStrokingDeviceCMYKColor());
    addOperator(new SetNonStrokingDeviceCMYKColor());

    // DeviceRGB colors.
    addOperator(new SetStrokingDeviceRGBColor());
    addOperator(new SetNonStrokingDeviceRGBColor());

    // DeviceGray colors.
    addOperator(new SetStrokingDeviceGrayColor());
    addOperator(new SetNonStrokingDeviceGrayColor());

    // Colors in the currently selected color space.
    addOperator(new SetStrokingColor());
    addOperator(new SetNonStrokingColor());
    addOperator(new SetStrokingColorN());
    addOperator(new SetNonStrokingColorN());
    

    Then I extracted the required information from getGraphicsState(). Please also see https://pdfbox.apache.org/2.0/migration.html, especially the Text Extraction part.