pdfitextrectangles

How to find all rectangles in a PDF using iText


I have an MS Word document containing a text box (rectangle), which I have successfully converted to PDF using LibreOffice. How can I find all text boxes (rectangles) in the PDF, and how should I interpret the coordinates of a rectangle?

@Override
public void modifyPath(PathConstructionRenderInfo renderInfo) {
    if (renderInfo.getOperation() == PathConstructionRenderInfo.RECT) {
        float x = renderInfo.getSegmentData().get(0);
        float y = renderInfo.getSegmentData().get(1);
        float w = renderInfo.getSegmentData().get(2);
        float h = renderInfo.getSegmentData().get(3);
        Vector a = new Vector(x, y, 1).cross(renderInfo.getCtm());
        Vector c = new Vector(x + w, y + h, 1).cross(renderInfo.getCtm());

With a listener that implements ExtRenderListener as shown above, I can only find the page (A4) rectangle; it does not find the (text box) rectangle that contains all the content in a page.


Solution

  • As Bruno pointed out, the problem is that you may be faced with rectangles that are only defined by line-to or move-to operations.

    You will need to keep track of all line-drawing operations and 'aggregate' them as soon as they connect (that is, whenever a line is drawn whose start or end point matches the start or end point of an already known line).

    /**
     * Event listener that collects the straight line segments of black, fill-only paths and
     * groups segments that share an end point into clusters. Each cluster can then be reduced
     * to its axis-aligned bounding box via {@link #getBoundingBoxes()}.
     *
     * <p>Clusters are tracked with a union-find structure keyed by the sequential id given to
     * each line the first time it is seen.
     *
     * <p>NOTE(review): the points are used exactly as stored in the path segments; this class
     * never applies a transformation matrix to them. If the page content uses a non-identity
     * CTM, the resulting coordinates presumably still need to be transformed -- confirm
     * against the iText documentation for {@code PathRenderInfo}.
     */
    public class RectangleFinder implements IEventListener {

        /** Every line seen so far, mapped to the sequential id it was assigned. */
        private final Map<Line, Integer> knownLines = new HashMap<>();

        /** Union-find parent links (line id -> parent id); an id without an entry is a root. */
        private final Map<Integer, Integer> clusters = new HashMap<>();

        /**
         * Records every {@link Line} segment of a path that is filled (and only filled) with
         * black. All other events and paths are ignored.
         *
         * @param data the event payload; only {@code PathRenderInfo} instances are processed
         * @param type the event type (not inspected)
         */
        public void eventOccurred(IEventData data, EventType type) {
            if (!(data instanceof PathRenderInfo)) {
                return;
            }
            PathRenderInfo pathRenderInfo = (PathRenderInfo) data;
            pathRenderInfo.preserveGraphicsState();
            Path path = pathRenderInfo.getPath();
            if (pathRenderInfo.getOperation() == PathRenderInfo.NO_OP) {
                return;
            }
            if (pathRenderInfo.getOperation() != PathRenderInfo.FILL) {
                return;
            }
            if (!isBlack(pathRenderInfo.getFillColor())) {
                return;
            }
            for (Subpath subpath : path.getSubpaths()) {
                for (IShape segment : subpath.getSegments()) {
                    if (segment instanceof Line) {
                        lineOccurred((Line) segment);
                    }
                }
            }
        }

        /**
         * Tells whether a colour is a single-component {@code IccBased} or {@code DeviceGray}
         * colour whose only component is exactly {@code 0}.
         *
         * @param c the colour to test; {@code null} yields {@code false}
         * @return {@code true} if the colour is treated as black by this finder
         */
        private boolean isBlack(Color c) {
            if (c instanceof IccBased || c instanceof DeviceGray) {
                return c.getNumberOfComponents() == 1 && c.getColorValue()[0] == 0.0f;
            }
            return false;
        }

        /**
         * Registers a line and merges its cluster with the cluster of every known line that
         * shares an end point with it.
         *
         * @param line the line segment that was encountered
         */
        private void lineOccurred(Line line) {
            Integer knownId = knownLines.get(line);
            int id;
            if (knownId == null) {
                id = knownLines.size();
                knownLines.put(line, id);
            } else {
                id = knownId;
            }

            Point start = line.getBasePoints().get(0);
            Point end = line.getBasePoints().get(1);
            for (Map.Entry<Line, Integer> entry : knownLines.entrySet()) {
                Line other = entry.getKey();
                if (line.equals(other)) {
                    continue;
                }
                Point otherStart = other.getBasePoints().get(0);
                Point otherEnd = other.getBasePoints().get(1);
                boolean touches = otherStart.equals(start)
                        || otherEnd.equals(end)
                        || otherStart.equals(end)
                        || otherEnd.equals(start);
                if (touches) {
                    // Keep scanning after a match: one line may bridge several groups,
                    // and all of them have to end up in the same cluster.
                    union(id, entry.getValue());
                }
            }
        }

        /**
         * Merges the clusters containing the two ids. Only a root is ever linked, and only to a
         * different root, so the parent map can never contain a cycle.
         */
        private void union(int first, int second) {
            int firstRoot = find(first);
            int secondRoot = find(second);
            if (firstRoot != secondRoot) {
                clusters.put(firstRoot, secondRoot);
            }
        }

        /**
         * Follows the parent links from the given id up to the root of its cluster.
         *
         * @param id a line id
         * @return the id of the cluster root ({@code id} itself when it has no parent)
         */
        private int find(int id) {
            int out = id;
            while (clusters.containsKey(out)) {
                out = clusters.get(out);
            }
            return out;
        }

        /**
         * Returns {@code null}.
         * NOTE(review): iText is understood to treat {@code null} here as "all event types are
         * supported" -- confirm against the {@code IEventListener} documentation.
         */
        public Set<EventType> getSupportedEvents() {
            return null;
        }

        /**
         * Groups all recorded lines by cluster. Lines that touch no other line are left out.
         *
         * @return the clusters, each containing at least two connected lines
         */
        public Collection<Set<Line>> getClusters() {
            Map<Integer, Set<Line>> byRoot = new HashMap<>();
            for (Map.Entry<Line, Integer> entry : knownLines.entrySet()) {
                // Resolve to the cluster root so the root line itself and lines that are
                // several links away from it all end up in the same set.
                int root = find(entry.getValue());
                Set<Line> members = byRoot.get(root);
                if (members == null) {
                    members = new HashSet<Line>();
                    byRoot.put(root, members);
                }
                members.add(entry.getKey());
            }

            Map<Integer, Set<Line>> out = new HashMap<>();
            for (Map.Entry<Integer, Set<Line>> entry : byRoot.entrySet()) {
                if (entry.getValue().size() > 1) {
                    out.put(entry.getKey(), entry.getValue());
                }
            }
            return out.values();
        }

        /**
         * Computes the axis-aligned bounding box of every cluster from the end points of its
         * lines.
         *
         * @return one rectangle (x, y, width, height) per cluster
         */
        public Collection<Rectangle> getBoundingBoxes() {
            Set<Rectangle> rectangles = new HashSet<>();
            for (Set<Line> cluster : getClusters()) {
                double minX = Double.MAX_VALUE;
                double minY = Double.MAX_VALUE;
                double maxX = -Double.MAX_VALUE;
                double maxY = -Double.MAX_VALUE;
                for (Line l : cluster) {
                    for (Point p : l.getBasePoints()) {
                        minX = Math.min(minX, p.x);
                        minY = Math.min(minY, p.y);
                        maxX = Math.max(maxX, p.x);
                        maxY = Math.max(maxY, p.y);
                    }
                }
                double w = maxX - minX;
                double h = maxY - minY;
                rectangles.add(new Rectangle((float) minX, (float) minY, (float) w, (float) h));
            }
            return rectangles;
        }
    }
    

    This is a class I wrote to find black (filled) rectangles on a page. With minor adjustments, it can find other rectangles as well.