At this point I simply want to retrieve all checkboxes in a sheet of a .xlsx excel file. I found an older answer, but my file doesn't seem to include the x:Checked attribute, thus it retrieves the correct amount of checkboxes, however their values are all false (even though they shouldn't be).
More context: My .xlsx file was originally a .xls file and I converted it via LibreOffice, maybe that might be the problem here, not sure, so happy to get help from anyone more experienced.
My code is very similar to the one in this answer: https://stackoverflow.com/a/48972176/13600212
So... I wrote a "simple" .xlsx parser only for getting the checkboxes since I couldn't get it working with apache poi, here you go.
The code can:
package com.osiris.danielmanager.excel;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
public class ExcelCheckboxesParser {
private final Map<String, Document> cacheParsedXml = new HashMap<>();
public static void main(String[] args) {
try {
File file = new File("example.xlsx");
var parser = new ExcelCheckboxesParser();
List<CheckboxInfo> checkboxes = parser.parseXLSX(file);
for (CheckboxInfo checkbox : checkboxes) {
System.out.println(checkbox);
}
} catch (Exception e) {
e.printStackTrace();
}
}
public static String toString(Document doc) {
try {
StringWriter sw = new StringWriter();
TransformerFactory tf = TransformerFactory.newInstance();
Transformer transformer = tf.newTransformer();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
transformer.setOutputProperty(OutputKeys.METHOD, "xml");
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
transformer.transform(new DOMSource(doc), new StreamResult(sw));
return sw.toString();
} catch (Exception ex) {
throw new RuntimeException("Error converting to String", ex);
}
}
private static void parsePosition(CheckboxInfo checkbox) {
String ref = checkbox.cellReference;
StringBuilder colStr = new StringBuilder();
StringBuilder rowStr = new StringBuilder();
// Split reference into column letters and row number
int i = 0;
while (i < ref.length() && Character.isLetter(ref.charAt(i))) {
colStr.append(ref.charAt(i));
i++;
}
while (i < ref.length() && Character.isDigit(ref.charAt(i))) {
rowStr.append(ref.charAt(i));
i++;
}
// Convert column letters to number (A=1, B=2, AA=27, etc.)
String col = colStr.toString().toUpperCase();
int column = 0;
for (int j = 0; j < col.length(); j++) {
column = column * 26 + (col.charAt(j) - 'A' + 1);
}
checkbox.column = column;
checkbox.row = Integer.parseInt(rowStr.toString());
}
public List<CheckboxInfo> parseXLSX(File file) throws Exception {
List<CheckboxInfo> checkboxes = new ArrayList<>();
Map<String, String> sheetNames = new HashMap<>();
Map<String, List<Relation>> sheetAndRelationshipPaths = new HashMap<>();
try (ZipInputStream zis = new ZipInputStream(new FileInputStream(file))) {
ZipEntry entry;
Map<String, String> xmlFiles = new HashMap<>();
// Extract XML files from .xlsx
while ((entry = zis.getNextEntry()) != null) {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int length;
while ((length = zis.read(buffer)) > 0) {
baos.write(buffer, 0, length);
}
xmlFiles.put(entry.getName(), baos.toString(StandardCharsets.UTF_8));
// Uncomment if you need to print the complete contents of the .xlsx file:
// try{
// System.out.println("\n\n\n"+entry.getName()+"\n\n"+toString(parseXml(xmlFiles.get(entry.getName()))));
// } catch (Exception e) {
// System.err.println("Failed to debug show: "+entry.getName()+" error: "+e.getMessage());
// e.printStackTrace();
// }
}
// Parse sheet names and relationships
if (xmlFiles.containsKey("xl/workbook.xml")) {
String workbookXml = xmlFiles.get("xl/workbook.xml");
Document doc = parseXml(workbookXml);
NodeList sheets = doc.getElementsByTagName("sheet");
for (int i = 0; i < sheets.getLength(); i++) {
Element sheet = (Element) sheets.item(i);
String sheetId = sheet.getAttribute("sheetId");
String sheetName = sheet.getAttribute("name");
sheetNames.put(sheetId, sheetName);
// Find the corresponding relationship for each sheet
String sheetRelsPath = "xl/worksheets/_rels/sheet" + sheetId + ".xml.rels";
if (xmlFiles.containsKey(sheetRelsPath)) {
String relsXml = xmlFiles.get(sheetRelsPath);
Document relsDoc = parseXml(relsXml);
NodeList relationships = relsDoc.getElementsByTagName("Relationship");
for (int j = 0; j < relationships.getLength(); j++) {
Element relationship = (Element) relationships.item(j);
String type = relationship.getAttribute("Type");
if (type.contains("ctrlProp")) {
String absolutePath = relationship.getAttribute("Target").replace("../ctrlProps/", "xl/ctrlProps/");
var list = sheetAndRelationshipPaths.get(sheetId);
if (list == null) {
list = new ArrayList<>();
sheetAndRelationshipPaths.put(sheetId, list);
}
list.add(new Relation(sheetId, relationship.getAttribute("Id"), absolutePath));
}
}
}
}
}
// Parse checkboxes in each sheet
for (String sheetId : sheetNames.keySet()) {
String sheetName = sheetNames.get(sheetId);
var docSheet = parseXml(xmlFiles.get("xl/worksheets/sheet" + sheetId + ".xml"));
if (sheetAndRelationshipPaths.containsKey(sheetId)) {
// Extract the control properties xml for checkboxes
for (var rel : sheetAndRelationshipPaths.get(sheetId)) {
String ctrlPropsXml = xmlFiles.get(rel.pathToXml);
Objects.requireNonNull(ctrlPropsXml);
Document ctrlDoc = parseXml(ctrlPropsXml);
NodeList formControls = ctrlDoc.getElementsByTagName("formControlPr");
for (int i = 0; i < formControls.getLength(); i++) {
Element formControl = (Element) formControls.item(i);
if ("CheckBox".equals(formControl.getAttribute("objectType"))) {
CheckboxInfo checkboxInfo = new CheckboxInfo();
checkboxInfo.sheetName = sheetName;
checkboxInfo.isChecked = "Checked".equalsIgnoreCase(formControl.getAttribute("checked"));
checkboxInfo.cellReference = formControl.getAttribute("cellReference");
if (!checkboxInfo.cellReference.isEmpty())
parsePosition(checkboxInfo);
else {
// // 1. search for a "control" element in docSheet with r:id = rel.id and get the sub element "controlPr" altText attribute
// // 2. also search for xdr:row, xdr:rowOff, xdr:col, xdr:colOff elements within "control", get their contents
// // 3. if all are == 0 means this is a floating checkbox and we cannot use the values, otherwise set to checkbox.column/row accordingly
// // 4. if floating checkbox continue with checking .vml files, like shown below:
// for(Map.Entry<String, String> e : xmlFiles.entrySet()){
// if(e.getKey().endsWith(".vml")){
// var docVml = parseXml(e.getValue());
//
// // 5. in docVml search for a v:shape element that has id = altText
// // 6. get the sub element named x:Anchor and its content, which will be a integer list separated by commas
// // 7. set checkbox.column to list[0] and set checkbox.row to list[2]
// }
// }
// Search for control element in worksheet
NodeList controls = docSheet.getElementsByTagName("control");
for (int j = 0; j < controls.getLength(); j++) {
Element control = (Element) controls.item(j);
if (control.getAttribute("r:id").equals(rel.id)) {
// Get position from control element
NodeList controlPrs = control.getElementsByTagName("controlPr");
if (controlPrs.getLength() > 0) {
Element controlPr = (Element) controlPrs.item(0);
String altText = controlPr.getAttribute("altText");
// Check if it's a floating checkbox
NodeList rows = control.getElementsByTagName("xdr:row");
NodeList cols = control.getElementsByTagName("xdr:col");
NodeList rowOffs = control.getElementsByTagName("xdr:rowOff");
NodeList colOffs = control.getElementsByTagName("xdr:colOff");
if (rows.getLength() > 0 && cols.getLength() > 0) {
int row = Integer.parseInt(rows.item(0).getTextContent());
int col = Integer.parseInt(cols.item(0).getTextContent());
long rowOff = Long.parseLong(rowOffs.item(0).getTextContent());
long colOff = Long.parseLong(colOffs.item(0).getTextContent());
checkboxInfo.row = row;
checkboxInfo.column = col;
if (row != 0 && col != 0 && rowOff != 0 && colOff != 0)
continue;
}
// If we get here, it's a floating checkbox - check VML files
for (Map.Entry<String, String> e : xmlFiles.entrySet()) {
if (e.getKey().endsWith(".vml")) {
Document docVml = parseXml(e.getValue());
NodeList shapes = docVml.getElementsByTagName("v:shape");
for (int k = 0; k < shapes.getLength(); k++) {
Element shape = (Element) shapes.item(k);
if (shape.getAttribute("id").equals(altText)) {
NodeList anchors = shape.getElementsByTagName("x:Anchor");
if (anchors.getLength() > 0) {
String[] coords = anchors.item(0).getTextContent().split(",");
if (coords.length >= 4) {
checkboxInfo.column = Integer.parseInt(coords[0].trim());
checkboxInfo.row = Integer.parseInt(coords[2].trim());
}
}
break;
}
}
}
}
}
break;
}
}
}
checkboxes.add(checkboxInfo);
}
}
}
}
}
}
// Sort checkboxes by row and column
Collections.sort(checkboxes);
return checkboxes;
}
private synchronized Document parseXml(String xmlContent) throws Exception {
synchronized (cacheParsedXml) {
var doc = cacheParsedXml.get(xmlContent);
if (doc != null) return doc;
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
doc = builder.parse(new ByteArrayInputStream(xmlContent.getBytes()));
cacheParsedXml.put(xmlContent, doc);
return doc;
}
}
/**
* Is equivalent to something like this: <br>
* <pre>
* <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/ctrlProp" Target="../ctrlProps/ctrlProps2.xml"/>
* </pre>
*/
public static class Relation {
public String sheetId;
public String id;
public String pathToXml;
public Relation(String sheetId, String id, String pathToXml) {
this.sheetId = sheetId;
this.id = id;
this.pathToXml = pathToXml;
}
}
public static class CheckboxInfo implements Comparable<CheckboxInfo> {
public String sheetName;
public boolean isChecked;
public String cellReference;
public int row;
public int column;
@Override
public String toString() {
return "Checkbox [Sheet: " + sheetName + ", Checked: " + isChecked +
", Cell: " + cellReference + ", Row: " + row + ", Column: " + column + "]";
}
@Override
public int compareTo(CheckboxInfo other) {
// First compare by sheet name
int sheetCompare = this.sheetName.compareTo(other.sheetName);
if (sheetCompare != 0) return sheetCompare;
// Then by row
int rowCompare = Integer.compare(this.row, other.row);
if (rowCompare != 0) return rowCompare;
// Finally by column
return Integer.compare(this.column, other.column);
}
}
}