import org.bytedeco.tesseract.TessBaseAPI;
/**
 * Minimal smoke test that checks whether the native Tesseract engine can be
 * initialized with a given trained-data directory and language.
 */
public class TesseractInitExample {

    /**
     * Creates a Tesseract API instance, initializes it, reports the outcome on
     * stdout/stderr, and releases the engine again.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create a new Tesseract API instance
        TessBaseAPI api = new TessBaseAPI();
        try {
            // Set the path to tessdata and language (change accordingly)
            String dataPath = "/root/GitHub/tessdata_best";
            String language = "eng";

            // Initialize Tesseract; any non-zero return value is treated as failure
            if (api.Init(dataPath, language) != 0) {
                System.err.println("Could not initialize Tesseract.");
                return;
            }
            System.out.println("Tesseract initialized successfully!");
        } finally {
            // Cleanup on every path, including a failed Init, so native engine
            // state is not left allocated
            api.End();
        }
    }
}
Below is the Dockerfile:
# Two-stage build: stage "build" compiles the application and fetches the
# Tesseract trained data; the final stage holds only the runtime pieces.

# --- Build stage: Gradle 8.11.1 with JDK 17 on Alpine ---
FROM gradle:8.11.1-jdk17-alpine AS build
# git is required for the clone in the next step
RUN apk add --no-cache git
# Fetch the trained-data repository to the path the Java code passes to Init()
RUN git clone --branch main --single-branch https://github.com/tesseract-ocr/tessdata_best.git /root/GitHub/tessdata_best
# Copy the project sources into the image, owned by the gradle user
COPY --chown=gradle:gradle . /home/gradle/src
WORKDIR /home/gradle/src
# Build the jar; "-x test" excludes the test task
RUN gradle build --no-daemon -x test

# --- Runtime stage: Temurin JDK 17 on Alpine ---
# NOTE(review): Alpine is musl-based; native libraries built against glibc may
# misbehave on it — confirm which libtesseract the application actually loads.
FROM eclipse-temurin:17.0.13_11-jdk-alpine
# Bring the trained data over from the build stage (same absolute path)
COPY --from=build /root/GitHub/tessdata_best /root/GitHub/tessdata_best
# System Tesseract and Leptonica packages from the Alpine repositories
RUN apk add --no-cache \
tesseract-ocr \
leptonica-dev
VOLUME /tmp
# JAVA_OPTS can be supplied as a build argument and is exported as an
# environment variable.
# NOTE(review): the ENTRYPOINT below does not reference JAVA_OPTS, so the value
# does not appear on the java command line — confirm whether that is intended.
ARG JAVA_OPTS
ENV JAVA_OPTS=$JAVA_OPTS
RUN mkdir /app
# Copy the built jar under a fixed name
COPY --from=build /home/gradle/src/build/libs/*.jar /app/extract.jar
# Start the application (exec form: no shell is involved)
ENTRYPOINT ["java", "-XX:+UnlockExperimentalVMOptions", "-Djava.security.egd=file:/dev/./urandom","-jar","/app/extract.jar"]
While running the container, I get the error below during initialization:
.
3.362 seconds (process running for 4.095)
#
# A fatal error has been detected by the Java Runtime Environment:
#
# SIGSEGV (0xb) at pc=0x000000000005f2b0, pid=1, tid=7
#
# JRE version: OpenJDK Runtime Environment Temurin-17.0.13+11 (17.0.13+11) (build 17.0.13+11)
# Java VM: OpenJDK 64-Bit Server VM Temurin-17.0.13+11 (17.0.13+11, mixed mode, sharing, tiered, compressed oops, compressed class ptrs, g1 gc, linux-amd64)
# Problematic frame:
# C [libtesseract.so.5.5+0x14b9c8] tesseract::CHAR_FRAGMENT::parse_from_string(char const*)+0xc8
#
# Core dump will be written. Default location: Core dumps may be processed with "/wsl-capture-crash %t %E %p %s" (or dumping to //core.1)
#
# An error report file with more information is saved as:
# //hs_err_pid1.log
#
# If you would like to submit a bug report, please visit:
# https://github.com/adoptium/adoptium-support/issues
# The crash happened outside the Java Virtual Machine in native code.
# See problematic frame for where to report the bug.
#
I am unable to figure out this error. How can I resolve it? (It works fine locally on Windows.)
The issue you're experiencing might be due to compatibility issues between Alpine-based images and the native Tesseract libraries. Alpine uses musl instead of glibc, which can sometimes lead to segmentation faults or other unexpected behavior when working with native libraries like libtesseract.so. Switching to a Debian or Ubuntu-based image often resolves such problems.
Here's a Dockerfile that uses glibc-based (Debian-family/Ubuntu) images instead of Alpine for better compatibility:
# Two-stage build on glibc-based images (instead of musl-based Alpine).

# --- Build stage ---
FROM gradle:8.11.1-jdk17 AS build
# Install git (plus CA certificates for the HTTPS clone). Recommended packages
# are skipped and the apt lists are removed in the same layer to keep it small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git ca-certificates \
    && rm -rf /var/lib/apt/lists/*
# Clone tessdata_best. Git history is not needed at runtime, so clone shallowly
# and drop .git before the directory is copied into the final image.
RUN git clone --depth 1 --branch main --single-branch https://github.com/tesseract-ocr/tessdata_best.git /root/GitHub/tessdata_best \
    && rm -rf /root/GitHub/tessdata_best/.git
# Build the project with Gradle ("-x test" excludes the test task)
COPY --chown=gradle:gradle . /home/gradle/src
WORKDIR /home/gradle/src
RUN gradle build --no-daemon -x test

# --- Runtime stage ---
FROM eclipse-temurin:17.0.13_11-jdk
# Install Tesseract and dependencies
# NOTE(review): if the application loads Tesseract through the org.bytedeco
# "-platform" artifacts, the native libraries come from the jar and these system
# packages may be unnecessary — confirm against the build file.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        tesseract-ocr \
        libleptonica-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy tessdata_best (same absolute path the Java code passes to Init())
COPY --from=build /root/GitHub/tessdata_best /root/GitHub/tessdata_best
# Prepare the application
VOLUME /tmp
# JAVA_OPTS may be set at build time (--build-arg) or overridden at run time (-e)
ARG JAVA_OPTS
ENV JAVA_OPTS=$JAVA_OPTS
RUN mkdir /app
COPY --from=build /home/gradle/src/build/libs/*.jar /app/extract.jar
# Run the application.
# The exec form performs no shell expansion, so a small sh wrapper is used to
# expand $JAVA_OPTS. "exec" replaces the shell so java still receives signals
# directly, and "$@" forwards any arguments given to `docker run` / CMD
# (the trailing "sh" only fills $0).
ENTRYPOINT ["sh", "-c", "exec java $JAVA_OPTS -XX:+UnlockExperimentalVMOptions -Djava.security.egd=file:/dev/./urandom -jar /app/extract.jar \"$@\"", "sh"]