From a51e3b08016f091940b44908cb394e420b78198f Mon Sep 17 00:00:00 2001
From: Kaan0029 <ke2461@columbia.edu>
Date: Thu, 12 Jun 2025 20:57:22 +0200
Subject: [PATCH 1/3] Initial implementation using tess4j

---
 .gitignore                                    |   2 +
 .../gui/fieldeditors/LinkedFilesEditor.java   |   3 +-
 .../contextmenu/ContextMenuFactory.java       |  45 +++++++-
 .../org/jabref/gui/linkedfile/OcrAction.java  | 106 ++++++++++++++++++
 .../contextmenu/ContextMenuFactoryTest.java   |   5 +-
 jablib/build.gradle.kts                       |   3 +
 jablib/src/main/java/module-info.java         |   2 +
 .../org/jabref/logic/ocr/OcrException.java    |  28 +++++
 .../java/org/jabref/logic/ocr/OcrService.java |  78 +++++++++++++
 .../main/resources/l10n/JabRef_en.properties  |   8 ++
 10 files changed, 277 insertions(+), 3 deletions(-)
 create mode 100644 jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java
 create mode 100644 jablib/src/main/java/org/jabref/logic/ocr/OcrException.java
 create mode 100644 jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
diff --git a/.gitignore b/.gitignore
index c94289d0f06..c0e5370009f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -581,3 +581,5 @@ CHANGELOG.html
 
 # some strange gradle/IntelliJ extension
 extension 'reporting' property 'baseDirectory'
+
+tessdata/
diff --git a/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java b/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java
index ff6992be048..554c58cae8a 100644
--- a/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java
+++ b/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java
@@ -325,7 +325,8 @@ private void handleItemMouseClick(LinkedFileViewModel linkedFile, MouseEvent eve
                     bibEntry,
                     viewModel,
                     contextCommandFactory,
-                    multiContextCommandFactory
+                    multiContextCommandFactory,
+                    taskExecutor
             );
 
             ContextMenu contextMenu = contextMenuFactory.createForSelection(listView.getSelectionModel().getSelectedItems());
diff --git a/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java b/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java
index a6b3262450e..9c05875e180 100644
--- a/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java
+++ b/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java
@@ -2,6 +2,7 @@
 
 import javafx.collections.ObservableList;
 import javafx.scene.control.ContextMenu;
+import javafx.scene.control.MenuItem;
 import javafx.scene.control.SeparatorMenuItem;
 
 import org.jabref.gui.DialogService;
@@ -10,7 +11,10 @@
 import org.jabref.gui.copyfiles.CopySingleFileAction;
 import org.jabref.gui.fieldeditors.LinkedFileViewModel;
 import org.jabref.gui.fieldeditors.LinkedFilesEditorViewModel;
+import org.jabref.gui.linkedfile.OcrAction;
 import org.jabref.gui.preferences.GuiPreferences;
+import org.jabref.logic.l10n.Localization;
+import org.jabref.logic.util.TaskExecutor;
 import org.jabref.model.database.BibDatabaseContext;
 import org.jabref.model.entry.BibEntry;
 
@@ -25,6 +29,7 @@ public class ContextMenuFactory {
     private final LinkedFilesEditorViewModel viewModel;
     private final SingleContextCommandFactory singleCommandFactory;
     private final MultiContextCommandFactory multiCommandFactory;
+    private final TaskExecutor taskExecutor;
 
     public ContextMenuFactory(DialogService dialogService,
                               GuiPreferences preferences,
@@ -32,7 +37,8 @@ public ContextMenuFactory(DialogService dialogService,
                               ObservableOptionalValue<BibEntry> bibEntry,
                               LinkedFilesEditorViewModel viewModel,
                               SingleContextCommandFactory singleCommandFactory,
-                              MultiContextCommandFactory multiCommandFactory) {
+                              MultiContextCommandFactory multiCommandFactory,
+                              TaskExecutor taskExecutor) {
         this.dialogService = dialogService;
         this.preferences = preferences;
         this.databaseContext = databaseContext;
@@ -40,6 +46,7 @@ public ContextMenuFactory(DialogService dialogService,
         this.viewModel = viewModel;
         this.singleCommandFactory = singleCommandFactory;
         this.multiCommandFactory = multiCommandFactory;
+        this.taskExecutor = taskExecutor;
     }
 
     public ContextMenu createForSelection(ObservableList<LinkedFileViewModel> selectedFiles) {
@@ -86,9 +93,45 @@ private ContextMenu createContextMenuForFile(LinkedFileViewModel linkedFile) {
                 factory.createMenuItem(StandardActions.DELETE_FILE, singleCommandFactory.build(StandardActions.DELETE_FILE, linkedFile))
         );
 
+        // Add OCR menu item for PDF files
+        if (linkedFile.getFile().getFileType().equalsIgnoreCase("pdf")) {
+            menu.getItems().add(new SeparatorMenuItem());
+
+            MenuItem ocrItem = createOcrMenuItem(linkedFile);
+            menu.getItems().add(ocrItem);
+        }
+
         return menu;
     }
 
+    /**
+     * Builds the context menu entry that starts text extraction (OCR) for a linked file.
+     * <p>
+     * The entry runs an {@link OcrAction} and stays disabled while that action is not executable.
+     *
+     * @param linkedFile view model of the linked file the entry operates on
+     * @return the menu entry, wired to the OCR action
+     */
+    private MenuItem createOcrMenuItem(LinkedFileViewModel linkedFile) {
+        String label = Localization.lang("Extract text (OCR)");
+        MenuItem entry = new MenuItem(label);
+
+        OcrAction extractText = new OcrAction(
+                linkedFile.getFile(),
+                databaseContext,
+                dialogService,
+                preferences.getFilePreferences(),
+                taskExecutor);
+
+        // Keep the entry greyed out for as long as the action reports it cannot run.
+        entry.disableProperty().bind(extractText.executableProperty().not());
+
+        // Clicking the entry starts the OCR run.
+        entry.setOnAction(ignored -> extractText.execute());
+
+        return entry;
+    }
+
     @FunctionalInterface
     public interface SingleContextCommandFactory {
         ContextAction build(StandardActions action, LinkedFileViewModel file);
diff --git a/jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java b/jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java
new file mode 100644
index 00000000000..2d5f22773f4
--- /dev/null
+++ b/jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java
@@ -0,0 +1,106 @@
+package org.jabref.gui.linkedfile;
+
+import org.jabref.gui.DialogService;
+import org.jabref.gui.StateManager;
+import org.jabref.gui.actions.Action;
+import org.jabref.gui.actions.ActionHelper;
+import org.jabref.gui.actions.SimpleCommand;
+import org.jabref.logic.util.BackgroundTask;
+import org.jabref.logic.util.TaskExecutor;
+import org.jabref.logic.l10n.Localization;
+import org.jabref.logic.ocr.OcrService;
+import org.jabref.logic.ocr.OcrException;
+import org.jabref.model.database.BibDatabaseContext;
+import org.jabref.model.entry.LinkedFile;
+import org.jabref.logic.FilePreferences;
+
+import java.nio.file.Path;
+import java.util.Optional;
+
+/**
+ * Action for performing OCR (Optical Character Recognition) on linked PDF files.
+ * <p>
+ * This action extracts text content from PDF files that are attached to BibTeX entries.
+ * It runs the OCR process in a background thread to keep the UI responsive and provides
+ * user feedback through dialogs and notifications.
+ * <p>
+ * The action follows JabRef's command pattern and can be triggered from context menus.
+ * It includes built-in validation to ensure it's only enabled for PDF files that exist on disk.
+ *
+ * @see OcrService
+ * @see org.jabref.gui.actions.SimpleCommand
+ */
+public class OcrAction extends SimpleCommand {
+
+    /** Maximum number of characters of the extracted text shown in the result dialog. */
+    private static final int PREVIEW_LENGTH = 1000;
+
+    private final LinkedFile linkedFile;
+    private final BibDatabaseContext databaseContext;
+    private final DialogService dialogService;
+    private final FilePreferences filePreferences;
+    private final TaskExecutor taskExecutor;
+
+    /**
+     * Creates the action and marks it executable only if {@code linkedFile} has file type
+     * "pdf" (case-insensitive) and can currently be resolved to a path via {@code findIn}.
+     */
+    public OcrAction(LinkedFile linkedFile,
+                     BibDatabaseContext databaseContext,
+                     DialogService dialogService,
+                     FilePreferences filePreferences,
+                     TaskExecutor taskExecutor) {
+        this.linkedFile = linkedFile;
+        this.databaseContext = databaseContext;
+        this.dialogService = dialogService;
+        this.filePreferences = filePreferences;
+        this.taskExecutor = taskExecutor;
+
+        this.executable.set(
+                linkedFile.getFileType().equalsIgnoreCase("pdf") &&
+                        linkedFile.findIn(databaseContext, filePreferences).isPresent()
+        );
+    }
+
+    /**
+     * Looks the linked file up again and, if it is found, runs OCR on it in the background via
+     * {@code taskExecutor}; the outcome (or a missing file) is reported through a dialog.
+     */
+    @Override
+    public void execute() {
+        Optional<Path> filePath = linkedFile.findIn(databaseContext, filePreferences);
+
+        if (filePath.isEmpty()) {
+            dialogService.showErrorDialogAndWait(
+                    Localization.lang("File not found"),
+                    Localization.lang("Could not locate the PDF file on disk.")
+            );
+            return;
+        }
+
+        dialogService.notify(Localization.lang("Performing OCR..."));
+
+        BackgroundTask.wrap(() -> new OcrService().performOcr(filePath.get()))
+                      .onSuccess(this::showResult)
+                      .onFailure(exception -> dialogService.showErrorDialogAndWait(
+                              Localization.lang("OCR failed"),
+                              // getMessage() may be null; fall back to the exception's toString()
+                              Optional.ofNullable(exception.getMessage()).orElseGet(exception::toString)))
+                      .executeWith(taskExecutor);
+    }
+
+    /** Shows a preview of the extracted text, or a notice if OCR recognized nothing. */
+    private void showResult(String extractedText) {
+        if (extractedText.isEmpty()) {
+            dialogService.showInformationDialogAndWait(
+                    Localization.lang("OCR Complete"),
+                    Localization.lang("No text was found in the PDF."));
+            return;
+        }
+
+        String preview = extractedText.length() > PREVIEW_LENGTH
+                ? extractedText.substring(0, PREVIEW_LENGTH) + "..."
+                : extractedText;
+        dialogService.showInformationDialogAndWait(Localization.lang("OCR Result"), preview);
+    }
+}
diff --git a/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java b/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java
index 141bc6e44e5..ec777d2b379 100644
--- a/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java
+++ b/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java
@@ -11,6 +11,7 @@
 import org.jabref.gui.fieldeditors.LinkedFileViewModel;
 import org.jabref.gui.fieldeditors.LinkedFilesEditorViewModel;
 import org.jabref.gui.preferences.GuiPreferences;
+import org.jabref.logic.util.TaskExecutor;
 import org.jabref.model.database.BibDatabaseContext;
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.LinkedFile;
@@ -41,6 +42,7 @@ public class ContextMenuFactoryTest {
     private ContextMenuFactory factory;
     private ContextMenuFactory.SingleContextCommandFactory singleCommandFactory;
     private ContextMenuFactory.MultiContextCommandFactory multiCommandFactory;
+    private TaskExecutor taskExecutor;
 
     @BeforeAll
     public static void initToolkit() {
@@ -78,7 +80,8 @@ public void setUp() {
                 bibEntry,
                 viewModel,
                 singleCommandFactory,
-                multiCommandFactory
+                multiCommandFactory,
+                taskExecutor
         );
     }
 
diff --git a/jablib/build.gradle.kts b/jablib/build.gradle.kts
index 4581f58e23f..5e43cd7ef2b 100644
--- a/jablib/build.gradle.kts
+++ b/jablib/build.gradle.kts
@@ -243,6 +243,9 @@ dependencies {
     // Required for LocalizationConsistencyTest
     testImplementation("org.testfx:testfx-core:4.0.16-alpha")
     testImplementation("org.testfx:testfx-junit5:4.0.16-alpha")
+
+    // OCR support
+    implementation("net.sourceforge.tess4j:tess4j:5.15.0")
 }
 /*
 jacoco {
diff --git a/jablib/src/main/java/module-info.java b/jablib/src/main/java/module-info.java
index a792ae43f7d..85258c1526a 100644
--- a/jablib/src/main/java/module-info.java
+++ b/jablib/src/main/java/module-info.java
@@ -104,6 +104,7 @@
     exports org.jabref.logic.crawler;
     exports org.jabref.logic.git;
     exports org.jabref.logic.pseudonymization;
+    exports org.jabref.logic.ocr;
 
     requires java.base;
 
@@ -250,5 +251,6 @@
     requires mslinks;
     requires org.antlr.antlr4.runtime;
     requires org.libreoffice.uno;
+    requires tess4j;
     // endregion
 }
diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrException.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrException.java
new file mode 100644
index 00000000000..808f4d93a7a
--- /dev/null
+++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrException.java
@@ -0,0 +1,28 @@
+package org.jabref.logic.ocr;
+
+/**
+ * Checked exception signalling that an OCR operation could not be completed.
+ * <p>
+ * Carries a human-readable message and, optionally, the underlying cause.
+ */
+public class OcrException extends Exception {
+
+    /**
+     * Creates an exception that carries only a description of the failure.
+     *
+     * @param message description of what went wrong
+     */
+    public OcrException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates an exception that also records the error which triggered it.
+     *
+     * @param message description of what went wrong
+     * @param cause   the lower-level error being wrapped
+     */
+    public OcrException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
new file mode 100644
index 00000000000..d51695fcd22
--- /dev/null
+++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
@@ -0,0 +1,78 @@
+package org.jabref.logic.ocr;
+
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+import org.jabref.model.strings.StringUtil;  // JabRef utility class
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.file.Path;
+
+/**
+ * Service for performing Optical Character Recognition (OCR) on PDF files.
+ * This class provides a high-level interface to OCR functionality,
+ * abstracting away the specific OCR engine implementation details.
+ */
+public class OcrService {
+    private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class);
+
+    // The OCR engine instance
+    private final Tesseract tesseract;
+
+    /**
+     * Constructs a new OcrService with default settings.
+     * Currently uses Tesseract with English language support.
+     */
+    public OcrService() {
+        this.tesseract = new Tesseract();
+
+        // Configure Tesseract
+        tesseract.setLanguage("eng");
+
+        // TODO: This path needs to be configurable and bundled properly
+        // For now, we'll use a relative path that works during development
+        tesseract.setDatapath("tessdata");
+
+        LOGGER.debug("Initialized OcrService with Tesseract");
+    }
+
+    /**
+     * Performs OCR on a PDF file and returns the extracted text.
+     *
+     * @param pdfPath Path to the PDF file to process
+     * @return The extracted text, or empty string if no text found
+     * @throws OcrException if OCR processing fails
+     */
+    public String performOcr(Path pdfPath) throws OcrException {
+        // Validate input
+        if (pdfPath == null) {
+            throw new OcrException("PDF path cannot be null");
+        }
+
+        File pdfFile = pdfPath.toFile();
+        if (!pdfFile.exists()) {
+            throw new OcrException("PDF file does not exist: " + pdfPath);
+        }
+
+        try {
+            LOGGER.info("Starting OCR for file: {}", pdfFile.getName());
+
+            // Perform OCR
+            String result = tesseract.doOCR(pdfFile);
+
+            // Clean up the result (remove extra whitespace, etc.)
+            result = StringUtil.isBlank(result) ? "" : result.trim();
+
+            LOGGER.info("OCR completed successfully. Extracted {} characters", result.length());
+            return result;
+
+        } catch (TesseractException e) {
+            LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e);
+            throw new OcrException(
+                    "Failed to perform OCR on file: " + pdfFile.getName() +
+                            ". Error: " + e.getMessage(), e
+            );
+        }
+    }
+}
diff --git a/jablib/src/main/resources/l10n/JabRef_en.properties b/jablib/src/main/resources/l10n/JabRef_en.properties
index bff236c92a6..85456f30e59 100644
--- a/jablib/src/main/resources/l10n/JabRef_en.properties
+++ b/jablib/src/main/resources/l10n/JabRef_en.properties
@@ -316,6 +316,14 @@ Extract\ references\ from\ file\ (online)=Extract references from file (online)
 Extract\ References\ (offline)=Extract References (offline)
 Extract\ References\ (online)=Extract References (online)
 
+Extract\ text\ (OCR)=Extract text (OCR)
+Performing\ OCR...=Performing OCR...
+OCR\ Complete=OCR Complete
+OCR\ Result=OCR Result
+OCR\ failed=OCR failed
+No\ text\ was\ found\ in\ the\ PDF.=No text was found in the PDF.
+Could\ not\ locate\ the\ PDF\ file\ on\ disk.=Could not locate the PDF file on disk.
+
 Processing...=Processing...
 Processing\ "%0"...=Processing "%0"...
 Processing\ PDF(s)=Processing PDF(s)

From 48ffb067bef2e38faaf7e9a677a396604288b439 Mon Sep 17 00:00:00 2001
From: Siedlerchr <siedlerkiller@gmail.com>
Date: Thu, 12 Jun 2025 21:16:40 +0200
Subject: [PATCH 2/3] add brew to jna path

---
 .../java/org/jabref/logic/ocr/OcrService.java | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
index d51695fcd22..4f68c1ab5c2 100644
--- a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
+++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
@@ -1,14 +1,16 @@
 package org.jabref.logic.ocr;
 
+import java.io.File;
+import java.nio.file.Path;
+
+import org.jabref.model.strings.StringUtil;
+
+import com.sun.jna.Platform;
 import net.sourceforge.tess4j.Tesseract;
 import net.sourceforge.tess4j.TesseractException;
-import org.jabref.model.strings.StringUtil;  // JabRef utility class
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.nio.file.Path;
-
 /**
  * Service for performing Optical Character Recognition (OCR) on PDF files.
  * This class provides a high-level interface to OCR functionality,
@@ -16,7 +18,7 @@
  */
 public class OcrService {
     private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class);
-
+    private static final String JNA_LIBRARY_PATH = "jna.library.path";
     // The OCR engine instance
     private final Tesseract tesseract;
 
@@ -25,6 +27,13 @@ public class OcrService {
      * Currently uses Tesseract with English language support.
      */
     public OcrService() {
+        if (Platform.isMac()) {
+            if (Platform.isARM()) {
+                System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/opt/homebrew/lib/");
+            } else {
+                System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/usr/local/cellar/");
+            }
+        }
         this.tesseract = new Tesseract();
 
         // Configure Tesseract
@@ -66,8 +75,8 @@ public String performOcr(Path pdfPath) throws OcrException {
 
             LOGGER.info("OCR completed successfully. Extracted {} characters", result.length());
             return result;
-
-        } catch (TesseractException e) {
+        } catch (
+                TesseractException e) {
             LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e);
             throw new OcrException(
                     "Failed to perform OCR on file: " + pdfFile.getName() +

From 5a256ae71e3eeb36ecd3de97812f2e68632df1c3 Mon Sep 17 00:00:00 2001
From: Kaan0029 <ke2461@columbia.edu>
Date: Thu, 19 Jun 2025 20:39:37 +0200
Subject: [PATCH 3/3] Adapted Exception handling and configured tessdata
 variable

---
 .../java/org/jabref/logic/ocr/OcrResult.java  |  35 ++++++
 .../java/org/jabref/logic/ocr/OcrService.java | 106 +++++++++++++-----
 2 files changed, 116 insertions(+), 25 deletions(-)
 create mode 100644 jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java

diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java
new file mode 100644
index 00000000000..8cbe573e938
--- /dev/null
+++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java
@@ -0,0 +1,65 @@
+package org.jabref.logic.ocr;
+
+import java.util.Objects;
+import java.util.Optional;
+
+/**
+ * Immutable outcome of an OCR run: either a success carrying the extracted text,
+ * or a failure carrying an error message. Exactly one of the two is present.
+ * <p>
+ * Instances are created through {@link #success(String)} and {@link #failure(String)}.
+ */
+public final class OcrResult {
+    private final boolean success;
+    private final String text;
+    private final String errorMessage;
+
+    private OcrResult(boolean success, String text, String errorMessage) {
+        this.success = success;
+        this.text = text;
+        this.errorMessage = errorMessage;
+    }
+
+    /**
+     * Creates a successful result.
+     *
+     * @param text the extracted text; may be empty but must not be {@code null}
+     * @return a result for which {@link #isSuccess()} is {@code true}
+     * @throws NullPointerException if {@code text} is {@code null}
+     */
+    public static OcrResult success(String text) {
+        return new OcrResult(true, Objects.requireNonNull(text, "text"), null);
+    }
+
+    /**
+     * Creates a failed result.
+     *
+     * @param errorMessage description of why OCR failed; must not be {@code null}
+     * @return a result for which {@link #isSuccess()} is {@code false}
+     * @throws NullPointerException if {@code errorMessage} is {@code null}
+     */
+    public static OcrResult failure(String errorMessage) {
+        return new OcrResult(false, null, Objects.requireNonNull(errorMessage, "errorMessage"));
+    }
+
+    /**
+     * @return {@code true} if this result was created by {@link #success(String)}
+     */
+    public boolean isSuccess() {
+        return success;
+    }
+
+    /**
+     * @return the extracted text for a successful result, otherwise an empty Optional
+     */
+    public Optional<String> getText() {
+        return Optional.ofNullable(text);
+    }
+
+    /**
+     * @return the error message for a failed result, otherwise an empty Optional
+     */
+    public Optional<String> getErrorMessage() {
+        return Optional.ofNullable(errorMessage);
+    }
+}
diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
index 4f68c1ab5c2..37714273e6d 100644
--- a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
+++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java
@@ -19,6 +19,8 @@
 public class OcrService {
     private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class);
     private static final String JNA_LIBRARY_PATH = "jna.library.path";
+    private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX";
+
     // The OCR engine instance
     private final Tesseract tesseract;
 
@@ -26,24 +28,78 @@ public class OcrService {
      * Constructs a new OcrService with default settings.
      * Currently uses Tesseract with English language support.
      */
-    public OcrService() {
+    public OcrService() throws OcrException {
+        configureLibraryPath();
+        try {
+            this.tesseract = new Tesseract();
+            tesseract.setLanguage("eng");
+        } catch (Exception e) {
+            throw new OcrException("Failed to initialize OCR engine", e);
+        }
+        // Outside the try: configureTessdata()'s own OcrException must reach the caller un-wrapped.
+        configureTessdata();
+        LOGGER.debug("Initialized OcrService with Tesseract");
+    }
+
+    private void configureLibraryPath() { // makes Homebrew's native libraries visible to JNA on macOS
         if (Platform.isMac()) {
+            String homebrewLib = "/usr/local/lib/"; // Homebrew's default library directory on Intel Macs
             if (Platform.isARM()) {
-                System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/opt/homebrew/lib/");
+                homebrewLib = "/opt/homebrew/lib/"; // Homebrew's default library directory on Apple Silicon
+            }
+            String current = System.getProperty(JNA_LIBRARY_PATH, "");
+            if (!current.contains(homebrewLib)) { // the constructor may run many times; append only once
+                System.setProperty(JNA_LIBRARY_PATH, (current.isEmpty() ? "" : current + File.pathSeparator) + homebrewLib);
+            }
+        }
+    }
+
+    // Points Tesseract at its language data: TESSDATA_PREFIX when it names a usable directory,
+    // otherwise a well-known system location; throws OcrException when neither can be found.
+    private void configureTessdata() throws OcrException {
+        String fromEnv = System.getenv(TESSDATA_PREFIX);
+        if (fromEnv != null && !fromEnv.isEmpty()) {
+            File dataDir = toDataDir(fromEnv);
+            if (dataDir.isDirectory()) {
+                tesseract.setDatapath(dataDir.getPath());
+                LOGGER.info("Using tessdata from environment variable: {}", fromEnv);
+                return;
             } else {
-                System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/usr/local/cellar/");
+                LOGGER.warn("TESSDATA_PREFIX points to non-existent directory: {}", fromEnv);
             }
         }
-        this.tesseract = new Tesseract();
 
-        // Configure Tesseract
-        tesseract.setLanguage("eng");
+        // Fall back to system locations
+        String systemPath = findSystemTessdata();
+        if (systemPath == null) {
+            throw new OcrException("Could not find tessdata directory. Please set TESSDATA_PREFIX environment variable.");
+        }
+        tesseract.setDatapath(toDataDir(systemPath).getPath());
+        LOGGER.info("Using system tessdata at: {}", systemPath);
+    }
+
+    // Tesseract 4+ reads *.traineddata straight from the datapath, so a nested "tessdata" folder wins.
+    private static File toDataDir(String location) {
+        File nested = new File(location, "tessdata");
+        return nested.isDirectory() ? nested : new File(location);
+    }
+
+    // Returns the tessdata directory of the first known installation that has English data, else null.
+    private String findSystemTessdata() {
+        String[] shareDirs = {
+                "/usr/local/share",     // Homebrew on Intel Macs
+                "/opt/homebrew/share",  // Homebrew on Apple Silicon
+                "/usr/share"            // system-wide installation
+        };
 
-        // TODO: This path needs to be configurable and bundled properly
-        // For now, we'll use a relative path that works during development
-        tesseract.setDatapath("tessdata");
+        for (String shareDir : shareDirs) {
+            File dataDir = new File(shareDir, "tessdata");
+            if (new File(dataDir, "eng.traineddata").exists()) {
+                return dataDir.getPath();
+            }
+        }
 
-        LOGGER.debug("Initialized OcrService with Tesseract");
+        return null;
     }
 
     /**
@@ -53,35 +109,35 @@ public OcrService() {
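-     * @return The extracted text, or empty string if no text found
-     * @throws OcrException if OCR processing fails
+     * @return a successful {@link OcrResult} with the trimmed text (empty if nothing was recognized),
+     *         or a failed one carrying an error message; failures are reported this way, not thrown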
      */
-    public String performOcr(Path pdfPath) throws OcrException {
-        // Validate input
+    public OcrResult performOcr(Path pdfPath) {
+        // User error - not an exception
         if (pdfPath == null) {
-            throw new OcrException("PDF path cannot be null");
+            LOGGER.warn("PDF path is null");
+            return OcrResult.failure("No file path provided");
         }
 
         File pdfFile = pdfPath.toFile();
+
+        // User error - not an exception
         if (!pdfFile.exists()) {
-            throw new OcrException("PDF file does not exist: " + pdfPath);
+            LOGGER.warn("PDF file does not exist: {}", pdfPath);
+            return OcrResult.failure("File does not exist: " + pdfPath.getFileName());
         }
 
         try {
             LOGGER.info("Starting OCR for file: {}", pdfFile.getName());
 
-            // Perform OCR
             String result = tesseract.doOCR(pdfFile);
-
-            // Clean up the result (remove extra whitespace, etc.)
             result = StringUtil.isBlank(result) ? "" : result.trim();
 
             LOGGER.info("OCR completed successfully. Extracted {} characters", result.length());
-            return result;
-        } catch (
-                TesseractException e) {
-            LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e);
-            throw new OcrException(
-                    "Failed to perform OCR on file: " + pdfFile.getName() +
-                            ". Error: " + e.getMessage(), e
-            );
+            return OcrResult.success(result);
+
+        } catch (TesseractException e) {
+            // This could be either a user error (corrupt PDF) or our bug
+            // Log it as error but return as failure, not exception
+            LOGGER.error("OCR processing failed for file: {}", pdfFile.getName(), e);
+            return OcrResult.failure("Failed to extract text from PDF: " + e.getMessage());
         }
     }
 }