From a51e3b08016f091940b44908cb394e420b78198f Mon Sep 17 00:00:00 2001 From: Kaan0029 Date: Thu, 12 Jun 2025 20:57:22 +0200 Subject: [PATCH 1/3] Initial implementation using tess4j --- .gitignore | 2 + .../gui/fieldeditors/LinkedFilesEditor.java | 3 +- .../contextmenu/ContextMenuFactory.java | 45 +++++++- .../org/jabref/gui/linkedfile/OcrAction.java | 106 ++++++++++++++++++ .../contextmenu/ContextMenuFactoryTest.java | 5 +- jablib/build.gradle.kts | 3 + jablib/src/main/java/module-info.java | 2 + .../org/jabref/logic/ocr/OcrException.java | 28 +++++ .../java/org/jabref/logic/ocr/OcrService.java | 78 +++++++++++++ .../main/resources/l10n/JabRef_en.properties | 8 ++ 10 files changed, 277 insertions(+), 3 deletions(-) create mode 100644 jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java create mode 100644 jablib/src/main/java/org/jabref/logic/ocr/OcrException.java create mode 100644 jablib/src/main/java/org/jabref/logic/ocr/OcrService.java diff --git a/.gitignore b/.gitignore index c94289d0f06..c0e5370009f 100644 --- a/.gitignore +++ b/.gitignore @@ -581,3 +581,5 @@ CHANGELOG.html # some strange gradle/IntelliJ extension extension 'reporting' property 'baseDirectory' + +tessdata/ diff --git a/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java b/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java index ff6992be048..554c58cae8a 100644 --- a/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java +++ b/jabgui/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java @@ -325,7 +325,8 @@ private void handleItemMouseClick(LinkedFileViewModel linkedFile, MouseEvent eve bibEntry, viewModel, contextCommandFactory, - multiContextCommandFactory + multiContextCommandFactory, + taskExecutor ); ContextMenu contextMenu = contextMenuFactory.createForSelection(listView.getSelectionModel().getSelectedItems()); diff --git a/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java b/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java index a6b3262450e..9c05875e180 100644 --- a/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java +++ b/jabgui/src/main/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactory.java @@ -2,6 +2,7 @@ import javafx.collections.ObservableList; import javafx.scene.control.ContextMenu; +import javafx.scene.control.MenuItem; import javafx.scene.control.SeparatorMenuItem; import org.jabref.gui.DialogService; @@ -10,7 +11,10 @@ import org.jabref.gui.copyfiles.CopySingleFileAction; import org.jabref.gui.fieldeditors.LinkedFileViewModel; import org.jabref.gui.fieldeditors.LinkedFilesEditorViewModel; +import org.jabref.gui.linkedfile.OcrAction; import org.jabref.gui.preferences.GuiPreferences; +import org.jabref.logic.l10n.Localization; +import org.jabref.logic.util.TaskExecutor; import org.jabref.model.database.BibDatabaseContext; import org.jabref.model.entry.BibEntry; @@ -25,6 +29,7 @@ public class ContextMenuFactory { private final LinkedFilesEditorViewModel viewModel; private final SingleContextCommandFactory singleCommandFactory; private final MultiContextCommandFactory multiCommandFactory; + private final TaskExecutor taskExecutor; public ContextMenuFactory(DialogService dialogService, GuiPreferences preferences, @@ -32,7 +37,8 @@ public ContextMenuFactory(DialogService dialogService, ObservableOptionalValue bibEntry, LinkedFilesEditorViewModel viewModel, SingleContextCommandFactory singleCommandFactory, - MultiContextCommandFactory multiCommandFactory) { + MultiContextCommandFactory multiCommandFactory, + TaskExecutor taskExecutor) { this.dialogService = dialogService; this.preferences = preferences; this.databaseContext = databaseContext; @@ -40,6 +46,7 @@ public ContextMenuFactory(DialogService dialogService, this.viewModel = viewModel; this.singleCommandFactory = singleCommandFactory; this.multiCommandFactory = multiCommandFactory; + this.taskExecutor = taskExecutor; } public ContextMenu createForSelection(ObservableList selectedFiles) { @@ -86,9 +93,45 @@ private ContextMenu createContextMenuForFile(LinkedFileViewModel linkedFile) { factory.createMenuItem(StandardActions.DELETE_FILE, singleCommandFactory.build(StandardActions.DELETE_FILE, linkedFile)) ); + // Add OCR menu item for PDF files + if (linkedFile.getFile().getFileType().equalsIgnoreCase("pdf")) { + menu.getItems().add(new SeparatorMenuItem()); + + MenuItem ocrItem = createOcrMenuItem(linkedFile); + menu.getItems().add(ocrItem); + } + return menu; } + /** + * Creates the OCR menu item for a PDF file. + * The menu item is only enabled if the PDF file exists on disk. + * + * @param linkedFile The linked PDF file + * @return MenuItem configured for OCR action + */ + private MenuItem createOcrMenuItem(LinkedFileViewModel linkedFile) { + MenuItem ocrItem = new MenuItem(Localization.lang("Extract text (OCR)")); + + // Create the OCR action + OcrAction ocrAction = new OcrAction( + linkedFile.getFile(), + databaseContext, + dialogService, + preferences.getFilePreferences(), + taskExecutor + ); + + // Set the action to execute when clicked + ocrItem.setOnAction(event -> ocrAction.execute()); + + // Disable if the action is not executable (file doesn't exist) + ocrItem.disableProperty().bind(ocrAction.executableProperty().not()); + + return ocrItem; + } + @FunctionalInterface public interface SingleContextCommandFactory { ContextAction build(StandardActions action, LinkedFileViewModel file); diff --git a/jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java b/jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java new file mode 100644 index 00000000000..2d5f22773f4 --- /dev/null +++ b/jabgui/src/main/java/org/jabref/gui/linkedfile/OcrAction.java @@ -0,0 +1,106 @@ +package org.jabref.gui.linkedfile; + +import org.jabref.gui.DialogService; +import org.jabref.gui.StateManager; +import org.jabref.gui.actions.Action; +import org.jabref.gui.actions.ActionHelper; +import org.jabref.gui.actions.SimpleCommand; +import org.jabref.logic.util.BackgroundTask; +import org.jabref.logic.util.TaskExecutor; +import org.jabref.logic.l10n.Localization; +import org.jabref.logic.ocr.OcrService; +import org.jabref.logic.ocr.OcrException; +import org.jabref.model.database.BibDatabaseContext; +import org.jabref.model.entry.LinkedFile; +import org.jabref.logic.FilePreferences; + +import java.nio.file.Path; +import java.util.Optional; + +/** + * Action for performing OCR (Optical Character Recognition) on linked PDF files. + *

+ * This action extracts text content from PDF files that are attached to BibTeX entries. + * It runs the OCR process in a background thread to keep the UI responsive and provides + * user feedback through dialogs and notifications. + *

+ * The action follows JabRef's command pattern and can be triggered from context menus. + * It includes built-in validation to ensure it's only enabled for PDF files that exist on disk. + * + * @see OcrService + * @see org.jabref.gui.actions.SimpleCommand + */ + +// Personal Note: Add more doc in between later + +public class OcrAction extends SimpleCommand { + + private final LinkedFile linkedFile; + private final BibDatabaseContext databaseContext; + private final DialogService dialogService; + private final FilePreferences filePreferences; + private final TaskExecutor taskExecutor; + + public OcrAction(LinkedFile linkedFile, + BibDatabaseContext databaseContext, + DialogService dialogService, + FilePreferences filePreferences, + TaskExecutor taskExecutor) { + this.linkedFile = linkedFile; + this.databaseContext = databaseContext; + this.dialogService = dialogService; + this.filePreferences = filePreferences; + this.taskExecutor = taskExecutor; + + // Only executable for existing PDF files + this.executable.set( + linkedFile.getFileType().equalsIgnoreCase("pdf") && + linkedFile.findIn(databaseContext, filePreferences).isPresent() + ); + } + + @Override + public void execute() { + Optional filePath = linkedFile.findIn(databaseContext, filePreferences); + + if (filePath.isEmpty()) { + dialogService.showErrorDialogAndWait( + Localization.lang("File not found"), + Localization.lang("Could not locate the PDF file on disk.") + ); + return; + } + + dialogService.notify(Localization.lang("Performing OCR...")); + + BackgroundTask.wrap(() -> { + OcrService ocrService = new OcrService(); + return ocrService.performOcr(filePath.get()); + }) + .onSuccess(extractedText -> { + if (extractedText.isEmpty()) { + dialogService.showInformationDialogAndWait( + Localization.lang("OCR Complete"), + Localization.lang("No text was found in the PDF.") + ); + } else { + // For now, just show preview + String preview = extractedText.length() > 1000 + ? extractedText.substring(0, 1000) + "..." + : extractedText; + + dialogService.showInformationDialogAndWait( + Localization.lang("OCR Result"), + preview + ); + } + }) + .onFailure(exception -> { + dialogService.showErrorDialogAndWait( + Localization.lang("OCR failed"), + exception.getMessage() + ); + }) + .executeWith(taskExecutor); + } +} diff --git a/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java b/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java index 141bc6e44e5..ec777d2b379 100644 --- a/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java +++ b/jabgui/src/test/java/org/jabref/gui/fieldeditors/contextmenu/ContextMenuFactoryTest.java @@ -11,6 +11,7 @@ import org.jabref.gui.fieldeditors.LinkedFileViewModel; import org.jabref.gui.fieldeditors.LinkedFilesEditorViewModel; import org.jabref.gui.preferences.GuiPreferences; +import org.jabref.logic.util.TaskExecutor; import org.jabref.model.database.BibDatabaseContext; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.LinkedFile; @@ -41,6 +42,7 @@ public class ContextMenuFactoryTest { private ContextMenuFactory factory; private ContextMenuFactory.SingleContextCommandFactory singleCommandFactory; private ContextMenuFactory.MultiContextCommandFactory multiCommandFactory; + private TaskExecutor taskExecutor; @BeforeAll public static void initToolkit() { @@ -78,7 +80,8 @@ public void setUp() { bibEntry, viewModel, singleCommandFactory, - multiCommandFactory + multiCommandFactory, + taskExecutor ); } diff --git a/jablib/build.gradle.kts b/jablib/build.gradle.kts index 4581f58e23f..5e43cd7ef2b 100644 --- a/jablib/build.gradle.kts +++ b/jablib/build.gradle.kts @@ -243,6 +243,9 @@ dependencies { // Required for LocalizationConsistencyTest testImplementation("org.testfx:testfx-core:4.0.16-alpha") testImplementation("org.testfx:testfx-junit5:4.0.16-alpha") + + // OCR support + implementation("net.sourceforge.tess4j:tess4j:5.15.0") } /* jacoco { diff --git a/jablib/src/main/java/module-info.java b/jablib/src/main/java/module-info.java index a792ae43f7d..85258c1526a 100644 --- a/jablib/src/main/java/module-info.java +++ b/jablib/src/main/java/module-info.java @@ -104,6 +104,7 @@ exports org.jabref.logic.crawler; exports org.jabref.logic.git; exports org.jabref.logic.pseudonymization; + exports org.jabref.logic.ocr; requires java.base; @@ -250,5 +251,6 @@ requires mslinks; requires org.antlr.antlr4.runtime; requires org.libreoffice.uno; + requires tess4j; // endregion } diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrException.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrException.java new file mode 100644 index 00000000000..808f4d93a7a --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrException.java @@ -0,0 +1,28 @@ +package org.jabref.logic.ocr; + +/** + * Exception thrown when OCR operations fail. + * This exception wraps lower-level OCR engine exceptions to provide + * a consistent interface for error handling throughout JabRef. + */ +public class OcrException extends Exception { + + /** + * Constructs an OcrException with a message and underlying cause. + * + * @param message Descriptive error message + * @param cause The underlying exception that caused this error + */ + public OcrException(String message, Throwable cause) { + super(message, cause); + } + + /** + * Constructs an OcrException with only a message. + * + * @param message Descriptive error message + */ + public OcrException(String message) { + super(message); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java new file mode 100644 index 00000000000..d51695fcd22 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java @@ -0,0 +1,78 @@ +package org.jabref.logic.ocr; + +import net.sourceforge.tess4j.Tesseract; +import net.sourceforge.tess4j.TesseractException; +import org.jabref.model.strings.StringUtil; // JabRef utility class +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.nio.file.Path; + +/** + * Service for performing Optical Character Recognition (OCR) on PDF files. + * This class provides a high-level interface to OCR functionality, + * abstracting away the specific OCR engine implementation details. + */ +public class OcrService { + private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class); + + // The OCR engine instance + private final Tesseract tesseract; + + /** + * Constructs a new OcrService with default settings. + * Currently uses Tesseract with English language support. + */ + public OcrService() { + this.tesseract = new Tesseract(); + + // Configure Tesseract + tesseract.setLanguage("eng"); + + // TODO: This path needs to be configurable and bundled properly + // For now, we'll use a relative path that works during development + tesseract.setDatapath("tessdata"); + + LOGGER.debug("Initialized OcrService with Tesseract"); + } + + /** + * Performs OCR on a PDF file and returns the extracted text. + * + * @param pdfPath Path to the PDF file to process + * @return The extracted text, or empty string if no text found + * @throws OcrException if OCR processing fails + */ + public String performOcr(Path pdfPath) throws OcrException { + // Validate input + if (pdfPath == null) { + throw new OcrException("PDF path cannot be null"); + } + + File pdfFile = pdfPath.toFile(); + if (!pdfFile.exists()) { + throw new OcrException("PDF file does not exist: " + pdfPath); + } + + try { + LOGGER.info("Starting OCR for file: {}", pdfFile.getName()); + + // Perform OCR + String result = tesseract.doOCR(pdfFile); + + // Clean up the result (remove extra whitespace, etc.) + result = StringUtil.isBlank(result) ? "" : result.trim(); + + LOGGER.info("OCR completed successfully. Extracted {} characters", result.length()); + return result; + + } catch (TesseractException e) { + LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e); + throw new OcrException( + "Failed to perform OCR on file: " + pdfFile.getName() + + ". Error: " + e.getMessage(), e + ); + } + } +} diff --git a/jablib/src/main/resources/l10n/JabRef_en.properties b/jablib/src/main/resources/l10n/JabRef_en.properties index bff236c92a6..85456f30e59 100644 --- a/jablib/src/main/resources/l10n/JabRef_en.properties +++ b/jablib/src/main/resources/l10n/JabRef_en.properties @@ -316,6 +316,14 @@ Extract\ references\ from\ file\ (online)=Extract references from file (online) Extract\ References\ (offline)=Extract References (offline) Extract\ References\ (online)=Extract References (online) +Extract\ text\ (OCR)=Extract text (OCR) +Performing\ OCR...=Performing OCR... +OCR\ Complete=OCR Complete +OCR\ Result=OCR Result +OCR\ failed=OCR failed +No\ text\ was\ found\ in\ the\ PDF.=No text was found in the PDF. +Could\ not\ locate\ the\ PDF\ file\ on\ disk.=Could not locate the PDF file on disk. + Processing...=Processing... Processing\ "%0"...=Processing "%0"... Processing\ PDF(s)=Processing PDF(s) From 48ffb067bef2e38faaf7e9a677a396604288b439 Mon Sep 17 00:00:00 2001 From: Siedlerchr Date: Thu, 12 Jun 2025 21:16:40 +0200 Subject: [PATCH 2/3] add brew to jna path --- .../java/org/jabref/logic/ocr/OcrService.java | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java index d51695fcd22..4f68c1ab5c2 100644 --- a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java +++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java @@ -1,14 +1,16 @@ package org.jabref.logic.ocr; +import java.io.File; +import java.nio.file.Path; + +import org.jabref.model.strings.StringUtil; + +import com.sun.jna.Platform; import net.sourceforge.tess4j.Tesseract; import net.sourceforge.tess4j.TesseractException; -import org.jabref.model.strings.StringUtil; // JabRef utility class import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.nio.file.Path; - /** * Service for performing Optical Character Recognition (OCR) on PDF files. * This class provides a high-level interface to OCR functionality, @@ -16,7 +18,7 @@ */ public class OcrService { private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class); - + private static final String JNA_LIBRARY_PATH = "jna.library.path"; // The OCR engine instance private final Tesseract tesseract; @@ -25,6 +27,13 @@ public class OcrService { * Currently uses Tesseract with English language support. */ public OcrService() { + if (Platform.isMac()) { + if (Platform.isARM()) { + System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/opt/homebrew/lib/"); + } else { + System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/usr/local/cellar/"); + } + } this.tesseract = new Tesseract(); // Configure Tesseract @@ -66,8 +75,8 @@ public String performOcr(Path pdfPath) throws OcrException { LOGGER.info("OCR completed successfully. Extracted {} characters", result.length()); return result; - - } catch (TesseractException e) { + } catch ( + TesseractException e) { LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e); throw new OcrException( "Failed to perform OCR on file: " + pdfFile.getName() + From 5a256ae71e3eeb36ecd3de97812f2e68632df1c3 Mon Sep 17 00:00:00 2001 From: Kaan0029 Date: Thu, 19 Jun 2025 20:39:37 +0200 Subject: [PATCH 3/3] Adapted Exception handling and configured tessdata variable --- .../java/org/jabref/logic/ocr/OcrResult.java | 35 ++++++ .../java/org/jabref/logic/ocr/OcrService.java | 106 +++++++++++++----- 2 files changed, 116 insertions(+), 25 deletions(-) create mode 100644 jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java new file mode 100644 index 00000000000..8cbe573e938 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrResult.java @@ -0,0 +1,35 @@ +package org.jabref.logic.ocr; + +import java.util.Optional; + +public class OcrResult { + private final boolean success; + private final String text; + private final String errorMessage; + + private OcrResult(boolean success, String text, String errorMessage) { + this.success = success; + this.text = text; + this.errorMessage = errorMessage; + } + + public static OcrResult success(String text) { + return new OcrResult(true, text, null); + } + + public static OcrResult failure(String errorMessage) { + return new OcrResult(false, null, errorMessage); + } + + public boolean isSuccess() { + return success; + } + + public Optional getText() { + return Optional.ofNullable(text); + } + + public Optional getErrorMessage() { + return Optional.ofNullable(errorMessage); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java index 4f68c1ab5c2..37714273e6d 100644 --- a/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java +++ b/jablib/src/main/java/org/jabref/logic/ocr/OcrService.java @@ -19,6 +19,8 @@ public class OcrService { private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class); private static final String JNA_LIBRARY_PATH = "jna.library.path"; + private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX"; + // The OCR engine instance private final Tesseract tesseract; @@ -26,24 +28,78 @@ public class OcrService { * Constructs a new OcrService with default settings. * Currently uses Tesseract with English language support. */ - public OcrService() { + public OcrService() throws OcrException { + configureLibraryPath(); + + try { + this.tesseract = new Tesseract(); + tesseract.setLanguage("eng"); + configureTessdata(); + LOGGER.debug("Initialized OcrService with Tesseract"); + } catch (Exception e) { + throw new OcrException("Failed to initialize OCR engine", e); + } + } + + private void configureLibraryPath() { if (Platform.isMac()) { + String originalPath = System.getProperty(JNA_LIBRARY_PATH, ""); if (Platform.isARM()) { - System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/opt/homebrew/lib/"); + System.setProperty(JNA_LIBRARY_PATH, + originalPath + File.pathSeparator + "/opt/homebrew/lib/"); + } else { + System.setProperty(JNA_LIBRARY_PATH, + originalPath + File.pathSeparator + "/usr/local/cellar/"); + } + } + } + + private void configureTessdata() throws OcrException { + // First, check environment variable + String tessdataPath = System.getenv(TESSDATA_PREFIX); + + if (tessdataPath != null && !tessdataPath.isEmpty()) { + File tessdataDir = new File(tessdataPath); + if (tessdataDir.exists() && tessdataDir.isDirectory()) { + // Tesseract expects the parent directory of tessdata + if (tessdataDir.getName().equals("tessdata")) { + tesseract.setDatapath(tessdataDir.getParent()); + } else { + tesseract.setDatapath(tessdataPath); + } + LOGGER.info("Using tessdata from environment variable: {}", tessdataPath); + return; } else { - System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/usr/local/cellar/"); + LOGGER.warn("TESSDATA_PREFIX points to non-existent directory: {}", tessdataPath); } } - this.tesseract = new Tesseract(); - // Configure Tesseract - tesseract.setLanguage("eng"); + // Fall back to system locations + String systemPath = findSystemTessdata(); + if (systemPath != null) { + tesseract.setDatapath(systemPath); + LOGGER.info("Using system tessdata at: {}", systemPath); + } else { + throw new OcrException("Could not find tessdata directory. Please set TESSDATA_PREFIX environment variable."); + } + } + + private String findSystemTessdata() { + String[] possiblePaths = { + "/usr/local/share", // Homebrew Intel + "/opt/homebrew/share", // Homebrew ARM + "/usr/share" // System + }; - // TODO: This path needs to be configurable and bundled properly - // For now, we'll use a relative path that works during development - tesseract.setDatapath("tessdata"); + for (String path : possiblePaths) { + File tessdata = new File(path, "tessdata"); + File engData = new File(tessdata, "eng.traineddata"); + if (tessdata.exists() && engData.exists()) { + return path; // Return parent of tessdata + } + } - LOGGER.debug("Initialized OcrService with Tesseract"); + return null; } /** @@ -53,35 +109,35 @@ public OcrService() { * @return The extracted text, or empty string if no text found * @throws OcrException if OCR processing fails */ - public String performOcr(Path pdfPath) throws OcrException { - // Validate input + public OcrResult performOcr(Path pdfPath) { + // User error - not an exception if (pdfPath == null) { - throw new OcrException("PDF path cannot be null"); + LOGGER.warn("PDF path is null"); + return OcrResult.failure("No file path provided"); } File pdfFile = pdfPath.toFile(); + + // User error - not an exception if (!pdfFile.exists()) { - throw new OcrException("PDF file does not exist: " + pdfPath); + LOGGER.warn("PDF file does not exist: {}", pdfPath); + return OcrResult.failure("File does not exist: " + pdfPath.getFileName()); } try { LOGGER.info("Starting OCR for file: {}", pdfFile.getName()); - // Perform OCR String result = tesseract.doOCR(pdfFile); - - // Clean up the result (remove extra whitespace, etc.) result = StringUtil.isBlank(result) ? "" : result.trim(); LOGGER.info("OCR completed successfully. Extracted {} characters", result.length()); - return result; - } catch ( - TesseractException e) { - LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e); - throw new OcrException( - "Failed to perform OCR on file: " + pdfFile.getName() + - ". Error: " + e.getMessage(), e - ); + return OcrResult.success(result); + + } catch (TesseractException e) { + // This could be either a user error (corrupt PDF) or our bug + // Log it as error but return as failure, not exception + LOGGER.error("OCR processing failed for file: {}", pdfFile.getName(), e); + return OcrResult.failure("Failed to extract text from PDF: " + e.getMessage()); } } }