-
-
Notifications
You must be signed in to change notification settings - Fork 2.9k
OCR integration #13313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
OCR integration #13313
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package org.jabref.gui.linkedfile; | ||
|
||
import org.jabref.gui.DialogService; | ||
import org.jabref.gui.StateManager; | ||
import org.jabref.gui.actions.Action; | ||
import org.jabref.gui.actions.ActionHelper; | ||
import org.jabref.gui.actions.SimpleCommand; | ||
import org.jabref.logic.util.BackgroundTask; | ||
import org.jabref.logic.util.TaskExecutor; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.ocr.OcrService; | ||
import org.jabref.logic.ocr.OcrException; | ||
import org.jabref.model.database.BibDatabaseContext; | ||
import org.jabref.model.entry.LinkedFile; | ||
import org.jabref.logic.FilePreferences; | ||
|
||
import java.nio.file.Path; | ||
import java.util.Optional; | ||
|
||
/** | ||
* Action for performing OCR (Optical Character Recognition) on linked PDF files. | ||
* <p> | ||
* This action extracts text content from PDF files that are attached to BibTeX entries. | ||
* It runs the OCR process in a background thread to keep the UI responsive and provides | ||
* user feedback through dialogs and notifications. | ||
* <p> | ||
* The action follows JabRef's command pattern and can be triggered from context menus. | ||
* It includes built-in validation to ensure it's only enabled for PDF files that exist on disk. | ||
* | ||
* @see OcrService | ||
* @see org.jabref.gui.actions.SimpleCommand | ||
*/ | ||
|
||
// Personal Note: Add more doc in between later | ||
|
||
public class OcrAction extends SimpleCommand { | ||
|
||
private final LinkedFile linkedFile; | ||
private final BibDatabaseContext databaseContext; | ||
private final DialogService dialogService; | ||
private final FilePreferences filePreferences; | ||
private final TaskExecutor taskExecutor; | ||
|
||
public OcrAction(LinkedFile linkedFile, | ||
BibDatabaseContext databaseContext, | ||
DialogService dialogService, | ||
FilePreferences filePreferences, | ||
TaskExecutor taskExecutor) { | ||
this.linkedFile = linkedFile; | ||
this.databaseContext = databaseContext; | ||
this.dialogService = dialogService; | ||
this.filePreferences = filePreferences; | ||
this.taskExecutor = taskExecutor; | ||
|
||
// Only executable for existing PDF files | ||
this.executable.set( | ||
linkedFile.getFileType().equalsIgnoreCase("pdf") && | ||
linkedFile.findIn(databaseContext, filePreferences).isPresent() | ||
); | ||
} | ||
|
||
@Override | ||
public void execute() { | ||
Optional<Path> filePath = linkedFile.findIn(databaseContext, filePreferences); | ||
|
||
if (filePath.isEmpty()) { | ||
dialogService.showErrorDialogAndWait( | ||
Localization.lang("File not found"), | ||
Localization.lang("Could not locate the PDF file on disk.") | ||
); | ||
return; | ||
} | ||
|
||
dialogService.notify(Localization.lang("Performing OCR...")); | ||
|
||
BackgroundTask.wrap(() -> { | ||
OcrService ocrService = new OcrService(); | ||
return ocrService.performOcr(filePath.get()); | ||
}) | ||
.onSuccess(extractedText -> { | ||
if (extractedText.isEmpty()) { | ||
dialogService.showInformationDialogAndWait( | ||
Localization.lang("OCR Complete"), | ||
Localization.lang("No text was found in the PDF.") | ||
); | ||
} else { | ||
// For now, just show preview | ||
String preview = extractedText.length() > 1000 | ||
? extractedText.substring(0, 1000) + "..." | ||
: extractedText; | ||
|
||
dialogService.showInformationDialogAndWait( | ||
Localization.lang("OCR Result"), | ||
preview | ||
); | ||
} | ||
}) | ||
.onFailure(exception -> { | ||
dialogService.showErrorDialogAndWait( | ||
Localization.lang("OCR failed"), | ||
exception.getMessage() | ||
); | ||
}) | ||
.executeWith(taskExecutor); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package org.jabref.logic.ocr; | ||
|
||
/** | ||
* Exception thrown when OCR operations fail. | ||
* This exception wraps lower-level OCR engine exceptions to provide | ||
* a consistent interface for error handling throughout JabRef. | ||
*/ | ||
public class OcrException extends Exception {

    /**
     * Creates an exception that carries only a description of the failure.
     *
     * @param message Descriptive error message
     */
    public OcrException(String message) {
        super(message);
    }

    /**
     * Creates an exception that carries a description of the failure and the
     * lower-level exception that triggered it.
     *
     * @param message Descriptive error message
     * @param cause   The underlying exception that caused this error
     */
    public OcrException(String message, Throwable cause) {
        super(message, cause);
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
package org.jabref.logic.ocr; | ||
|
||
import java.io.File; | ||
import java.nio.file.Path; | ||
|
||
import org.jabref.model.strings.StringUtil; | ||
|
||
import com.sun.jna.Platform; | ||
import net.sourceforge.tess4j.Tesseract; | ||
import net.sourceforge.tess4j.TesseractException; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
/** | ||
* Service for performing Optical Character Recognition (OCR) on PDF files. | ||
* This class provides a high-level interface to OCR functionality, | ||
* abstracting away the specific OCR engine implementation details. | ||
*/ | ||
public class OcrService { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class); | ||
private static final String JNA_LIBRARY_PATH = "jna.library.path"; | ||
// The OCR engine instance | ||
private final Tesseract tesseract; | ||
Comment on lines
+24
to
+25
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The comment is trivial and can be derived directly from the code. It doesn't add any new information about the implementation or reasoning behind using Tesseract. |
||
|
||
/** | ||
* Constructs a new OcrService with default settings. | ||
* Currently uses Tesseract with English language support. | ||
*/ | ||
public OcrService() { | ||
if (Platform.isMac()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adapted from https://github.com/nguyenq/tess4j/pull/240/files |
||
if (Platform.isARM()) { | ||
System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/opt/homebrew/lib/"); | ||
} else { | ||
System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/usr/local/cellar/"); | ||
} | ||
} | ||
this.tesseract = new Tesseract(); | ||
|
||
// Configure Tesseract | ||
tesseract.setLanguage("eng"); | ||
|
||
// TODO: This path needs to be configurable and bundled properly | ||
// For now, we'll use a relative path that works during development | ||
tesseract.setDatapath("tessdata"); | ||
subhramit marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
LOGGER.debug("Initialized OcrService with Tesseract"); | ||
} | ||
|
||
/** | ||
* Performs OCR on a PDF file and returns the extracted text. | ||
* | ||
* @param pdfPath Path to the PDF file to process | ||
* @return The extracted text, or empty string if no text found | ||
* @throws OcrException if OCR processing fails | ||
*/ | ||
public String performOcr(Path pdfPath) throws OcrException { | ||
// Validate input | ||
if (pdfPath == null) { | ||
throw new OcrException("PDF path cannot be null"); | ||
} | ||
|
||
File pdfFile = pdfPath.toFile(); | ||
if (!pdfFile.exists()) { | ||
throw new OcrException("PDF file does not exist: " + pdfPath); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use exceptions only for exceptional states. See https://github.com/HugoMatilla/Effective-JAVA-Summary?tab=readme-ov-file#57-use-exceptions-only-for-exceptional-conditions |
||
|
||
try { | ||
LOGGER.info("Starting OCR for file: {}", pdfFile.getName()); | ||
|
||
// Perform OCR | ||
String result = tesseract.doOCR(pdfFile); | ||
|
||
// Clean up the result (remove extra whitespace, etc.) | ||
result = StringUtil.isBlank(result) ? "" : result.trim(); | ||
|
||
LOGGER.info("OCR completed successfully. Extracted {} characters", result.length()); | ||
return result; | ||
} catch ( | ||
TesseractException e) { | ||
LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e); | ||
throw new OcrException( | ||
"Failed to perform OCR on file: " + pdfFile.getName() + | ||
". Error: " + e.getMessage(), e | ||
); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add `.showToUser(true)` before this; the task will then be shown in the UI task list.