-
-
Notifications
You must be signed in to change notification settings - Fork 2.9k
OCR integration #13313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
OCR integration #13313
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package org.jabref.logic.ocr; | ||
|
||
import java.util.Optional; | ||
|
||
public class OcrResult { | ||
private final boolean success; | ||
private final String text; | ||
private final String errorMessage; | ||
|
||
private OcrResult(boolean success, String text, String errorMessage) { | ||
this.success = success; | ||
this.text = text; | ||
this.errorMessage = errorMessage; | ||
} | ||
|
||
public static OcrResult success(String text) { | ||
return new OcrResult(true, text, null); | ||
Comment on lines
+10
to
+17
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There should be a better way to think about the constructor parameters. Maybe have different constructors to avoid passing null (as both success and failure are very expected cases)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The other idea is to model a sealed class hierarchy with sucess/failure as simple record and then you can do pattern matching with switch ... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. example at the bottom https://gamlor.info/posts-output/2022-06-01-java-advanced-enums/en/ |
||
} | ||
Comment on lines
+16
to
+18
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Factory method passes null explicitly as a parameter. This violates the principle of not passing null to methods. Consider using Optional or restructuring to avoid null parameter. |
||
|
||
public static OcrResult failure(String errorMessage) { | ||
return new OcrResult(false, null, errorMessage); | ||
} | ||
Comment on lines
+20
to
+22
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Factory method passes null explicitly as a parameter. This violates the principle of not passing null to methods. Consider using Optional or restructuring to avoid null parameter. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Optional should only be a return type |
||
|
||
public boolean isSuccess() { | ||
return success; | ||
} | ||
|
||
public Optional<String> getText() { | ||
return Optional.ofNullable(text); | ||
} | ||
|
||
public Optional<String> getErrorMessage() { | ||
return Optional.ofNullable(errorMessage); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,31 +19,87 @@ | |
public class OcrService { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class); | ||
private static final String JNA_LIBRARY_PATH = "jna.library.path"; | ||
private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX"; | ||
|
||
// The OCR engine instance | ||
private final Tesseract tesseract; | ||
|
||
/** | ||
* Constructs a new OcrService with default settings. | ||
* Currently uses Tesseract with English language support. | ||
*/ | ||
public OcrService() { | ||
public OcrService() throws OcrException { | ||
configureLibraryPath(); | ||
|
||
try { | ||
this.tesseract = new Tesseract(); | ||
tesseract.setLanguage("eng"); | ||
configureTessdata(); | ||
LOGGER.debug("Initialized OcrService with Tesseract"); | ||
} catch (Exception e) { | ||
throw new OcrException("Failed to initialize OCR engine", e); | ||
} | ||
} | ||
|
||
private void configureLibraryPath() { | ||
if (Platform.isMac()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adapted from https://github.com/nguyenq/tess4j/pull/240/files |
||
String originalPath = System.getProperty(JNA_LIBRARY_PATH, ""); | ||
if (Platform.isARM()) { | ||
System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/opt/homebrew/lib/"); | ||
System.setProperty(JNA_LIBRARY_PATH, | ||
originalPath + File.pathSeparator + "/opt/homebrew/lib/"); | ||
} else { | ||
System.setProperty(JNA_LIBRARY_PATH, | ||
originalPath + File.pathSeparator + "/usr/local/cellar/"); | ||
} | ||
} | ||
} | ||
|
||
private void configureTessdata() throws OcrException { | ||
// First, check environment variable | ||
String tessdataPath = System.getenv(TESSDATA_PREFIX); | ||
|
||
if (tessdataPath != null && !tessdataPath.isEmpty()) { | ||
File tessdataDir = new File(tessdataPath); | ||
if (tessdataDir.exists() && tessdataDir.isDirectory()) { | ||
// Tesseract expects the parent directory of tessdata | ||
if (tessdataDir.getName().equals("tessdata")) { | ||
tesseract.setDatapath(tessdataDir.getParent()); | ||
} else { | ||
tesseract.setDatapath(tessdataPath); | ||
} | ||
LOGGER.info("Using tessdata from environment variable: {}", tessdataPath); | ||
return; | ||
} else { | ||
System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/usr/local/cellar/"); | ||
LOGGER.warn("TESSDATA_PREFIX points to non-existent directory: {}", tessdataPath); | ||
} | ||
} | ||
this.tesseract = new Tesseract(); | ||
|
||
// Configure Tesseract | ||
tesseract.setLanguage("eng"); | ||
// Fall back to system locations | ||
String systemPath = findSystemTessdata(); | ||
if (systemPath != null) { | ||
tesseract.setDatapath(systemPath); | ||
LOGGER.info("Using system tessdata at: {}", systemPath); | ||
} else { | ||
throw new OcrException("Could not find tessdata directory. Please set TESSDATA_PREFIX environment variable."); | ||
Comment on lines
+82
to
+83
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid logical branching to throw exceptions. |
||
} | ||
} | ||
|
||
private String findSystemTessdata() { | ||
String[] possiblePaths = { | ||
"/usr/local/share", // Homebrew Intel | ||
"/opt/homebrew/share", // Homebrew ARM | ||
"/usr/share" // System | ||
}; | ||
|
||
// TODO: This path needs to be configurable and bundled properly | ||
// For now, we'll use a relative path that works during development | ||
tesseract.setDatapath("tessdata"); | ||
for (String path : possiblePaths) { | ||
File tessdata = new File(path, "tessdata"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use java.nio with Path and File.exists(...) https://docs.oracle.com/javase/tutorial/essential/io/legacy.html |
||
File engData = new File(tessdata, "eng.traineddata"); | ||
if (tessdata.exists() && engData.exists()) { | ||
return path; // Return parent of tessdata | ||
} | ||
} | ||
|
||
LOGGER.debug("Initialized OcrService with Tesseract"); | ||
return null; | ||
} | ||
|
||
/** | ||
|
@@ -53,35 +109,35 @@ public OcrService() { | |
* @return The extracted text, or empty string if no text found | ||
* @throws OcrException if OCR processing fails | ||
*/ | ||
public String performOcr(Path pdfPath) throws OcrException { | ||
// Validate input | ||
public OcrResult performOcr(Path pdfPath) { | ||
// User error - not an exception | ||
if (pdfPath == null) { | ||
throw new OcrException("PDF path cannot be null"); | ||
LOGGER.warn("PDF path is null"); | ||
return OcrResult.failure("No file path provided"); | ||
} | ||
|
||
File pdfFile = pdfPath.toFile(); | ||
|
||
// User error - not an exception | ||
if (!pdfFile.exists()) { | ||
throw new OcrException("PDF file does not exist: " + pdfPath); | ||
LOGGER.warn("PDF file does not exist: {}", pdfPath); | ||
return OcrResult.failure("File does not exist: " + pdfPath.getFileName()); | ||
} | ||
|
||
try { | ||
LOGGER.info("Starting OCR for file: {}", pdfFile.getName()); | ||
|
||
// Perform OCR | ||
String result = tesseract.doOCR(pdfFile); | ||
|
||
// Clean up the result (remove extra whitespace, etc.) | ||
result = StringUtil.isBlank(result) ? "" : result.trim(); | ||
|
||
LOGGER.info("OCR completed successfully. Extracted {} characters", result.length()); | ||
return result; | ||
} catch ( | ||
TesseractException e) { | ||
LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e); | ||
throw new OcrException( | ||
"Failed to perform OCR on file: " + pdfFile.getName() + | ||
". Error: " + e.getMessage(), e | ||
); | ||
return OcrResult.success(result); | ||
|
||
} catch (TesseractException e) { | ||
// This could be either a user error (corrupt PDF) or our bug | ||
// Log it as error but return as failure, not exception | ||
LOGGER.error("OCR processing failed for file: {}", pdfFile.getName(), e); | ||
return OcrResult.failure("Failed to extract text from PDF: " + e.getMessage()); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe can be converted to a record ? https://docs.oracle.com/en/java/javase/24/language/records.html