Skip to content

Add more file importers to JabRef #13310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public enum StandardExternalFileType implements ExternalFileType {
TIFF(Localization.lang("%0 image", "TIFF"), "tiff", "image/tiff", "gimp", "picture", IconTheme.JabRefIcons.PICTURE),
URL("URL", "html", "text/html", "firefox", "www", IconTheme.JabRefIcons.WWW),
MHT("MHT", "mht", "multipart/related", "firefox", "www", IconTheme.JabRefIcons.WWW),
ePUB("ePUB", "epub", "application/epub+zip", "firefox", "www", IconTheme.JabRefIcons.WWW),
ePUB("ePUB", "epub", "application/epub+zip", "firefox", "www", IconTheme.JabRefIcons.BOOK),
MARKDOWN("Markdown", "md", "text/markdown", "emacs", "emacs", IconTheme.JabRefIcons.FILE_TEXT);
private final String name;
private final String extension;
Expand Down
31 changes: 31 additions & 0 deletions jablib/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,33 @@ dependencies {

implementation("de.rototor.snuggletex:snuggletex-jeuclid")

// region for document importing
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rework with the new scheme.

Put the concrete versions at https://github.com/JabRef/jabref/blob/main/versions/build.gradle.kts

implementation("org.apache.tika:tika-core:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.tika:tika-parsers:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.tika:tika-parser-xml-module:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.tika:tika-parser-image-module:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.tika:tika-parser-microsoft-module:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.tika:tika-parser-text-module:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.tika:tika-parser-miscoffice-module:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.poi:poi:5.4.1")
// TODO: Remove this mail dependency.
implementation("com.sun.mail:jakarta.mail:2.0.1")
// endregion

// Even if("compileOnly") is used, IntelliJ always adds to module-info.java. To avoid issues during committing, we use("implementation") instead of("compileOnly")
implementation("io.github.adr:e-adr")

Expand Down Expand Up @@ -412,6 +439,10 @@ tasks.test {
useJUnitPlatform {
excludeTags("DatabaseTest", "FetcherTest")
}

jvmArgs(
"--add-exports=org.apache.poi.ooxml/org.apache.poi.xslf.extractor=org.apache.tika.parser.microsoft"
)
}

jmh {
Expand Down
6 changes: 3 additions & 3 deletions jablib/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,16 +106,14 @@
exports org.jabref.logic.git;
exports org.jabref.logic.pseudonymization;
exports org.jabref.logic.citation.repository;

requires java.base;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, really, no java.base any more?

exports org.jabref.logic.importer.fileformat.misc;

requires javafx.base;
requires javafx.graphics; // because of javafx.scene.paint.Color
requires afterburner.fx;
requires com.tobiasdiez.easybind;

// for java.awt.geom.Rectangle2D required by org.jabref.logic.pdf.TextExtractor
requires java.desktop;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is org.jabref.logic.pdf.TextExtractor gone? Then also delete the line before.


// SQL
requires java.sql;
Expand Down Expand Up @@ -252,5 +250,7 @@
requires mslinks;
requires org.antlr.antlr4.runtime;
requires org.libreoffice.uno;
requires org.apache.tika.core;
requires org.jetbrains.annotations;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// endregion
}
4 changes: 4 additions & 0 deletions jablib/src/main/java/org/jabref/logic/importer/Importer.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ public abstract class Importer implements Comparable<Importer> {
* @throws IOException Signals that an I/O exception has occurred.
*/
public boolean isRecognizedFormat(Path filePath) throws IOException {
if (!Files.exists(filePath) || !Files.isRegularFile(filePath)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latter includes the former, doesn't it?

Suggested change
if (!Files.exists(filePath) || !Files.isRegularFile(filePath)) {
if (!Files.isRegularFile(filePath)) {

return false;
}

try (BufferedReader bufferedReader = getReader(filePath)) {
return isRecognizedFormat(bufferedReader);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ public static ParserResult fromErrorMessage(String message) {
return parserResult;
}

/**
 * Creates a {@link ParserResult} that contains exactly the given entry.
 * <p>
 * The entry is wrapped with {@link List#of(Object)}, which rejects {@code null}.
 *
 * @param entry the single entry to wrap
 * @return a new parser result built from a one-element list holding {@code entry}
 */
public static ParserResult fromEntry(BibEntry entry) {
return new ParserResult(List.of(entry));
}

private static String getErrorMessage(Exception exception) {
String errorMessage = exception.getLocalizedMessage();
if (exception.getCause() != null) {
Expand Down
94 changes: 94 additions & 0 deletions jablib/src/main/java/org/jabref/logic/importer/TikaImporter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package org.jabref.logic.importer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;

import org.jabref.logic.importer.util.TikaMetadataParser;
import org.jabref.logic.util.io.FileUtil;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.StandardEntryType;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

/**
 * Common class for all file importers that use Apache Tika to extract metadata from files.
 * <p>
 * Child classes should implement the rest of {@link Importer}. The common fields are collected by
 * {@link #extractMetadata(Metadata)}; override {@link #extractAdditionalMetadata(BibEntry, TikaMetadataParser)}
 * to add or adjust fields that are specific to a file type.
 * <p>
 * In case you need to use a specific Tika parser, you can override {@link #getTikaParser()} to return a different parser instance.
 */
public abstract class TikaImporter extends Importer {
    /**
     * Always fails: this importer needs the file itself, not a character reader.
     *
     * @throws UnsupportedOperationException always; use {@link #importDatabase(Path)} instead
     */
    @Override
    public ParserResult importDatabase(BufferedReader input) throws IOException {
        throw new UnsupportedOperationException("TikaImporter (and descendants) do not support importDatabase(BufferedReader reader). "
                + "Instead use importDatabase(Path filePath).");
    }

    /**
     * Parses the given file with the parser from {@link #getTikaParser()} and converts the collected
     * metadata into a single {@link BibEntry}. If the entry has no title afterwards, the base name of
     * the file is used as title.
     *
     * @param filePath the file to import
     * @return a parser result containing the one extracted entry
     * @throws IOException if the file cannot be read or if Tika fails to parse it
     */
    @Override
    public ParserResult importDatabase(Path filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath.toFile())) {
            Parser parser = getTikaParser();
            Metadata metadata = new Metadata();
            // Only the metadata is evaluated. The no-arg BodyContentHandler caps the body at
            // 100k characters and aborts the parse with a SAXException beyond that; -1 disables the cap.
            BodyContentHandler handler = new BodyContentHandler(-1);

            // Registering the parser in the context lets it be reused for embedded documents.
            ParseContext parseContext = new ParseContext();
            parseContext.set(Parser.class, parser);

            parser.parse(inputStream, handler, metadata, parseContext);

            String fileName = FileUtil.getBaseName(filePath);
            BibEntry entry = extractMetadata(metadata);

            if (!entry.hasField(StandardField.TITLE)) {
                entry.setField(StandardField.TITLE, fileName);
            }

            return ParserResult.fromEntry(entry);
        } catch (SAXException | TikaException e) {
            throw new IOException("Error parsing file: " + filePath, e);
        }
    }

    /**
     * Supplies the Tika parser used by {@link #importDatabase(Path)}.
     *
     * @return a new {@link AutoDetectParser}; subclasses may return a more specific parser
     */
    protected Parser getTikaParser() {
        return new AutoDetectParser();
    }

    /**
     * Extracts common metadata from the given Tika metadata object and returns a {@link BibEntry}.
     * <p>
     * This function will add fields that are most standard and common across different file types. Inheritors are
     * recommended to override {@link TikaImporter#extractAdditionalMetadata(BibEntry, TikaMetadataParser)}
     * to process additional metadata that is specific to the file type they are importing.
     *
     * @param metadata the metadata filled by the Tika parser
     * @return a new entry of type {@link StandardEntryType#Misc} with title, author and, if present, creation date
     */
    protected final BibEntry extractMetadata(Metadata metadata) {
        TikaMetadataParser metadataParser = new TikaMetadataParser(metadata);

        BibEntry entry = new BibEntry(StandardEntryType.Misc)
                .withField(StandardField.TITLE, metadataParser.getDcTitle())
                .withField(StandardField.AUTHOR, TikaMetadataParser.formatBibtexAuthors(metadataParser.getDcCreators()));

        metadataParser.getDcTermsCreated().ifPresent(date -> TikaMetadataParser.addDateCreated(entry, date));

        extractAdditionalMetadata(entry, metadataParser);

        return entry;
    }

    /**
     * Extracts additional metadata that is specific to the file type being imported. Inheritors are allowed to mutate
     * the given {@link BibEntry} to add more fields or modify existing ones.
     * <p>
     * The default implementation does nothing.
     *
     * @param entry          the entry already populated with the common fields
     * @param metadataParser accessor for the Tika metadata of the imported file
     */
    protected void extractAdditionalMetadata(BibEntry entry, TikaMetadataParser metadataParser) {
        // Intentionally empty: hook for subclasses.
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.jabref.logic.importer.fileformat.books;

import java.io.BufferedReader;
import java.io.IOException;

import org.jabref.logic.importer.TikaImporter;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.logic.util.StandardFileType;

public class DjvuImporter extends TikaImporter {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some reference to the format would be nice.

/**
 * Checks whether the input starts like a DjVu file.
 * <p>
 * Up to 64 characters are inspected and the reader is rewound to where it was on entry.
 *
 * @param input the reader positioned at the start of the content
 * @return {@code true} if the inspected prefix starts with "AT&amp;TFORM" and contains "DJVU"
 * @throws IOException if reading from or rewinding the reader fails
 */
@Override
public boolean isRecognizedFormat(BufferedReader input) throws IOException {
    // DjVu files start with "AT&TFORM", followed by "DJVU" shortly afterwards.
    char[] buffer = new char[64];

    // reset() throws "Stream not marked" unless a mark was set beforehand.
    input.mark(buffer.length);
    int read = input.read(buffer, 0, buffer.length);
    input.reset();

    // read() reports -1 at end of stream; an empty input cannot be a DjVu file.
    if (read <= 0) {
        return false;
    }

    String header = new String(buffer, 0, read);
    return header.startsWith("AT&TFORM") && header.contains("DJVU");
}

/**
 * Returns the identifier of this importer, the fixed string "djvu".
 */
@Override
public String getId() {
return "djvu";
}

/**
 * Returns the name of this importer, the fixed string "DjVu".
 */
@Override
public String getName() {
return "DjVu";
}

/**
 * Returns the description of this importer; the text is obtained through {@code Localization.lang}.
 */
@Override
public String getDescription() {
return Localization.lang("Import DjVu files");
}

/**
 * Returns the file type associated with this importer, {@code StandardFileType.DJVU}.
 */
@Override
public FileType getFileType() {
return StandardFileType.DJVU;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.jabref.logic.importer.fileformat.books;

import java.io.BufferedReader;
import java.io.IOException;

import org.jabref.logic.importer.TikaImporter;
import org.jabref.logic.importer.util.Constants;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.logic.util.StandardFileType;

public class EpubImporter extends TikaImporter {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some reference to the format would be nice.

(Even if it is self-explanatory here, maybe some more non-obvious information can be found there.)

// Leading characters expected in an ePUB file: 0x50 0x4B ("PK") followed by 0x03 0x04.
// NOTE(review): this looks like the generic ZIP local-file-header signature, so other ZIP-based
// files would presumably match as well - confirm whether a stricter ePUB check is needed.
private static final char[] EPUB_HEADER_MAGIC_NUMBER = {0x50, 0x4b, 0x03, 0x04};

/**
 * Checks whether the input is recognized as ePUB by delegating to
 * {@code Constants.hasMagicNumber} with {@code EPUB_HEADER_MAGIC_NUMBER}.
 *
 * @param input the reader to inspect
 * @return the result of the magic-number check
 * @throws IOException declared by the overridden method; presumably raised when reading fails - verify against {@code Constants.hasMagicNumber}
 */
@Override
public boolean isRecognizedFormat(BufferedReader input) throws IOException {
return Constants.hasMagicNumber(input, EPUB_HEADER_MAGIC_NUMBER);
}

/**
 * Returns the identifier of this importer, the fixed string "epub".
 */
@Override
public String getId() {
return "epub";
}

/**
 * Returns the name of this importer, the fixed string "ePUB".
 */
@Override
public String getName() {
return "ePUB";
}

/**
 * Returns the description of this importer; the text is obtained through {@code Localization.lang}.
 */
@Override
public String getDescription() {
return Localization.lang("Import the popular e-book file format ePUB");
}

/**
 * Returns the file type associated with this importer, {@code StandardFileType.EPUB}.
 */
@Override
public FileType getFileType() {
return StandardFileType.EPUB;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package org.jabref.logic.importer.fileformat.books;

import java.io.BufferedReader;
import java.io.IOException;

import org.jabref.logic.importer.TikaImporter;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.logic.util.StandardFileType;

/**
 * Importer for FictionBook 2 (FB2) e-books. Metadata extraction is inherited from {@link TikaImporter};
 * this class only contributes the format detection and the descriptive properties.
 */
public class Fb2Importer extends TikaImporter {
    /** Name of the root element of an FB2 document. */
    private static final String ROOT_ELEMENT = "FictionBook";

    /** XML namespace declared on the root element of an FB2 document. */
    private static final String NAMESPACE = "http://www.gribuser.ru/xml/fictionbook/2.0";

    /**
     * Checks whether the input looks like an FB2 document.
     * <p>
     * A line mentioning both the {@code FictionBook} element and the FB2 namespace is sufficient. The XML
     * declaration is deliberately not required on that same line, because it is commonly written on a
     * line of its own before the root element.
     *
     * @param input the reader to scan line by line
     * @return {@code true} if any line contains both the root element name and the FB2 namespace
     * @throws IOException if reading from the reader fails
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader input) throws IOException {
        try {
            return input.lines()
                        .anyMatch(line -> line.contains(ROOT_ELEMENT) && line.contains(NAMESPACE));
        } catch (java.io.UncheckedIOException e) {
            // lines() wraps read failures; unwrap so callers see the declared IOException.
            throw e.getCause();
        }
    }

    /**
     * Returns the identifier of this importer, the fixed string "fb2".
     */
    @Override
    public String getId() {
        return "fb2";
    }

    /**
     * Returns the name of this importer, the fixed string "FB2".
     */
    @Override
    public String getName() {
        return "FB2";
    }

    /**
     * Returns the description of this importer; the text is obtained through {@code Localization.lang}.
     */
    @Override
    public String getDescription() {
        return Localization.lang("Importer for Fiction Books (FB2) files");
    }

    /**
     * Returns the file type associated with this importer, {@code StandardFileType.FB2}.
     */
    @Override
    public FileType getFileType() {
        return StandardFileType.FB2;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package org.jabref.logic.importer.fileformat.img;

import java.io.BufferedReader;
import java.io.IOException;

import org.jabref.logic.importer.TikaImporter;
import org.jabref.logic.importer.util.Constants;
import org.jabref.logic.importer.util.TikaMetadataParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.logic.util.StandardFileType;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.types.BiblatexNonStandardTypes;
import org.jabref.model.entry.types.EntryType;

/**
 * Importer for JPG images. Metadata extraction is inherited from {@link TikaImporter}; the resulting
 * entry is re-typed as an image.
 */
public class JpgImporter extends TikaImporter {
    // Leading characters expected in a JPG file: 0xFF 0xD8 0xFF.
    // NOTE(review): the check runs on decoded characters, not raw bytes - confirm that the reader's
    // charset maps the bytes 0xFF/0xD8 to these char values.
    private static final char[] JPG_HEADER_MAGIC_NUMBER = {(char) 0xFF, (char) 0xD8, (char) 0xFF};

    /**
     * Checks whether the input is recognized as JPG by delegating to {@code Constants.hasMagicNumber}.
     *
     * @param input the reader to inspect
     * @return the result of the magic-number check
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader input) throws IOException {
        return Constants.hasMagicNumber(input, JPG_HEADER_MAGIC_NUMBER);
    }

    /**
     * Returns the identifier of this importer, the fixed string "jpg".
     */
    @Override
    public String getId() {
        return "jpg";
    }

    /**
     * Returns the name of this importer, the fixed string "JPG".
     */
    @Override
    public String getName() {
        return "JPG";
    }

    /**
     * Returns the description of this importer; the text is obtained through {@code Localization.lang}.
     */
    @Override
    public String getDescription() {
        return Localization.lang("JPG image importer");
    }

    /**
     * Returns the file type associated with this importer, {@code StandardFileType.JPG}.
     */
    @Override
    public FileType getFileType() {
        return StandardFileType.JPG;
    }

    /**
     * Marks the imported entry as an image by setting its type to {@code BiblatexNonStandardTypes.Image}.
     * The metadata parser is not consulted.
     */
    @Override
    protected void extractAdditionalMetadata(BibEntry entry, TikaMetadataParser metadataParser) {
        entry.setType(BiblatexNonStandardTypes.Image);
    }
}
Loading
Loading