-
-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Add more file importers to JabRef #13310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 13 commits
7e9f579
ed19d6b
a3eb9f3
6679f1e
5a3d35b
a27c7e7
06fbb35
8769276
34039bd
511e3c0
03cbaa3
a5c006b
6cc22ff
8a4c492
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,16 +106,14 @@ | |
exports org.jabref.logic.git; | ||
exports org.jabref.logic.pseudonymization; | ||
exports org.jabref.logic.citation.repository; | ||
|
||
requires java.base; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, really, no |
||
exports org.jabref.logic.importer.fileformat.misc; | ||
|
||
requires javafx.base; | ||
requires javafx.graphics; // because of javafx.scene.paint.Color | ||
requires afterburner.fx; | ||
requires com.tobiasdiez.easybind; | ||
|
||
// for java.awt.geom.Rectangle2D required by org.jabref.logic.pdf.TextExtractor | ||
requires java.desktop; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is |
||
|
||
// SQL | ||
requires java.sql; | ||
|
@@ -252,5 +250,7 @@ | |
requires mslinks; | ||
requires org.antlr.antlr4.runtime; | ||
requires org.libreoffice.uno; | ||
requires org.apache.tika.core; | ||
requires org.jetbrains.annotations; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please do not use JetBrains annotations. Use JSpecify instead - "howto" at https://www.jetbrains.com/help/idea/annotating-source-code.html#configure-nullability-annotations |
||
// endregion | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -49,6 +49,10 @@ public abstract class Importer implements Comparable<Importer> { | |||||
* @throws IOException Signals that an I/O exception has occurred. | ||||||
*/ | ||||||
public boolean isRecognizedFormat(Path filePath) throws IOException { | ||||||
if (!Files.exists(filePath) || !Files.isRegularFile(filePath)) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The latter includes the former, doesn't it?
Suggested change
|
||||||
return false; | ||||||
} | ||||||
|
||||||
try (BufferedReader bufferedReader = getReader(filePath)) { | ||||||
return isRecognizedFormat(bufferedReader); | ||||||
} | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
package org.jabref.logic.importer; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.nio.file.Path; | ||
|
||
import org.jabref.logic.importer.util.TikaMetadataParser; | ||
import org.jabref.logic.util.io.FileUtil; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
|
||
import org.apache.tika.exception.TikaException; | ||
import org.apache.tika.metadata.Metadata; | ||
import org.apache.tika.parser.AutoDetectParser; | ||
import org.apache.tika.parser.ParseContext; | ||
import org.apache.tika.parser.Parser; | ||
import org.apache.tika.sax.BodyContentHandler; | ||
import org.xml.sax.SAXException; | ||
|
||
/** | ||
* Common class for all file importers that use Apache Tika to extract metadata from files. | ||
* <p> | ||
* Child classes should implement the rest of {@link Importer} and method {@link #extractMetadata(TikaMetadataParser, String)} to extract the {@link BibEntry} from the Tika metadata. | ||
* <p> | ||
* In case you need to use a specific Tika parser, you can override {@link #getTikaParser()} to return a different parser instance. | ||
*/ | ||
public abstract class TikaImporter extends Importer { | ||
@Override | ||
public ParserResult importDatabase(BufferedReader input) throws IOException { | ||
throw new UnsupportedOperationException("TikaImporter (and descendants) do not support importDatabase(BufferedReader reader)." | ||
+ "Instead use importDatabase(Path filePath)."); | ||
} | ||
|
||
@Override | ||
public ParserResult importDatabase(Path filePath) throws IOException { | ||
try (InputStream inputStream = new FileInputStream(filePath.toFile())) { | ||
Parser parser = getTikaParser(); | ||
Metadata metadata = new Metadata(); | ||
BodyContentHandler handler = new BodyContentHandler(); | ||
|
||
ParseContext parseContext = new ParseContext(); | ||
parseContext.set(Parser.class, parser); | ||
|
||
parser.parse(inputStream, handler, metadata, parseContext); | ||
|
||
String fileName = FileUtil.getBaseName(filePath); | ||
BibEntry entry = extractMetadata(new TikaMetadataParser(metadata)); | ||
|
||
if (!entry.hasField(StandardField.TITLE)) { | ||
entry.setField(StandardField.TITLE, fileName); | ||
} | ||
|
||
return ParserResult.fromEntry(entry); | ||
} catch (SAXException | TikaException e) { | ||
throw new IOException("Error parsing file: " + filePath, e); | ||
} | ||
} | ||
|
||
protected Parser getTikaParser() { | ||
return new AutoDetectParser(); | ||
} | ||
|
||
/** | ||
* Extracts common metadata from the given Tika metadata object and returns a {@link BibEntry}. | ||
* <p> | ||
* This function will add fields that are most standard and common across different file types. Inheritors are | ||
* recommended to override {@link TikaImporter#extractAdditionalMetadata(BibEntry, TikaMetadataParser)} | ||
* process additional metadata that is specific to the file type they are importing. | ||
*/ | ||
protected final BibEntry extractMetadata(Metadata metadata) { | ||
TikaMetadataParser metadataParser = new TikaMetadataParser(metadata); | ||
|
||
BibEntry entry = new BibEntry(StandardEntryType.Misc) | ||
.withField(StandardField.TITLE, metadataParser.getDcTitle()) | ||
.withField(StandardField.AUTHOR, TikaMetadataParser.formatBibtexAuthors(metadataParser.getDcCreators())); | ||
|
||
metadataParser.getDcTermsCreated().ifPresent(date -> TikaMetadataParser.addDateCreated(entry, date)); | ||
|
||
extractAdditionalMetadata(entry, metadataParser); | ||
|
||
return entry; | ||
} | ||
|
||
/** | ||
* Extracts additional metadata that is specific to the file type being imported. Inheritors are allowed to mutate | ||
* the given {@link BibEntry} to add more fields or modify existing ones. | ||
*/ | ||
protected void extractAdditionalMetadata(BibEntry entry, TikaMetadataParser metadataParser) { | ||
|
||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package org.jabref.logic.importer.fileformat.books; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
|
||
import org.jabref.logic.importer.TikaImporter; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.util.FileType; | ||
import org.jabref.logic.util.StandardFileType; | ||
|
||
public class DjvuImporter extends TikaImporter { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some refernce to the format would be nice. |
||
@Override | ||
public boolean isRecognizedFormat(BufferedReader input) throws IOException { | ||
// DJVU start with "AT&TFORM" and then "DJVU" some time after that. | ||
|
||
char[] buffer = new char[64]; | ||
int read = input.read(buffer, 0, buffer.length); | ||
input.reset(); | ||
String header = new String(buffer, 0, read); | ||
return header.startsWith("AT&TFORM") && header.contains("DJVU"); | ||
} | ||
|
||
@Override | ||
public String getId() { | ||
return "djvu"; | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "DjVu"; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("Import DjVu files"); | ||
} | ||
|
||
@Override | ||
public FileType getFileType() { | ||
return StandardFileType.DJVU; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package org.jabref.logic.importer.fileformat.books; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
|
||
import org.jabref.logic.importer.TikaImporter; | ||
import org.jabref.logic.importer.util.Constants; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.util.FileType; | ||
import org.jabref.logic.util.StandardFileType; | ||
|
||
public class EpubImporter extends TikaImporter { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some refernce to the format would be nice. (Even if it self-explanatory here; but mayb e some more non-obvious onformatoin can be found there) |
||
private static final char[] EPUB_HEADER_MAGIC_NUMBER = {0x50, 0x4b, 0x03, 0x04}; | ||
|
||
@Override | ||
public boolean isRecognizedFormat(BufferedReader input) throws IOException { | ||
return Constants.hasMagicNumber(input, EPUB_HEADER_MAGIC_NUMBER); | ||
} | ||
|
||
@Override | ||
public String getId() { | ||
return "epub"; | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "ePUB"; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("Import the popular e-book file format ePUB"); | ||
} | ||
|
||
@Override | ||
public FileType getFileType() { | ||
return StandardFileType.EPUB; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
package org.jabref.logic.importer.fileformat.books; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
|
||
import org.jabref.logic.importer.TikaImporter; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.util.FileType; | ||
import org.jabref.logic.util.StandardFileType; | ||
|
||
public class Fb2Importer extends TikaImporter { | ||
@Override | ||
public boolean isRecognizedFormat(BufferedReader input) throws IOException { | ||
return input.lines() | ||
.map(String::trim) | ||
.anyMatch(line -> line.startsWith("<?xml") | ||
&& line.contains("FictionBook") | ||
&& line.contains("http://www.gribuser.ru/xml/fictionbook/2.0")); | ||
} | ||
|
||
@Override | ||
public String getId() { | ||
return "fb2"; | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "FB2"; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("Importer for Fiction Books (FB2) files"); | ||
} | ||
|
||
@Override | ||
public FileType getFileType() { | ||
return StandardFileType.FB2; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
package org.jabref.logic.importer.fileformat.img; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
|
||
import org.jabref.logic.importer.TikaImporter; | ||
import org.jabref.logic.importer.util.Constants; | ||
import org.jabref.logic.importer.util.TikaMetadataParser; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.util.FileType; | ||
import org.jabref.logic.util.StandardFileType; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.types.BiblatexNonStandardTypes; | ||
import org.jabref.model.entry.types.EntryType; | ||
|
||
public class JpgImporter extends TikaImporter { | ||
@Override | ||
public boolean isRecognizedFormat(BufferedReader input) throws IOException { | ||
return Constants.hasMagicNumber(input, new char[]{(char) 0xFF, (char) 0xD8, (char) 0xFF}); | ||
} | ||
|
||
@Override | ||
public String getId() { | ||
return "jpg"; | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "JPG"; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("JPG image importer"); | ||
} | ||
|
||
@Override | ||
public FileType getFileType() { | ||
return StandardFileType.JPG; | ||
} | ||
|
||
@Override | ||
protected void extractAdditionalMetadata(BibEntry entry, TikaMetadataParser metadataParser) { | ||
entry.setType(BiblatexNonStandardTypes.Image); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please rework with the new scheme.
Put the concrete versions at https://github.com/JabRef/jabref/blob/main/versions/build.gradle.kts