Skip to content

Add more file importers to JabRef #13310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions jablib/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,15 @@ dependencies {
exclude(group = "org.apache.xmlgraphics")
}

// region for document importing
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rework with the new scheme.

Put the concrete versions at https://github.com/JabRef/jabref/blob/main/versions/build.gradle.kts

// Apache Tika core plus the standard parser bundle, both pinned to version 3.2.0.
// The commons-logging group is excluded from each of the two dependencies.
// NOTE(review): presumably excluded to avoid a conflicting logging binding - confirm.
implementation("org.apache.tika:tika-core:3.2.0") {
exclude(group = "commons-logging")
}
implementation("org.apache.tika:tika-parsers-standard-package:3.2.0") {
exclude(group = "commons-logging")
}
// endregion

// Even if "compileOnly" is used, IntelliJ always adds the dependency to module-info.java. To avoid issues during committing, we use "implementation" instead of "compileOnly".
implementation("io.github.adr:e-adr:2.0.0-SNAPSHOT")

Expand Down
1 change: 1 addition & 0 deletions jablib/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -252,5 +252,6 @@
requires mslinks;
requires org.antlr.antlr4.runtime;
requires org.libreoffice.uno;
requires org.apache.tika.core;
// endregion
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ public static ParserResult fromErrorMessage(String message) {
return parserResult;
}

/**
 * Creates a parser result that contains exactly one entry.
 *
 * @param entry the single entry to wrap; must not be {@code null} because it is placed in an immutable {@link List#of(Object)} list
 * @return a new {@link ParserResult} holding only the given entry
 */
public static ParserResult fromEntry(BibEntry entry) {
    List<BibEntry> singleEntry = List.of(entry);
    return new ParserResult(singleEntry);
}

private static String getErrorMessage(Exception exception) {
String errorMessage = exception.getLocalizedMessage();
if (exception.getCause() != null) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package org.jabref.logic.importer.fileformat.docs;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.time.ZoneId;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Optional;

import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.util.Constants;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

/**
 * General importer for Open Document Format files.
 * <p>
 * The document is handed to Apache Tika and a single {@link BibEntry} is built from the
 * extracted metadata (title, contributors and creation year). Only the file-based
 * {@link #importDatabase(Path)} entry point is supported.
 */
public abstract class OdfImporter extends Importer {
    /**
     * Checks whether the input starts with the ZIP magic number.
     * <p>
     * Note that this accepts any ZIP container, not only ODF documents.
     *
     * @param input reader positioned at the start of the file
     * @return {@code true} if the first characters match the ZIP header
     * @throws IOException if reading from {@code input} fails
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader input) throws IOException {
        return Constants.isZip(input);
    }

    /**
     * Not supported: ODF files are binary containers and must be imported via {@link #importDatabase(Path)}.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public ParserResult importDatabase(BufferedReader input) throws IOException {
        throw new UnsupportedOperationException("OdfImporter (and descendants) does not support importDatabase(BufferedReader reader). "
                + "Instead use importDatabase(Path filePath).");
    }

    /**
     * Parses the given file with Tika and converts its metadata into a single entry.
     *
     * @param filePath path of the document to import
     * @return a parser result containing exactly one entry built from the document metadata
     * @throws IOException if the file cannot be read or Tika fails to parse it (the cause is preserved)
     */
    @Override
    public ParserResult importDatabase(Path filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath.toFile())) {
            AutoDetectParser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            // -1 disables the default 100,000 character write limit; with the limit in place,
            // larger documents abort parsing with a SAXException although only metadata is needed.
            BodyContentHandler handler = new BodyContentHandler(-1);

            parser.parse(inputStream, handler, metadata);

            BibEntry entry = extractMetadata(metadata);

            return ParserResult.fromEntry(entry);
        } catch (SAXException | TikaException e) {
            throw new IOException("Error parsing file: " + filePath, e);
        }
    }

    /**
     * Maps Tika metadata onto a new {@link BibEntry}. Fields without a value are left unset.
     */
    private BibEntry extractMetadata(Metadata metadata) {
        Optional<String> title = Optional.ofNullable(metadata.get("dc:title"));
        Optional<Date> date = Optional.ofNullable(metadata.getDate(Property.internalDate("dcterms:created")));

        // NOTE(review): Tika usually exposes the document author as "dc:creator" - confirm that "dc:contributor" is intended.
        List<String> authors = Arrays.asList(metadata.getValues("dc:contributor"));

        // Date#getYear is deprecated and returns "year - 1900"; convert through java.time to obtain the calendar year.
        Optional<String> year = date
                .map(created -> created.toInstant().atZone(ZoneId.systemDefault()).getYear())
                .map(String::valueOf);

        return new BibEntry()
                .withField(StandardField.TITLE, title)
                .withField(StandardField.AUTHOR, !authors.isEmpty() ? Optional.of(String.join(" and ", authors)) : Optional.empty())
                .withField(StandardField.YEAR, year);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.jabref.logic.importer.fileformat.docs;

import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.logic.util.StandardFileType;

/**
 * {@link OdfImporter} specialization for OpenDocument Impress presentations ({@code .odp}).
 */
public class OdpImporter extends OdfImporter {
    /** Identifier returned by {@link #getId()}. */
    private static final String ID = "odp";

    /** Display name returned by {@link #getName()}. */
    private static final String NAME = "OpenDocument Impress";

    @Override
    public FileType getFileType() {
        return StandardFileType.ODP;
    }

    @Override
    public String getId() {
        return ID;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getDescription() {
        // Looked up on every call rather than cached in a constant.
        return Localization.lang("Importer for OpenDocument Impress (ODP) files");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.jabref.logic.importer.fileformat.docs;

import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.logic.util.StandardFileType;

/**
 * {@link OdfImporter} specialization for OpenDocument Calc spreadsheets ({@code .ods}).
 */
public class OdsImporter extends OdfImporter {
    /** Identifier returned by {@link #getId()}. */
    private static final String ID = "ods";

    /** Display name returned by {@link #getName()}. */
    private static final String NAME = "OpenDocument Calc";

    @Override
    public FileType getFileType() {
        return StandardFileType.ODS;
    }

    @Override
    public String getId() {
        return ID;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getDescription() {
        // Looked up on every call rather than cached in a constant.
        return Localization.lang("Importer for OpenDocument Calc (ODS) files");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.jabref.logic.importer.fileformat.docs;

import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.FileType;
import org.jabref.logic.util.StandardFileType;

/**
 * {@link OdfImporter} specialization for OpenDocument Text documents ({@code .odt}).
 */
public class OdtImporter extends OdfImporter {
    /** Identifier returned by {@link #getId()}. */
    private static final String ID = "odt";

    /** Display name returned by {@link #getName()}. */
    private static final String NAME = "OpenDocument Writer";

    @Override
    public FileType getFileType() {
        return StandardFileType.ODT;
    }

    @Override
    public String getId() {
        return ID;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getDescription() {
        // Looked up on every call rather than cached in a constant.
        return Localization.lang("Importer for OpenDocument Writer (ODT) files");
    }
}
30 changes: 30 additions & 0 deletions jablib/src/main/java/org/jabref/logic/importer/util/Constants.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.jabref.logic.importer.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

/**
 * Shared constants and helpers for importers that work on ZIP-based container formats.
 */
public final class Constants {
    /** The four characters {@code 'P', 'K', 0x03, 0x04} that {@link #isZip(BufferedReader)} expects at the start of a ZIP file. */
    public static final char[] ZIP_HEADER_MAGIC_NUMBER = {0x50, 0x4b, 0x03, 0x04};

    /** File name extensions (including the leading dot) of formats that are ZIP containers. */
    public static final List<String> ZIP_FILES_EXTENSIONS = List.of(
            ".ctv6bak",
            ".zip",
            ".epub",
            ".odt",
            ".docx",
            ".xlsx",
            ".pptx",
            ".ods",
            ".odp"
    );

    /** Namespace URI of the Dublin Core element set, version 1.1. */
    public static final String DC_NAMESPACE = "http://purl.org/dc/elements/1.1/";

    private Constants() {
        // Utility class, not meant to be instantiated.
    }

    /**
     * Checks whether the input starts with the ZIP magic number.
     * <p>
     * Up to {@code ZIP_HEADER_MAGIC_NUMBER.length} characters are consumed from the reader.
     *
     * @param input reader positioned at the start of the content to inspect
     * @return {@code true} if the first four characters equal {@link #ZIP_HEADER_MAGIC_NUMBER};
     *         {@code false} if they differ or the input ends before four characters were read
     * @throws IOException if reading from {@code input} fails
     */
    public static boolean isZip(BufferedReader input) throws IOException {
        char[] header = new char[ZIP_HEADER_MAGIC_NUMBER.length];
        int filled = 0;
        // A single read() may legally return fewer characters than requested even though more
        // data follows, so keep reading until the header is complete or the stream ends.
        while (filled < header.length) {
            int nRead = input.read(header, filled, header.length - filled);
            if (nRead < 0) {
                return false;
            }
            filled += nRead;
        }
        return Arrays.equals(header, ZIP_HEADER_MAGIC_NUMBER);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ public enum StandardFileType implements FileType {
MARKDOWN("Markdown", "md"),
MEDLINE("Medline", "nbib", "xml"),
MEDLINE_PLAIN("Medline Plain", "nbib", "txt"),
ODP("OpenOffice Impress", "odp"),
ODS("OpenOffice Calc", "ods"),
ODT("OpenOffice Writer", "odt"),
PDF("PDF", "pdf"),
PUBMED("Pubmed", "fcgi"),
RDF("RDF", "rdf"),
Expand Down
8 changes: 8 additions & 0 deletions jablib/src/main/java/org/jabref/model/entry/BibEntry.java
Original file line number Diff line number Diff line change
Expand Up @@ -969,6 +969,14 @@ public BibEntry withField(Field field, String value) {
return this;
}

/**
 * Sets the given field when a value is present and leaves the entry untouched otherwise.
 * After a value has been set, the changed flag is reset via {@code setChanged(false)}.
 *
 * @param field the field to set
 * @param value the value to store, or an empty Optional to skip the assignment
 * @return this entry, to allow call chaining
 */
public BibEntry withField(Field field, Optional<String> value) {
    if (value.isPresent()) {
        setField(field, value.get());
        this.setChanged(false);
    }
    return this;
}

/**
* A copy is made of the parameter
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package org.jabref.logic.importer.fileformat.docs;

import java.io.IOException;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Stream;

import org.jabref.logic.importer.ImportException;
import org.jabref.logic.importer.fileformat.ImporterTestEngine;
import org.jabref.logic.importer.util.Constants;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

/**
 * File-driven tests for {@link OdpImporter}: format recognition and import of the
 * {@code OdpImporterTest*.odp} test files, plus rejection of unrelated files.
 */
public class OdpImporterFilesTest {
    private static final String FILE_ENDING = ".odp";
    private static final String TEST_FILE_PREFIX = "OdpImporterTest";

    /** All known ZIP-based extensions except the one under test. */
    private static final List<String> EXCLUDE_EXTENSIONS = Constants.ZIP_FILES_EXTENSIONS
            .stream()
            .filter(extension -> !extension.equals(FILE_ENDING))
            .toList();

    private OdpImporter importer;

    @BeforeEach
    void setUp() {
        importer = new OdpImporter();
    }

    /** A file belongs to this test when it carries the test prefix and the ODP ending. */
    private static boolean isOdpTestFile(String name) {
        return name.startsWith(TEST_FILE_PREFIX) && name.endsWith(FILE_ENDING);
    }

    /** A file must be rejected when it is not one of our test files and has none of the excluded extensions. */
    private static boolean isUnrelatedFile(String name) {
        if (name.startsWith(TEST_FILE_PREFIX)) {
            return false;
        }
        for (String extension : EXCLUDE_EXTENSIONS) {
            if (name.endsWith(extension)) {
                return false;
            }
        }
        return true;
    }

    private static Stream<String> fileNames() throws IOException {
        Predicate<String> matcher = OdpImporterFilesTest::isOdpTestFile;
        return ImporterTestEngine.getTestFiles(matcher).stream();
    }

    private static Stream<String> invalidFileNames() throws IOException {
        Predicate<String> matcher = OdpImporterFilesTest::isUnrelatedFile;
        return ImporterTestEngine.getTestFiles(matcher).stream();
    }

    @ParameterizedTest
    @MethodSource("fileNames")
    void isRecognizedFormat(String fileName) throws IOException {
        ImporterTestEngine.testIsRecognizedFormat(importer, fileName);
    }

    @ParameterizedTest
    @MethodSource("invalidFileNames")
    void isNotRecognizedFormat(String fileName) throws IOException {
        ImporterTestEngine.testIsNotRecognizedFormat(importer, fileName);
    }

    @ParameterizedTest
    @MethodSource("fileNames")
    void importEntries(String fileName) throws ImportException, IOException {
        ImporterTestEngine.testImportEntries(importer, fileName, FILE_ENDING);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package org.jabref.logic.importer.fileformat.docs;

import org.jabref.logic.util.StandardFileType;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Checks the static descriptors (name, id, file type) reported by {@link OdpImporter}.
 */
class OdpImporterTest {
    private OdpImporter odpImporter;

    @BeforeEach
    void setUp() {
        odpImporter = new OdpImporter();
    }

    @Test
    void getFormatName() {
        String actualName = odpImporter.getName();
        assertEquals("OpenDocument Impress", actualName);
    }

    @Test
    void getCLIId() {
        String actualId = odpImporter.getId();
        assertEquals("odp", actualId);
    }

    @Test
    void sGetExtensions() {
        // Compared against the enum constant, so the expected value stays of type StandardFileType.
        StandardFileType expectedType = StandardFileType.ODP;
        assertEquals(expectedType, odpImporter.getFileType());
    }
}
Loading
Loading