JabRef · calixtus · Jul 14, 2021 · Aug 2, 2016 · Aug 2, 2016 · Aug 4, 2016
diff --git a/build.gradle b/build.gradle
@@ -141,6 +141,12 @@ dependencies {
     compile group: 'com.microsoft.azure', name: 'applicationinsights-core', version: '1.0.+'
     compile group: 'com.microsoft.azure', name: 'applicationinsights-logging-log4j2', version: '1.0.+'
 
+    compile 'org.apache.lucene:lucene-core:6.5.1'
+    compile 'org.apache.lucene:lucene-queryparser:6.5.1'
+    compile 'org.apache.lucene:lucene-queries:6.5.1'
+    compile 'org.apache.lucene:lucene-analyzers-common:6.5.1'
+
+
     testCompile 'junit:junit:4.12'
     testCompile 'org.mockito:mockito-core:2.7.22'
     testCompile 'com.github.tomakehurst:wiremock:2.6.0'
@@ -308,26 +314,26 @@ shadowJar {
 
         // this is an adapter required for generating a fat jar with correct log4j2 output
 
-        com.github.edwgiz.mavenShadePlugin.log4j2CacheTransformer.PluginsCacheFileTransformer target = new com.github.edwgiz.mavenShadePlugin.log4j2CacheTransformer.PluginsCacheFileTransformer();
+        com.github.edwgiz.mavenShadePlugin.log4j2CacheTransformer.PluginsCacheFileTransformer target = new com.github.edwgiz.mavenShadePlugin.log4j2CacheTransformer.PluginsCacheFileTransformer()
 
         @Override
         boolean canTransformResource(FileTreeElement element) {
-            return target.canTransformResource(element.getPath());
+            return target.canTransformResource(element.getPath())
         }
 
         @Override
         void transform(String path, InputStream is, List<com.github.jengelman.gradle.plugins.shadow.relocation.Relocator> relocators) {
-            target.processResource(path, is, relocators);
+            target.processResource(path, is, relocators)
         }
 
         @Override
         boolean hasTransformedResource() {
-            return target.hasTransformedResource();
+            return target.hasTransformedResource()
         }
 
         @Override
         void modifyOutputStream(org.apache.tools.zip.ZipOutputStream jos) {
-            target.modifyOutputStream(jos);
+            target.modifyOutputStream(jos)
         }
     })
 }
@@ -356,7 +362,7 @@ if (hasProperty('dev')) {
     // In the context of github, the branch name could be something like "pull/277"
     // "/" is an illegal character. To be safe, all illegal filename characters are replaced by "_"
     // http://stackoverflow.com/a/15075907/873282 describes the used pattern.
-    branchName = branchName.trim().replaceAll("[^a-zA-Z0-9.-]", "_");
+    branchName = branchName.trim().replaceAll("[^a-zA-Z0-9.-]", "_")
 
     // hack string
     // first the date (%cd), then the branch name, and finally the commit id (%h)

diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
-#Thu Apr 13 18:56:36 CEST 2017
+#Fri May 12 13:26:57 CEST 2017
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.5-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-3.5-all.zip
diff --git a/src/main/java/org/jabref/logic/pdf/search/indexing/DocumentReader.java b/src/main/java/org/jabref/logic/pdf/search/indexing/DocumentReader.java
@@ -0,0 +1,94 @@
+package org.jabref.logic.pdf.search.indexing;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.FieldName;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.util.PDFTextStripper;
+
+import static org.jabref.model.pdf.search.SearchFieldConstants.AUTHOR;
+import static org.jabref.model.pdf.search.SearchFieldConstants.CONTENT;
+import static org.jabref.model.pdf.search.SearchFieldConstants.CREATOR;
+import static org.jabref.model.pdf.search.SearchFieldConstants.KEY;
+import static org.jabref.model.pdf.search.SearchFieldConstants.KEYWORDS;
+import static org.jabref.model.pdf.search.SearchFieldConstants.SUBJECT;
+
+public final class DocumentReader {
+
+    private final BibEntry entry;
+    private final PDFTextStripper pdfTextStripper = new PDFTextStripper();
+
+    public DocumentReader(BibEntry bibEntry) throws IOException {
+        if (!bibEntry.getField(FieldName.FILE).isPresent()) {
+            throw new IllegalArgumentException("The file field must not be absent when trying to reading the document!");
+        }
+
+        this.entry = bibEntry;
+        pdfTextStripper.setLineSeparator("\n");
+    }
+
+    /**
+     * Reads the content and metadata from a pdf file
+     */
+    public Document readPdfContents() throws IOException {
+        Path pdfPath = Paths.get(this.entry.getField(FieldName.FILE).get());
+
+        try (PDDocument pdfDocument = PDDocument.load(pdfPath.toFile())) {
+            Document newDocument = new Document();
+            addKeyIfPresent(newDocument);
+            addContentIfNotEmpty(pdfDocument, newDocument);
+            addMetaData(pdfDocument, newDocument);
+            return newDocument;
+        } catch (IOException e) {
+            throw new IOException("Could not read pdf file: " + pdfPath + "!", e);
+        }
+    }
+
+    private void addMetaData(PDDocument pdfDocument, Document newDocument) {
+        PDDocumentInformation info = pdfDocument.getDocumentInformation();
+        addStringField(newDocument, AUTHOR, info.getAuthor());
+        addStringField(newDocument, CREATOR, info.getCreator());
+        addStringField(newDocument, SUBJECT, info.getSubject());
+        addTextField(newDocument, KEYWORDS, info.getKeywords());
+    }
+
+    private void addTextField(Document newDocument, String field, String value) {
+        if (!isValidField(value)) {
+            return;
+        }
+        newDocument.add(new TextField(field, value, Field.Store.YES));
+    }
+
+    private void addStringField(Document newDocument, String field, String value) {
+        if (!isValidField(value)) {
+            return;
+        }
+        newDocument.add(new StringField(field, value, Field.Store.YES));
+    }
+
+    private boolean isValidField(String value) {
+        return !(value == null || value.trim().isEmpty());
+    }
+
+    private void addContentIfNotEmpty(PDDocument pdfDocument, Document newDocument) throws IOException {
+        String pdfContent = pdfTextStripper.getText(pdfDocument);
+        if (!pdfContent.trim().isEmpty()) {
+            newDocument.add(new TextField(CONTENT, pdfContent, Field.Store.YES));
+        }
+    }
+
+    private void addKeyIfPresent(Document newDocument) {
+        if (this.entry.getCiteKeyOptional().isPresent()) {
+            newDocument.add(new StringField(KEY, this.entry.getCiteKeyOptional().get(), Field.Store.YES));
+        }
+    }
+}
diff --git a/src/main/java/org/jabref/logic/pdf/search/indexing/EnglishStemAnalyzer.java b/src/main/java/org/jabref/logic/pdf/search/indexing/EnglishStemAnalyzer.java
@@ -0,0 +1,27 @@
+package org.jabref.logic.pdf.search.indexing;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.core.StopAnalyzer;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Lucene {@link Analyzer} for English text: tokenizes the input, lower-cases it, drops English
+ * stop words, passes it through the decimal digit filter and finally applies Porter stemming.
+ */
+public class EnglishStemAnalyzer extends Analyzer {
+
+    /**
+     * Builds the analysis chain. The same chain is used for every field.
+     *
+     * @param fieldName the name of the field to analyze; not used
+     * @return the tokenizer together with the last filter of the chain
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new StandardTokenizer();
+        TokenStream standardized = new StandardFilter(tokenizer);
+        // Lower-casing comes first: the stop word set and the stemmer work on lower-case tokens
+        TokenStream lowerCased = new LowerCaseFilter(standardized);
+        TokenStream withoutStopWords = new StopFilter(lowerCased, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        TokenStream normalizedDigits = new DecimalDigitFilter(withoutStopWords);
+        TokenStream stemmed = new PorterStemFilter(normalizedDigits);
+        return new TokenStreamComponents(tokenizer, stemmed);
+    }
+}
+
diff --git a/src/main/java/org/jabref/logic/pdf/search/indexing/PdfIndexer.java b/src/main/java/org/jabref/logic/pdf/search/indexing/PdfIndexer.java
@@ -0,0 +1,90 @@
+package org.jabref.logic.pdf.search.indexing;
+
+import java.io.IOException;
+import java.nio.file.Paths;
+
+import org.jabref.model.database.BibDatabase;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.FieldName;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.SimpleFSDirectory;
+
+
+/**
+ * Indexes the text of pdf files and adds it into the lucene index.
+ * <p>
+ * I/O failures while opening, writing or closing the index are logged and not propagated to the
+ * caller.
+ */
+public class PdfIndexer {
+    private static final Log LOGGER = LogFactory.getLog(PdfIndexer.class);
+
+    private final Directory directoryToIndex;
+
+    /**
+     * Opens the index directory. The path is relative and therefore resolved against the current
+     * working directory.
+     *
+     * @throws IOException if the directory cannot be opened
+     */
+    public PdfIndexer() throws IOException {
+        this.directoryToIndex = new SimpleFSDirectory(Paths.get("src/main/resources/luceneIndex"));
+    }
+
+    /**
+     * @return the directory the index is written to
+     */
+    public Directory getIndexDirectory() {
+        return this.directoryToIndex;
+    }
+
+    /**
+     * Adds all PDF files linked to an entry in the database to new Lucene search index.
+     * An already existing index is replaced. Entries without a file field or without a cite key
+     * are skipped.
+     *
+     * @param database a bibtex database to link the pdf files to
+     */
+    public void createIndex(BibDatabase database) {
+        try (IndexWriter indexWriter = openWriter(IndexWriterConfig.OpenMode.CREATE)) {
+            database.getEntries().stream()
+                    .filter(PdfIndexer::isIndexable)
+                    .forEach(entry -> writeToIndex(entry, indexWriter));
+        } catch (IOException e) {
+            // Pass the exception itself so that type and stack trace are not lost
+            LOGGER.warn("Could not create the PDF search index.", e);
+        }
+    }
+
+    /**
+     * Adds all the pdf files linked to one entry in the database to an existing (or new) Lucene search index.
+     * An entry without a file field or without a cite key is ignored.
+     *
+     * @param entry a bibtex entry to link the pdf files to
+     */
+    public void addToIndex(BibEntry entry) {
+        try (IndexWriter indexWriter = openWriter(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)) {
+            if (isIndexable(entry)) {
+                writeToIndex(entry, indexWriter);
+            }
+        } catch (IOException e) {
+            LOGGER.warn("Could not add the entry to the PDF search index.", e);
+        }
+    }
+
+    /**
+     * Deletes all entries from the Lucene search index.
+     */
+    public void flushIndex() {
+        IndexWriterConfig config = new IndexWriterConfig();
+        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+        try (IndexWriter ignored = new IndexWriter(directoryToIndex, config)) {
+            // Nothing to write: opening the writer in CREATE mode replaces the existing index,
+            // closing it immediately leaves an empty one behind.
+        } catch (IOException e) {
+            LOGGER.warn("Could not flush the PDF search index.", e);
+        }
+    }
+
+    /** Opens a writer on the index directory that analyzes text with the {@link EnglishStemAnalyzer}. */
+    private IndexWriter openWriter(IndexWriterConfig.OpenMode openMode) throws IOException {
+        IndexWriterConfig config = new IndexWriterConfig(new EnglishStemAnalyzer()).setOpenMode(openMode);
+        return new IndexWriter(directoryToIndex, config);
+    }
+
+    /** Returns true if the entry has both a file field and a cite key. */
+    private static boolean isIndexable(BibEntry entry) {
+        return entry.hasField(FieldName.FILE) && entry.getCiteKeyOptional().isPresent();
+    }
+
+    /**
+     * Reads the pdf linked to the entry and adds it to the index. A document that cannot be read
+     * or written is skipped, so that one broken file does not abort the indexing of the others.
+     */
+    private void writeToIndex(BibEntry entry, IndexWriter indexWriter) {
+        try {
+            indexWriter.addDocument(new DocumentReader(entry).readPdfContents());
+        } catch (IOException e) {
+            LOGGER.debug("Document could not be added to the index.", e);
+        }
+    }
+}
diff --git a/src/main/java/org/jabref/logic/pdf/search/retrieval/PdfSearcher.java b/src/main/java/org/jabref/logic/pdf/search/retrieval/PdfSearcher.java
@@ -0,0 +1,64 @@
+package org.jabref.logic.pdf.search.retrieval;
+
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Objects;
+
+import org.jabref.logic.pdf.search.indexing.EnglishStemAnalyzer;
+import org.jabref.model.pdf.search.PdfSearchResults;
+import org.jabref.model.pdf.search.SearchResult;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.SimpleFSDirectory;
+
+import static org.jabref.model.pdf.search.SearchFieldConstants.PDF_FIELDS;
+
+/**
+ * Runs queries against the Lucene index of the pdf files.
+ */
+public final class PdfSearcher {
+    private static final Log LOGGER = LogFactory.getLog(PdfSearcher.class);
+
+    private final Directory indexDirectory;
+
+    /**
+     * Opens the index directory. The path is relative and therefore resolved against the current
+     * working directory.
+     *
+     * @throws IOException if the directory cannot be opened
+     */
+    public PdfSearcher() throws IOException {
+        this.indexDirectory = new SimpleFSDirectory(Paths.get("src/main/resources/luceneIndex"));
+    }
+
+    /**
+     * Search for results matching a query in the Lucene search index
+     *
+     * @param searchString a pattern to search for matching entries in the index, must not be null
+     * @param maxHits      number of maximum search results, must be positive
+     * @return a result set of all documents that have matches in any fields; empty if the search
+     *         string is empty or cannot be parsed as a query
+     * @throws NullPointerException     if the search string is null
+     * @throws IllegalArgumentException if maxHits is not positive (only checked for a non-empty
+     *                                  search string)
+     * @throws IOException              if the index cannot be opened or read
+     */
+    public PdfSearchResults search(String searchString, int maxHits) throws IOException {
+        if (Objects.requireNonNull(searchString, "The search string was null!").isEmpty()) {
+            return new PdfSearchResults();
+        }
+        if (maxHits <= 0) {
+            throw new IllegalArgumentException("Must be called with at least 1 maxHits, was " + maxHits);
+        }
+
+        // The reader holds open files of the index and has to be closed after every search.
+        // NOTE(review): this assumes that SearchResult reads everything it needs from the searcher
+        // inside its constructor - confirm before relying on lazily loaded result fields.
+        try (DirectoryReader reader = DirectoryReader.open(indexDirectory)) {
+            IndexSearcher searcher = new IndexSearcher(reader);
+            Query query = new MultiFieldQueryParser(PDF_FIELDS, new EnglishStemAnalyzer()).parse(searchString);
+
+            List<SearchResult> resultDocs = new LinkedList<>();
+            for (ScoreDoc scoreDoc : searcher.search(query, maxHits).scoreDocs) {
+                resultDocs.add(new SearchResult(searcher, scoreDoc));
+            }
+            return new PdfSearchResults(resultDocs);
+        } catch (ParseException e) {
+            // An unparsable query is treated like a query without any hit
+            LOGGER.warn("Could not parse query: '" + searchString + "'! \n" + e.getMessage(), e);
+            return new PdfSearchResults();
+        }
+    }
+}
diff --git a/src/main/java/org/jabref/model/pdf/search/PdfSearchResults.java b/src/main/java/org/jabref/model/pdf/search/PdfSearchResults.java
@@ -0,0 +1,45 @@
+package org.jabref.model.pdf.search;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Immutable collection of the results of a search in the pdf index.
+ */
+public final class PdfSearchResults {
+
+    private final List<SearchResult> searchResults;
+
+    /**
+     * Creates a result set from the given results.
+     *
+     * @param search the results in the order they should be kept in, must not be null
+     */
+    public PdfSearchResults(List<SearchResult> search) {
+        // Copy the list: an unmodifiable view alone would still reflect later changes made by
+        // the caller to the list that was passed in.
+        this.searchResults = Collections.unmodifiableList(new ArrayList<>(search));
+    }
+
+    /**
+     * Creates an empty result set.
+     */
+    public PdfSearchResults() {
+        this.searchResults = Collections.emptyList();
+    }
+
+    /**
+     * Returns the results ordered by ascending Lucene score, i.e. the lowest score first.
+     * NOTE(review): ascending order puts the best match last - confirm that this is intended.
+     *
+     * @return a new unmodifiable list; the order of {@link #getSearchResults()} is not affected
+     */
+    public List<SearchResult> getSortedByScore() {
+        List<SearchResult> sortedList = new ArrayList<>(searchResults);
+        // Double.compare defines a total order (also for NaN), as required for a comparator
+        sortedList.sort((first, second) -> Double.compare(first.getLuceneScore(), second.getLuceneScore()));
+        return Collections.unmodifiableList(sortedList);
+    }
+
+    private List<SearchResult> getSortedByAlphabet() {
+        //TODO implement sorting
+        throw new UnsupportedOperationException("Not implemented");
+    }
+
+    /**
+     * @return the results in their original order as an unmodifiable list
+     */
+    public List<SearchResult> getSearchResults() {
+        return this.searchResults;
+    }
+
+    /**
+     * @return the number of results
+     */
+    public int numSearchResults() {
+        return this.searchResults.size();
+    }
+}