Merge pull request #532 from koppor/add-bibtex-accept

Add support for application/x-bibtex type
kermitt2 · Mar 9, 2020 · adeca65 · adeca65
2 parents 5a325e3 + 0b86f3f
commit adeca65
Show file tree

Hide file tree

Showing 20 changed files with 766 additions and 469 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -17,6 +17,12 @@ indent_size=2
 indent_style=space
 indent_size=2
 
+[*.md]
+insert_final_newline=true
+
 [{*.yml,*.yaml}]
 indent_style=space
 indent_size=2
+
+[GrobidRestProcessString.java]
+indent_style=tab
diff --git a/build.gradle b/build.gradle
@@ -5,12 +5,16 @@ buildscript {
         mavenLocal()
         mavenCentral()
         jcenter()
+        maven {
+            url 'https://plugins.gradle.org/m2/'
+        }
     }
     dependencies {
         classpath group: 'net.researchgate', name: 'gradle-release', version: '2.6.0'
         classpath 'org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.4.0'
         classpath 'com.jfrog.bintray.gradle:gradle-bintray-plugin:1.7.3'
         classpath 'com.github.jengelman.gradle.plugins:shadow:5.0.0'
+        classpath 'com.adarshr:gradle-test-logger-plugin:2.0.0'
     }
 }
 
@@ -25,6 +29,7 @@ allprojects {
     apply plugin: 'jacoco'
     apply plugin: 'base'
     apply plugin: 'com.github.kt3k.coveralls'
+    apply plugin: 'com.adarshr.test-logger'
 
     group = "org.grobid"
 

diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md
diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
@@ -29,6 +29,7 @@
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -1727,132 +1728,139 @@ else if (string.startsWith("PACS Numbers") ||
 	}	
 
     /**
-     * Export to BibTeX format
+     * Export to BibTeX format. Use "id" as BibTeX key.
      */
     public String toBibTeX() {
 		return toBibTeX("id");
 	}
 
     /**
      * Export to BibTeX format
+     *
+     * @param id the BibTeX key to use.
      */
     public String toBibTeX(String id) {
-        String bibtex = "";
-        try {
+        return toBibTeX(id, new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder().includeRawCitations(false).build());
+    }
 
-            if (journal != null) {
-                bibtex += "@article{" + id + ",\n";
-            } else if (book_type != null) {
-                bibtex += "@techreport{" + id + ",\n";
-            } else if (bookTitle != null) {
-                if ((bookTitle.startsWith("proc")) || (bookTitle.startsWith("Proc")) ||
-                        (bookTitle.startsWith("In Proc")) || (bookTitle.startsWith("In proc"))) {
-                    bibtex += "@inproceedings{" + id + ",\n";
-                } else {
-                    bibtex += "@article{" + id + ",\n"; // ???
-                }
+    /**
+     * Export to BibTeX format
+     *
+     * @param id                  the BibTeX key to use
+     */
+    public String toBibTeX(String id, GrobidAnalysisConfig config) {
+        String type;
+        if (journal != null) {
+            type = "article";
+        } else if (book_type != null) {
+            type = "techreport";
+        } else if (bookTitle != null) {
+            if (StringUtils.containsIgnoreCase(bookTitle, "proceedings") ||
+                (bookTitle.startsWith("proc")) || (bookTitle.startsWith("Proc")) ||
+                (bookTitle.startsWith("In Proc")) || (bookTitle.startsWith("In proc"))) {
+                type = "inproceedings";
             } else {
-                bibtex += "@misc{" + id + ",\n"; // ???
+                LOGGER.debug("No journal given, but a booktitle. However, the booktitle does not start with \"proc\" or similar strings. Returning inbook");
+                type = "inbook";
             }
+        } else {
+            // using "misc" as fallback type
+            type = "misc";
+        }
+
+        StringJoiner bibtex = new StringJoiner(",\n", "@" + type + "{" + id + ",\n", "\n}\n");
+
+        try {
 
             // author 
             // fullAuthors has to be used instead
             if (collaboration != null) {
-                bibtex += "author\t=\t\"" + collaboration;
-            } else if (fullAuthors != null) {
-                if (fullAuthors.size() > 0) {
-                    boolean begin = true;
-                    for (Person person : fullAuthors) {
-                        if (begin) {
-                            bibtex += "author\t=\t\"" + person.getFirstName() + " " + person.getLastName();
-                            begin = false;
-                        } else
-                            bibtex += " and " + person.getFirstName() + " " + person.getLastName();
-                    }
-                    bibtex += "\"";
-                }
-            } else if (authors != null) {
-                StringTokenizer st = new StringTokenizer(authors, ";");
-                if (st.countTokens() > 1) {
-                    boolean begin = true;
+                bibtex.add("  author = {" + collaboration + "}");
+            } else {
+                StringJoiner authors = new StringJoiner(" and ", "  author = {", "}");
+                if (fullAuthors != null) {
+                    fullAuthors.stream()
+                               .filter(person -> person != null)
+                               .forEachOrdered(person -> {
+                                   String author = person.getLastName();
+                                   if (person.getFirstName() != null) {
+                                       author += ", ";
+                                       author += person.getFirstName();
+                                   }
+                                   authors.add(author);
+                               });
+                } else if (this.authors != null) {
+                    StringTokenizer st = new StringTokenizer(this.authors, ";");
                     while (st.hasMoreTokens()) {
                         String author = st.nextToken();
-                        if (author != null)
-                            author = author.trim();
-                        if (begin) {
-                            bibtex += "author\t=\t\"" + author;
-                            begin = false;
-                        } else
-                            bibtex += " and " + author;
-
+                        if (author != null) {
+                            authors.add(author.trim());
+                        }
                     }
-                    bibtex += "\"";
-                } else {
-                    if (authors != null)
-                        bibtex += "author\t=\t\"" + authors + "\"";
                 }
+                bibtex.add(authors.toString());
             }
 
             // title
             if (title != null) {
-                bibtex += ",\ntitle\t=\t\"" + title + "\"";
+                bibtex.add("  title = {" + title + "}");
             }
 
             // journal
             if (journal != null) {
-                bibtex += ",\njournal\t=\t\"" + journal + "\"";
+                bibtex.add("  journal = {" + journal + "}");
             }
 
             // booktitle
             if ((journal == null) && (book_type == null) && (bookTitle != null)) {
-                bibtex += ",\nbooktitle\t=\t\"" + bookTitle + "\"";
+                bibtex.add("  booktitle = {" + bookTitle + "}");
             }
 
             // publisher
             if (publisher != null) {
-                bibtex += ",\npublisher\t=\t\"" + publisher + "\"";
+                bibtex.add("  publisher = {" + publisher + "}");
             }
 
             // editors
             if (editors != null) {
                 String locEditors = editors.replace(" ; ", " and ");
-                bibtex += ",\neditor\t=\t\"" + locEditors + "\"";
+                bibtex.add("  editor = {" + locEditors + "}");
             }
             // fullEditors has to be used instead
 
             // year
             if (publication_date != null) {
-                bibtex += ",\nyear\t=\t\"" + publication_date + "\"";
+                bibtex.add("  year = {" + publication_date + "}");
             }
 
-            // location
+            // address
             if (location != null) {
-                bibtex += ",\naddress\t=\t\"" + location + "\"";
+                bibtex.add("  address = {" + location + "}");
             }
 
             // pages
             if (pageRange != null) {
-                bibtex += ",\npages\t=\t\"" + pageRange + "\"";
+                bibtex.add("  pages = {" + pageRange + "}");
             }
 
 			// volume
 			if (volumeBlock != null) {
-				bibtex += ",\nvolume\t=\t\"" + volumeBlock + "\"";
+                bibtex.add("  volume = {" + volumeBlock + "}");
 			}
 
 			// issue (named number in BibTeX)
 			if (issue != null) {
-				bibtex += ",\nnumber\t=\t\"" + issue + "\"";
+                bibtex.add("  number = {" + issue + "}");
 			}
 
             // DOI
             if (!StringUtils.isEmpty(doi)) {
-                bibtex += ",\ndoi\t=\t\"" + doi + "\"";
+                bibtex.add("  doi = {" + doi + "}");
             }
 
             // arXiv identifier
             if (!StringUtils.isEmpty(arXivId)) {
-                bibtex += ",\neprint\t=\t\"" + arXivId + "\"";
+                bibtex.add("  eprint = {" + arXivId + "}");
             }
             /* note that the following is now recommended for arXiv citations: 
                     archivePrefix = "arXiv",
@@ -1864,30 +1872,27 @@ public String toBibTeX(String id) {
 
             // abstract
             if (!StringUtils.isEmpty(abstract_)) {
-                bibtex += ",\nabstract\t=\t\"" + abstract_ + "\"";
+                bibtex.add("  abstract = {" + abstract_ + "}");
             }
 
             // keywords
             if (keywords != null) {
-                bibtex += ",\nkeywords\t=\t\"";
-                boolean begin = true;
-                for (Keyword keyw : keywords) {
-					if ( (keyw.getKeyword() == null) || (keyw.getKeyword().length() == 0) )
-						continue;
-                    if (begin) {
-                        begin = false;
-                        bibtex += keyw.getKeyword();
-                    } else
-                        bibtex += ", " + keyw.getKeyword();
-                }
-                bibtex += "\"";
+                String value = keywords.stream()
+                        .map(keyword -> keyword.getKeyword())
+                        .filter(keyword -> !StringUtils.isBlank(keyword))
+                        .collect(Collectors.joining(", ", "keywords = {", "}"));
+                bibtex.add(value);
             }
 
-            bibtex += "\n}\n";
+            if (config.getIncludeRawCitations() && !StringUtils.isEmpty(reference) ) {
+                // NOTE(review): the raw reference is emitted verbatim inside braces; no escaping is performed here
+                bibtex.add("  raw = {" + reference + "}");
+            }
         } catch (Exception e) {
+            LOGGER.error("Cannot export BibTex format, because of nested exception.", e);
             throw new GrobidException("Cannot export BibTex format, because of nested exception.", e);
         }
-        return bibtex;
+        return bibtex.toString();
     }
 
     /** 
@@ -1940,16 +1945,29 @@ public void checkIdentifier() {
      *
      * @param n - the index of the bibliographical record, the corresponding id will be b+n
      */
-
     public String toTEI(int n) {
         return toTEI(n, 0, GrobidAnalysisConfig.defaultInstance());
     }
 
+    /**
+     * Export the bibliographical item into a TEI BiblStruct string
+     *
+     * @param n - the index of the bibliographical record, the corresponding id will be b+n
+     */
+    public String toTEI(int n, GrobidAnalysisConfig config) {
+        return toTEI(n, 0, config);
+    }
+
+    /**
+     * Export the bibliographical item into a TEI BiblStruct string
+     *
+     * @param n - the index of the bibliographical record, the corresponding id will be b+n
+     * @param indent - the tabulation indentation for the output of the xml elements
+     */
     public String toTEI(int n, int indent) {
         return toTEI(n, indent, GrobidAnalysisConfig.defaultInstance());
     }
 
-
     /**
      * Export the bibliographical item into a TEI BiblStruct string
      *

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java
@@ -30,6 +30,8 @@
 import org.grobid.core.tokenization.TaggingTokenClusteror;
 import org.grobid.core.engines.label.TaggingLabel;
 import org.grobid.core.engines.label.TaggingLabels;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.IOException;
@@ -45,6 +47,8 @@
  * @author Patrice Lopez
  */
 public class CitationParser extends AbstractParser {
+    private static final Logger LOGGER = LoggerFactory.getLogger(CitationParser.class); // log under this class's own category
+
     public Lexicon lexicon = Lexicon.getInstance();
     private EngineParsers parsers;
 
@@ -70,7 +74,10 @@ public BiblioItem processing(String input, int consolidate) {
         //input = input.replaceAll("\\p{Cntrl}", " ").trim();
 
         List<LayoutToken> tokens = analyzer.tokenizeWithLayoutToken(input);
-        return processing(tokens, consolidate);
+        BiblioItem biblioItem = processing(tokens, consolidate);
+        // store original references to enable raw output
+        biblioItem.setReference(input);
+        return biblioItem;
     }
 
     public BiblioItem processing(List<LayoutToken> tokens, int consolidate) {
@@ -146,6 +153,7 @@ public BiblioItem processing(List<LayoutToken> tokens, int consolidate) {
 
             return resCitation;
         } catch (Exception e) {
+            LOGGER.error("An exception occured while running Grobid.", e);
             throw new GrobidException(
                     "An exception occured while running Grobid.", e);
         }
@@ -171,7 +179,7 @@ public List<BibDataSet> processingReferenceSection(String referenceTextBlock, Re
     }
 
     public List<BibDataSet> processingReferenceSection(Document doc, ReferenceSegmenter referenceSegmenter, int consolidate) {
-        List<BibDataSet> results = new ArrayList<BibDataSet>();
+        List<BibDataSet> results = new ArrayList<>();
 
         String referencesStr = doc.getDocumentPartText(SegmentationLabels.REFERENCES);
 
@@ -292,8 +300,10 @@ public List<BibDataSet> processingReferenceSection(DocumentSource documentSource
                     GrobidAnalysisConfig.builder().consolidateCitations(consolidate).build());
             results = processingReferenceSection(doc, referenceSegmenter, consolidate);
         } catch (GrobidException e) {
+            LOGGER.error("An exception occured while running Grobid.", e);
             throw e;
         } catch (Exception e) {
+            LOGGER.error("An exception occured while running Grobid.", e);
             throw new GrobidException("An exception occurred while running Grobid.", e);
         }
 
@@ -448,9 +458,9 @@ else if (consolidate == 2)
                     BiblioItem.injectDOI(resCitation, bibo);
             }
         } catch (Exception e) {
-            // e.printStackTrace();
+            LOGGER.error("An exception occurred while running bibliographical data consolidation.", e);
             throw new GrobidException(
-                    "An exception occured while running bibliographical data consolidation.", e);
+                    "An exception occurred while running bibliographical data consolidation.", e);
         } 
         return resCitation;
     }