Skip to content

Commit

Permalink
Merge pull request #532 from koppor/add-bibtex-accept
Browse files Browse the repository at this point in the history
Add support for application/x-bibtex type
  • Loading branch information
kermitt2 committed Mar 9, 2020
2 parents 5a325e3 + 0b86f3f commit adeca65
Show file tree
Hide file tree
Showing 20 changed files with 766 additions and 469 deletions.
6 changes: 6 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ indent_size=2
indent_style=space
indent_size=2

[*.md]
insert_final_newline=true

[{*.yml,*.yaml}]
indent_style=space
indent_size=2

[GrobidRestProcessString.java]
indent_style=tab
5 changes: 5 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@ buildscript {
mavenLocal()
mavenCentral()
jcenter()
maven {
url 'https://plugins.gradle.org/m2/'
}
}
dependencies {
classpath group: 'net.researchgate', name: 'gradle-release', version: '2.6.0'
classpath 'org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.4.0'
classpath 'com.jfrog.bintray.gradle:gradle-bintray-plugin:1.7.3'
classpath 'com.github.jengelman.gradle.plugins:shadow:5.0.0'
classpath 'com.adarshr:gradle-test-logger-plugin:2.0.0'
}
}

Expand All @@ -25,6 +29,7 @@ allprojects {
apply plugin: 'jacoco'
apply plugin: 'base'
apply plugin: 'com.github.kt3k.coveralls'
apply plugin: 'com.adarshr.test-logger'

group = "org.grobid"

Expand Down
377 changes: 201 additions & 176 deletions doc/Grobid-service.md

Large diffs are not rendered by default.

166 changes: 92 additions & 74 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -1727,132 +1728,139 @@ else if (string.startsWith("PACS Numbers") ||
}

/**
* Export to BibTeX format
* Export to BibTeX format. Use "id" as BibTeX key.
*/
public String toBibTeX() {
    // Delegate to the keyed overload with the fixed default BibTeX key.
    final String defaultKey = "id";
    return toBibTeX(defaultKey);
}

/**
* Export to BibTeX format
*
* @param id the BibTeX key to use.
*/
public String toBibTeX(String id) {
String bibtex = "";
try {
return toBibTeX(id, new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder().includeRawCitations(false).build());
}

if (journal != null) {
bibtex += "@article{" + id + ",\n";
} else if (book_type != null) {
bibtex += "@techreport{" + id + ",\n";
} else if (bookTitle != null) {
if ((bookTitle.startsWith("proc")) || (bookTitle.startsWith("Proc")) ||
(bookTitle.startsWith("In Proc")) || (bookTitle.startsWith("In proc"))) {
bibtex += "@inproceedings{" + id + ",\n";
} else {
bibtex += "@article{" + id + ",\n"; // ???
}
/**
* Export to BibTeX format
*
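* @param id the BibTeX key to use
* @param config the analysis configuration; when it requests raw citations and a reference string is available, that string is added as a "raw" field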
*/
public String toBibTeX(String id, GrobidAnalysisConfig config) {
String type;
if (journal != null) {
type = "article";
} else if (book_type != null) {
type = "techreport";
} else if (bookTitle != null) {
if (StringUtils.containsIgnoreCase(bookTitle, "proceedings") ||
(bookTitle.startsWith("proc")) || (bookTitle.startsWith("Proc")) ||
(bookTitle.startsWith("In Proc")) || (bookTitle.startsWith("In proc"))) {
type = "inproceedings";
} else {
bibtex += "@misc{" + id + ",\n"; // ???
LOGGER.debug("No journal given, but a booktitle. However, the booktitle does not start with \"proc\" or similar strings. Returning inbook");
type = "inbook";
}
} else {
// using "misc" as fallback type
type = "misc";
}

StringJoiner bibtex = new StringJoiner(",\n", "@" + type + "{" + id + ",\n", "\n}\n");

try {

// author
// fullAuthors has to be used instead
if (collaboration != null) {
bibtex += "author\t=\t\"" + collaboration;
} else if (fullAuthors != null) {
if (fullAuthors.size() > 0) {
boolean begin = true;
for (Person person : fullAuthors) {
if (begin) {
bibtex += "author\t=\t\"" + person.getFirstName() + " " + person.getLastName();
begin = false;
} else
bibtex += " and " + person.getFirstName() + " " + person.getLastName();
}
bibtex += "\"";
}
} else if (authors != null) {
StringTokenizer st = new StringTokenizer(authors, ";");
if (st.countTokens() > 1) {
boolean begin = true;
bibtex.add(" author = {" + collaboration + "}");
} else {
StringJoiner authors = new StringJoiner(" and ", " author = {", "}");
if (fullAuthors != null) {
fullAuthors.stream()
.filter(person -> person != null)
.forEachOrdered(person -> {
String author = person.getLastName();
if (person.getFirstName() != null) {
author += ", ";
author += person.getFirstName();
}
authors.add(author);
});
} else if (this.authors != null) {
StringTokenizer st = new StringTokenizer(this.authors, ";");
while (st.hasMoreTokens()) {
String author = st.nextToken();
if (author != null)
author = author.trim();
if (begin) {
bibtex += "author\t=\t\"" + author;
begin = false;
} else
bibtex += " and " + author;

if (author != null) {
authors.add(author.trim());
}
}
bibtex += "\"";
} else {
if (authors != null)
bibtex += "author\t=\t\"" + authors + "\"";
}
bibtex.add(authors.toString());
}

// title
if (title != null) {
bibtex += ",\ntitle\t=\t\"" + title + "\"";
bibtex.add(" title = {" + title + "}");
}

// journal
if (journal != null) {
bibtex += ",\njournal\t=\t\"" + journal + "\"";
bibtex.add(" journal = {" + journal + "}");
}

// booktitle
if ((journal == null) && (book_type == null) && (bookTitle != null)) {
bibtex += ",\nbooktitle\t=\t\"" + bookTitle + "\"";
bibtex.add(" booktitle = {" + bookTitle + "}");
}

// publisher
if (publisher != null) {
bibtex += ",\npublisher\t=\t\"" + publisher + "\"";
bibtex.add(" publisher = {" + publisher + "}");
}

// editors
if (editors != null) {
String locEditors = editors.replace(" ; ", " and ");
bibtex += ",\neditor\t=\t\"" + locEditors + "\"";
bibtex.add(" editor = {" + locEditors + "}");
}
// fullEditors has to be used instead

// year
if (publication_date != null) {
bibtex += ",\nyear\t=\t\"" + publication_date + "\"";
bibtex.add(" year = {" + publication_date + "}");
}

// location
// address
if (location != null) {
bibtex += ",\naddress\t=\t\"" + location + "\"";
bibtex.add(" address = {" + location + "}");
}

// pages
if (pageRange != null) {
bibtex += ",\npages\t=\t\"" + pageRange + "\"";
bibtex.add(" pages = {" + pageRange + "}");
}

// volume
if (volumeBlock != null) {
bibtex += ",\nvolume\t=\t\"" + volumeBlock + "\"";
bibtex.add(" volume = {" + volumeBlock + "}");
}

// issue (named number in BibTeX)
if (issue != null) {
bibtex += ",\nnumber\t=\t\"" + issue + "\"";
bibtex.add(" number = {" + issue + "}");
}

// DOI
if (!StringUtils.isEmpty(doi)) {
bibtex += ",\ndoi\t=\t\"" + doi + "\"";
bibtex.add(" doi = {" + doi + "}");
}

// arXiv identifier
if (!StringUtils.isEmpty(arXivId)) {
bibtex += ",\neprint\t=\t\"" + arXivId + "\"";
bibtex.add(" eprint = {" + arXivId + "}");
}
/* note that the following is now recommended for arXiv citations:
archivePrefix = "arXiv",
Expand All @@ -1864,30 +1872,27 @@ public String toBibTeX(String id) {

// abstract
if (!StringUtils.isEmpty(abstract_)) {
bibtex += ",\nabstract\t=\t\"" + abstract_ + "\"";
bibtex.add(" abstract = {" + abstract_ + "}");
}

// keywords
if (keywords != null) {
bibtex += ",\nkeywords\t=\t\"";
boolean begin = true;
for (Keyword keyw : keywords) {
if ( (keyw.getKeyword() == null) || (keyw.getKeyword().length() == 0) )
continue;
if (begin) {
begin = false;
bibtex += keyw.getKeyword();
} else
bibtex += ", " + keyw.getKeyword();
}
bibtex += "\"";
String value = keywords.stream()
.map(keyword -> keyword.getKeyword())
.filter(keyword -> !StringUtils.isBlank(keyword))
.collect(Collectors.joining(", ", "keywords = {", "}"));
bibtex.add(value);
}

bibtex += "\n}\n";
if (config.getIncludeRawCitations() && !StringUtils.isEmpty(reference) ) {
// NOTE: the raw reference is emitted as-is; quotes and braces in it are not escaped here
bibtex.add(" raw = {" + reference + "}");
}
} catch (Exception e) {
LOGGER.error("Cannot export BibTex format, because of nested exception.", e);
throw new GrobidException("Cannot export BibTex format, because of nested exception.", e);
}
return bibtex;
return bibtex.toString();
}

/**
Expand Down Expand Up @@ -1940,16 +1945,29 @@ public void checkIdentifier() {
*
* @param n - the index of the bibliographical record, the corresponding id will be b+n
*/

public String toTEI(int n) {
    // Zero indentation, default analysis configuration.
    final GrobidAnalysisConfig defaultConfig = GrobidAnalysisConfig.defaultInstance();
    return toTEI(n, 0, defaultConfig);
}

/**
 * Serialises this bibliographical item as a TEI BiblStruct string, using an
 * indentation of 0 and the supplied analysis configuration.
 *
 * @param n      the index of the bibliographical record; the corresponding id will be b+n
 * @param config the analysis configuration forwarded unchanged to the three-argument overload
 * @return the string returned by {@code toTEI(n, 0, config)}
 */
public String toTEI(int n, GrobidAnalysisConfig config) {
    final int noIndent = 0;
    return toTEI(n, noIndent, config);
}

/**
 * Serialises this bibliographical item as a TEI BiblStruct string, using the
 * given indentation and the default analysis configuration.
 *
 * @param n      the index of the bibliographical record; the corresponding id will be b+n
 * @param indent the tabulation indentation for the output of the xml elements
 * @return the string returned by the three-argument overload for the default configuration
 */
public String toTEI(int n, int indent) {
    final GrobidAnalysisConfig defaultConfig = GrobidAnalysisConfig.defaultInstance();
    return toTEI(n, indent, defaultConfig);
}


/**
* Export the bibliographical item into a TEI BiblStruct string
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
Expand All @@ -45,6 +47,8 @@
* @author Patrice Lopez
*/
public class CitationParser extends AbstractParser {
// Logger bound to the declaring class so that log output is attributed to CitationParser
// rather than to its superclass.
private static final Logger LOGGER = LoggerFactory.getLogger(CitationParser.class);

public Lexicon lexicon = Lexicon.getInstance();
private EngineParsers parsers;

Expand All @@ -70,7 +74,10 @@ public BiblioItem processing(String input, int consolidate) {
//input = input.replaceAll("\\p{Cntrl}", " ").trim();

List<LayoutToken> tokens = analyzer.tokenizeWithLayoutToken(input);
return processing(tokens, consolidate);
BiblioItem biblioItem = processing(tokens, consolidate);
// keep the original reference string so that it can be emitted as raw output
biblioItem.setReference(input);
return biblioItem;
}

public BiblioItem processing(List<LayoutToken> tokens, int consolidate) {
Expand Down Expand Up @@ -146,6 +153,7 @@ public BiblioItem processing(List<LayoutToken> tokens, int consolidate) {

return resCitation;
} catch (Exception e) {
LOGGER.error("An exception occured while running Grobid.", e);
throw new GrobidException(
"An exception occured while running Grobid.", e);
}
Expand All @@ -171,7 +179,7 @@ public List<BibDataSet> processingReferenceSection(String referenceTextBlock, Re
}

public List<BibDataSet> processingReferenceSection(Document doc, ReferenceSegmenter referenceSegmenter, int consolidate) {
List<BibDataSet> results = new ArrayList<BibDataSet>();
List<BibDataSet> results = new ArrayList<>();

String referencesStr = doc.getDocumentPartText(SegmentationLabels.REFERENCES);

Expand Down Expand Up @@ -292,8 +300,10 @@ public List<BibDataSet> processingReferenceSection(DocumentSource documentSource
GrobidAnalysisConfig.builder().consolidateCitations(consolidate).build());
results = processingReferenceSection(doc, referenceSegmenter, consolidate);
} catch (GrobidException e) {
LOGGER.error("An exception occured while running Grobid.", e);
throw e;
} catch (Exception e) {
LOGGER.error("An exception occured while running Grobid.", e);
throw new GrobidException("An exception occurred while running Grobid.", e);
}

Expand Down Expand Up @@ -448,9 +458,9 @@ else if (consolidate == 2)
BiblioItem.injectDOI(resCitation, bibo);
}
} catch (Exception e) {
// e.printStackTrace();
LOGGER.error("An exception occurred while running bibliographical data consolidation.", e);
throw new GrobidException(
"An exception occured while running bibliographical data consolidation.", e);
"An exception occurred while running bibliographical data consolidation.", e);
}
return resCitation;
}
Expand Down
Loading

0 comments on commit adeca65

Please sign in to comment.