Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix IEEE test #7852

Merged
merged 11 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions docs/adr/0022-remove-stop-words-during-query-transformation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Remove stop words during query transformation

## Context and Problem Statement

When querying for the title of a paper, the title might contain stop words such as "a", "for", "and". Some data providers return 0 results when querying for a stop word. When transforming a query to the Lucene syntax, the default Boolean operator `and` is used. When using IEEE, this often leads to zero search results.

## Decision Drivers

* Consistent with the Google search engine
* Allow reproducible searches
* Avoid WTFs on the user's side

## Considered Options

* Remove stop words from the query
* Automatically enclose in quotes if no Boolean operator is contained

## Decision Outcome

Chosen option: "Remove stop words from the query", because it comes out best (see "Pros and Cons of the Options" below).

## Pros and Cons of the Options

### Remove stop words from the query

* Good, because it yields good search results if no Boolean operators are used
* Bad, because stop words that are used alone in complex queries are silently removed

### Automatically enclose in quotes if no Boolean operator is contained

* Good, because it yields good search results if no Boolean operators are used
* Bad, because it silently leads to different results
* Bad, because it is inconsistent with Google's behavior
28 changes: 24 additions & 4 deletions src/main/java/org/jabref/logic/importer/fetcher/IEEE.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,16 @@ public class IEEE implements FulltextFetcher, PagedSearchBasedParserFetcher {

private final ImportFormatPreferences preferences;

private IEEEQueryTransformer transformer;

public IEEE(ImportFormatPreferences preferences) {
this.preferences = Objects.requireNonNull(preferences);
}

/**
* @implNote <a href="https://developer.ieee.org/docs/read/Metadata_API_responses">documentation</a>
*/
private static BibEntry parseJsonRespone(JSONObject jsonEntry, Character keywordSeparator) {
private static BibEntry parseJsonResponse(JSONObject jsonEntry, Character keywordSeparator) {
BibEntry entry = new BibEntry();

switch (jsonEntry.optString("content_type")) {
Expand Down Expand Up @@ -205,8 +207,24 @@ public Parser getParser() {
JSONArray results = jsonObject.getJSONArray("articles");
for (int i = 0; i < results.length(); i++) {
JSONObject jsonEntry = results.getJSONObject(i);
BibEntry entry = parseJsonRespone(jsonEntry, preferences.getKeywordSeparator());
entries.add(entry);
BibEntry entry = parseJsonResponse(jsonEntry, preferences.getKeywordSeparator());
boolean addEntry;
// In case entry has no year, add it
// In case an entry has a year, check if it is in the year range
// The implementation uses some Java 8 Optional magic to implement that
if (entry.hasField(StandardField.YEAR)) {
koppor marked this conversation as resolved.
Show resolved Hide resolved
addEntry = entry.getField(StandardField.YEAR).filter(year -> {
Integer yearAsInteger = Integer.valueOf(year);
return
transformer.getStartYear().map(startYear -> yearAsInteger >= startYear).orElse(true) &&
transformer.getEndYear().map(endYear -> yearAsInteger <= endYear).orElse(true);
}).map(x -> true).orElse(false);
} else {
addEntry = true;
}
if (addEntry) {
entries.add(entry);
}
}
}

Expand All @@ -226,7 +244,9 @@ public Optional<HelpFile> getHelpPage() {

@Override
public URL getURLForQuery(QueryNode luceneQuery, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException {
IEEEQueryTransformer transformer = new IEEEQueryTransformer();
// The transformer is stored in a field, because we need to filter the bib entries by year manually;
// the transformer stores the min and max year of the query
transformer = new IEEEQueryTransformer();
String transformedQuery = transformer.transformLuceneQuery(luceneQuery).orElse("");
URIBuilder uriBuilder = new URIBuilder("https://ieeexploreapi.ieee.org/api/v1/search/articles");
uriBuilder.addParameter("apikey", API_KEY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import java.util.StringJoiner;
import java.util.stream.Collectors;

import org.jabref.model.strings.StringUtil;

import org.apache.lucene.queryparser.flexible.core.nodes.BooleanQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode;
Expand Down Expand Up @@ -96,7 +98,7 @@ private Optional<String> transform(FieldQueryNode query) {
return s.isEmpty() ? Optional.empty() : Optional.of(s);
}
case NO_EXPLICIT_FIELD -> {
return Optional.of(handleUnFieldedTerm(term));
return handleUnFieldedTerm(term);
}
default -> {
// Just add unknown fields as default
Expand Down Expand Up @@ -184,29 +186,16 @@ protected String handleYearRange(String yearRange) {
*
* Default implementation: just return the term (in quotes if a space is contained)
*/
protected String handleUnFieldedTerm(String term) {
return quoteStringIfSpaceIsContained(term);
}

/**
* Encloses the given string with " if there is a space contained
*
* @return Returns a string
*/
protected String quoteStringIfSpaceIsContained(String string) {
if (string.contains(" ")) {
return "\"" + string + "\"";
} else {
return string;
}
protected Optional<String> handleUnFieldedTerm(String term) {
return Optional.of(StringUtil.quoteStringIfSpaceIsContained(term));
}

protected String createKeyValuePair(String fieldAsString, String term) {
return createKeyValuePair(fieldAsString, term, ":");
}

protected String createKeyValuePair(String fieldAsString, String term, String separator) {
return String.format("%s%s%s", fieldAsString, separator, quoteStringIfSpaceIsContained(term));
return String.format("%s%s%s", fieldAsString, separator, StringUtil.quoteStringIfSpaceIsContained(term));
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import java.util.Optional;

public class ArXivQueryTransformer extends YearRangeByFilteringQueryTransformer {
@Override
protected String getLogicalAndOperator() {
Expand Down Expand Up @@ -42,8 +44,8 @@ protected String handleYear(String year) {
}

@Override
protected String handleUnFieldedTerm(String term) {
return createKeyValuePair("all", term);
protected Optional<String> handleUnFieldedTerm(String term) {
return Optional.of(createKeyValuePair("all", term));
}

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import org.jabref.model.strings.StringUtil;

public class CollectionOfComputerScienceBibliographiesQueryTransformer extends AbstractQueryTransformer {

@Override
Expand Down Expand Up @@ -29,7 +31,7 @@ protected String handleTitle(String title) {

@Override
protected String handleJournal(String journalTitle) {
return quoteStringIfSpaceIsContained(journalTitle);
return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import org.jabref.model.strings.StringUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -28,18 +30,18 @@ protected String getLogicalNotOperator() {
@Override
protected String handleAuthor(String author) {
// DBLP does not support explicit author field search
return quoteStringIfSpaceIsContained(author);
return StringUtil.quoteStringIfSpaceIsContained(author);
}

@Override
protected String handleTitle(String title) {
// DBLP does not support explicit title field search
return quoteStringIfSpaceIsContained(title);
return StringUtil.quoteStringIfSpaceIsContained(title);
}

@Override
protected String handleJournal(String journalTitle) {
// DBLP does not support explicit journal field search
return quoteStringIfSpaceIsContained(journalTitle);
return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import org.jabref.model.strings.StringUtil;

/**
* Default query transformer without any boolean operators
*/
Expand All @@ -22,16 +24,16 @@ protected String getLogicalNotOperator() {

@Override
protected String handleAuthor(String author) {
return quoteStringIfSpaceIsContained(author);
return StringUtil.quoteStringIfSpaceIsContained(author);
}

@Override
protected String handleTitle(String title) {
return quoteStringIfSpaceIsContained(title);
return StringUtil.quoteStringIfSpaceIsContained(title);
}

@Override
protected String handleJournal(String journalTitle) {
return quoteStringIfSpaceIsContained(journalTitle);
return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ protected String handleYear(String year) {
}

@Override
protected String handleUnFieldedTerm(String term) {
protected Optional<String> handleUnFieldedTerm(String term) {
// all does not search in full-text
// Other option is txt: but this does not search in meta data
return createKeyValuePair("pica.all", term, "=");
return Optional.of(createKeyValuePair("pica.all", term, "="));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
package org.jabref.logic.importer.fetcher.transformers;

import java.util.List;
import java.util.Objects;
import java.util.Optional;

import org.jabref.model.strings.StringUtil;

/**
* Needs to be instantiated for each new query
*/
public class IEEEQueryTransformer extends YearRangeByFilteringQueryTransformer {
/**
* Words ignored by the search engine. They need to be removed from the query when querying for them.
* See ADR-0022
*/
private static final List<String> STOP_WORDS = List.of("a", "and", "for", "or", "with");

// These have to be integrated into the IEEE query URL as these are just supported as query parameters
// Journal is wrapped in quotes by the transformer
private String journal;
private String articleNumber;
private int startYear = Integer.MAX_VALUE;
private int endYear = Integer.MIN_VALUE;

@Override
protected String getLogicalAndOperator() {
Expand Down Expand Up @@ -40,8 +47,9 @@ protected String handleTitle(String title) {
}

@Override
protected String handleJournal(String journalTitle) {
return handleUnFieldedTerm(journalTitle);
protected String handleJournal(String journal) {
this.journal = journal;
return StringUtil.quoteStringIfSpaceIsContained(journal);
}

@Override
Expand All @@ -59,6 +67,14 @@ protected Optional<String> handleOtherField(String fieldAsString, String term) {
};
}

/**
 * Handles a term that has no explicit field.
 *
 * A term contained in {@code STOP_WORDS} yields an empty result; every other term is
 * delegated to the parent implementation.
 *
 * @param term the un-fielded term of the query
 * @return empty if the term is a stop word, otherwise the result of the parent implementation
 */
@Override
protected Optional<String> handleUnFieldedTerm(String term) {
    boolean isStopWord = STOP_WORDS.contains(term);
    return isStopWord ? Optional.empty() : super.handleUnFieldedTerm(term);
}

private Optional<String> handleArticleNumber(String term) {
articleNumber = term;
return Optional.empty();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import java.util.Optional;

public class ZbMathQueryTransformer extends AbstractQueryTransformer {

@Override
Expand Down Expand Up @@ -43,7 +45,7 @@ protected String handleYearRange(String yearRange) {
}

@Override
protected String handleUnFieldedTerm(String term) {
return createKeyValuePair("any", term);
protected Optional<String> handleUnFieldedTerm(String term) {
return Optional.of(createKeyValuePair("any", term));
}
}
13 changes: 13 additions & 0 deletions src/main/java/org/jabref/model/strings/StringUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -738,4 +738,17 @@ public static String substringBetween(String str, String open, String close) {
public static String ignoreCurlyBracket(String title) {
    // Titles for which isNotBlank is false are handed back untouched.
    if (!isNotBlank(title)) {
        return title;
    }
    // Strip every opening brace first, then every closing brace.
    String withoutOpeningBraces = title.replace("{", "");
    return withoutOpeningBraces.replace("}", "");
}

/**
 * Wraps the given string in double quotes if it contains at least one space character.
 * A string without any space is returned unchanged.
 *
 * @param string the text to quote if necessary
 * @return the given string enclosed in {@code "} if it contains a space, otherwise the given string itself
 */
public static String quoteStringIfSpaceIsContained(String string) {
    boolean hasNoSpace = string.indexOf(' ') < 0;
    return hasNoSpace ? string : "\"" + string + "\"";
}
}
Loading