Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue "Add quality check and cleanup for problematic unicode characters" #10817

Closed
wants to merge 14 commits into from
Closed
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
- We added the possibility to show the BibTeX source in the [web search](https://docs.jabref.org/collect/import-using-online-bibliographic-database) import screen. [#560](https://github.com/koppor/jabref/issues/560)
- We added a fetcher for [ISIDORE](https://isidore.science/), simply paste in the link into the text field or the last 6 digits in the link that identify that paper. [#10423](https://github.com/JabRef/jabref/issues/10423)
- When importing entries from the "Citation relations" tab, the field [cites](https://docs.jabref.org/advanced/entryeditor/entrylinks) is now filled according to the relationship between the entries. [#10572](https://github.com/JabRef/jabref/pull/10752)
- We added a new integrity check and cleanup option for field values that are not in Unicode Normalization Form C (NFC). [#10506](https://github.com/JabRef/jabref/issues/10506)
- We added a new group icon column to the main table showing the icons of the entry's groups. [#10801](https://github.com/JabRef/jabref/pull/10801)

### Changed
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/logic/formatter/Formatters.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizePagesFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter;
import org.jabref.logic.formatter.bibtexfields.OrdinalsToSuperscriptFormatter;
import org.jabref.logic.formatter.bibtexfields.RegexFormatter;
import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter;
Expand Down Expand Up @@ -87,6 +88,7 @@ public static List<Formatter> getOthers() {
new EscapeAmpersandsFormatter(),
new EscapeDollarSignFormatter(),
new ShortenDOIFormatter(),
new NormalizeUnicodeFormatter(),
new ReplaceUnicodeLigaturesFormatter(),
new UnprotectTermsFormatter()
);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.jabref.logic.formatter.bibtexfields;

import java.text.Normalizer;
import java.util.Objects;

import org.jabref.logic.cleanup.Formatter;

/**
 * Formatter that converts a field value to Unicode Normalization Form C (NFC):
 * characters written as a base letter followed by combining marks are replaced
 * by their precomposed equivalents where such equivalents exist.
 */
public class NormalizeUnicodeFormatter extends Formatter {

    @Override
    public String getName() {
        return "Normalize Unicode";
    }

    @Override
    public String getKey() {
        return "NORMALIZE_UNICODE";
    }

    @Override
    public String getDescription() {
        return "Normalize Unicode characters in BibTeX fields.";
    }

    /**
     * Returns a sample value that is deliberately NOT in NFC: each accented letter is
     * spelled as a base letter plus a combining mark (U+0301, U+0302, U+0308), so that
     * running {@link #format(String)} on it produces a visibly different, composed result.
     */
    @Override
    public String getExampleInput() {
        return "He\u0301llo\u0302 Wo\u0308rld";
    }

    /**
     * Normalizes the given value to NFC.
     *
     * @param value the text to normalize; must not be {@code null}
     * @return the NFC-normalized text
     * @throws NullPointerException if {@code value} is {@code null}
     */
    @Override
    public String format(String value) {
        Objects.requireNonNull(value);
        return Normalizer.normalize(value, Normalizer.Form.NFC);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
entryCheckers.addAll(List.of(
new ASCIICharacterChecker(),
new NoBibtexFieldChecker(),
new UnicodeNormalFormCCheck(),
new BibTeXEntryTypeChecker())
);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package org.jabref.logic.integrity;

import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.Field;

/**
* Detect any Unicode characters that is not in NFC format
koppor marked this conversation as resolved.
Show resolved Hide resolved
*/
public class UnicodeNormalFormCCheck implements EntryChecker {

@Override
public List<IntegrityMessage> check(BibEntry entry) {
List<IntegrityMessage> results = new ArrayList<>();
for (Map.Entry<Field, String> field : entry.getFieldMap().entrySet()) {
String normalizedString = Normalizer.normalize(field.getValue(), Normalizer.Form.NFC);
koppor marked this conversation as resolved.
Show resolved Hide resolved
if (!(field.getValue().equals(normalizedString))) {
results.add(new IntegrityMessage(Localization.lang("Value is not in Normal Form C (NFC) format"), entry,
field.getKey()));
}
}
return results;
}
}
1 change: 1 addition & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2636,4 +2636,5 @@ More\ options...=More options...
Treat\ all\ duplicates\ entries\ the\ same\ way=Treat all duplicates entries the same way
Ask\ every\ time=Ask every time

Value\ is\ not\ in\ Normal\ Form\ C\ (NFC)\ format=Value is not in Normal Form C (NFC) format
Group\ icons=Group icons
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.jabref.logic.integrity;

import java.util.Collections;
import java.util.List;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;

public class UnicodeNormalFormCCheckTest {
UnicodeNormalFormCCheck checker = new UnicodeNormalFormCCheck();
BibEntry entry = new BibEntry();

@Test
void checkWithNormalizedStringShouldReturnEmptyList() {
entry.setField(StandardField.TITLE, "Some Title");
entry.setField(StandardField.AUTHOR, "John Doe");

assertEquals(Collections.emptyList(), checker.check(entry));
}

@Test
void checkWithNonNormalizedStringShouldReturnIntegrityMessage() {
koppor marked this conversation as resolved.
Show resolved Hide resolved
entry.setField(StandardField.TITLE, "Café");
entry.setField(StandardField.AUTHOR, "John Doe");
koppor marked this conversation as resolved.
Show resolved Hide resolved

assertFalse(checker.check(entry).isEmpty());
koppor marked this conversation as resolved.
Show resolved Hide resolved
assertEquals(List.of(new IntegrityMessage("Value is not in Normal Form C (NFC) format", entry, StandardField.TITLE)), checker.check(entry));
}
}
Loading