diff --git a/CHANGELOG.md b/CHANGELOG.md index 8783e3c4ba0..3189f2c75c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv - We added the possibility to show the BibTeX source in the [web search](https://docs.jabref.org/collect/import-using-online-bibliographic-database) import screen. [#560](https://github.com/koppor/jabref/issues/560) - We added a fetcher for [ISIDORE](https://isidore.science/), simply paste in the link into the text field or the last 6 digits in the link that identify that paper. [#10423](https://github.com/JabRef/jabref/issues/10423) - When importing entries form the "Citation relations" tab, the field [cites](https://docs.jabref.org/advanced/entryeditor/entrylinks) is now filled according to the relationship between the entries. [#10572](https://github.com/JabRef/jabref/pull/10752) +- We added a new integrity check and a cleanup option for field values that are not in Unicode Normalization Form C (NFC). [#10506](https://github.com/JabRef/jabref/issues/10506) - We added a new group icon column to the main table showing the icons of the entry's groups. 
[#10801](https://github.com/JabRef/jabref/pull/10801) ### Changed diff --git a/src/main/java/org/jabref/logic/formatter/Formatters.java b/src/main/java/org/jabref/logic/formatter/Formatters.java index 82fb14932e8..bcc7c202295 100644 --- a/src/main/java/org/jabref/logic/formatter/Formatters.java +++ b/src/main/java/org/jabref/logic/formatter/Formatters.java @@ -22,6 +22,7 @@ import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizePagesFormatter; +import org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.OrdinalsToSuperscriptFormatter; import org.jabref.logic.formatter.bibtexfields.RegexFormatter; import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter; @@ -87,6 +88,7 @@ public static List getOthers() { new EscapeAmpersandsFormatter(), new EscapeDollarSignFormatter(), new ShortenDOIFormatter(), + new NormalizeUnicodeFormatter(), new ReplaceUnicodeLigaturesFormatter(), new UnprotectTermsFormatter() ); diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java new file mode 100644 index 00000000000..1bf00558ed6 --- /dev/null +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java @@ -0,0 +1,41 @@ +package org.jabref.logic.formatter.bibtexfields; + +import java.text.Normalizer; +import java.util.Objects; + +import org.jabref.logic.cleanup.Formatter; + +/** + * Cleans up field values by converting them to Unicode Normalization Form C (NFC) + */ +public class NormalizeUnicodeFormatter extends Formatter { + + @Override + public String getName() { + return "Normalize Unicode"; + } + + @Override + public String getKey() { + return "NORMALIZE_UNICODE"; + } + + @Override + public String getDescription() { + 
return "Normalize Unicode characters in BibTeX fields."; + } + + @Override + public String getExampleInput() { + return "He\u0301llo\u0302 Wo\u0308rld"; + } + + @Override + public String format(String value) { + Objects.requireNonNull(value); + + String normalizedValue = Normalizer.normalize(value, Normalizer.Form.NFC); + + return normalizedValue; + } +} diff --git a/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java b/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java index bcf87382a55..bd8659f3e50 100644 --- a/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java +++ b/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java @@ -52,6 +52,7 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext, entryCheckers.addAll(List.of( new ASCIICharacterChecker(), new NoBibtexFieldChecker(), + new UnicodeNormalFormCCheck(), new BibTeXEntryTypeChecker()) ); } diff --git a/src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCCheck.java b/src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCCheck.java new file mode 100644 index 00000000000..31a4e74bc8f --- /dev/null +++ b/src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCCheck.java @@ -0,0 +1,29 @@ +package org.jabref.logic.integrity; + +import java.text.Normalizer; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.jabref.logic.l10n.Localization; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.Field; + +/** + * Detects field values that are not in Unicode Normalization Form C (NFC) + */ +public class UnicodeNormalFormCCheck implements EntryChecker { + + @Override + public List<IntegrityMessage> check(BibEntry entry) { + List<IntegrityMessage> results = new ArrayList<>(); + for (Map.Entry<Field, String> field : entry.getFieldMap().entrySet()) { + String value = field.getValue(); + if (!Normalizer.isNormalized(value, Normalizer.Form.NFC)) { + results.add(new IntegrityMessage(Localization.lang("Value is not in Normal Form C 
(NFC) format"), entry, + field.getKey())); + } + } + return results; + } +} diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties index 46c4789916a..5d89fb8c12f 100644 --- a/src/main/resources/l10n/JabRef_en.properties +++ b/src/main/resources/l10n/JabRef_en.properties @@ -2636,4 +2636,5 @@ More\ options...=More options... Treat\ all\ duplicates\ entries\ the\ same\ way=Treat all duplicates entries the same way Ask\ every\ time=Ask every time +Value\ is\ not\ in\ Normal\ Form\ C\ (NFC)\ format=Value is not in Normal Form C (NFC) format Group\ icons=Group icons diff --git a/src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCCheckTest.java b/src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCCheckTest.java new file mode 100644 index 00000000000..24c8cde1004 --- /dev/null +++ b/src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCCheckTest.java @@ -0,0 +1,34 @@ +package org.jabref.logic.integrity; + +import java.util.Collections; +import java.util.List; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class UnicodeNormalFormCCheckTest { + UnicodeNormalFormCCheck checker = new UnicodeNormalFormCCheck(); + BibEntry entry = new BibEntry(); + + @Test + void checkWithNormalizedStringShouldReturnEmptyList() { + entry.setField(StandardField.TITLE, "Some Title"); + entry.setField(StandardField.AUTHOR, "John Doe"); + + assertEquals(Collections.emptyList(), checker.check(entry)); + } + + @Test + void checkWithNonNormalizedStringShouldReturnIntegrityMessage() { + entry.setField(StandardField.TITLE, "Cafe\u0301"); + entry.setField(StandardField.AUTHOR, "John Doe"); + + assertFalse(checker.check(entry).isEmpty()); + assertEquals(List.of(new IntegrityMessage("Value is not in Normal Form C (NFC) 
format", entry, StandardField.TITLE)), checker.check(entry)); + } +}