JabRef · koppor · Mar 22, 2020 · Mar 22, 2020 · Mar 22, 2020 · Mar 22, 2020
diff --git a/build.gradle b/build.gradle
@@ -48,14 +48,6 @@ application {
     mainClassName = "$moduleName/org.jabref.JabRefLauncher"
 }
 
-// TODO: Ugly workaround to temporarily ignore build errors to dependencies of latex2unicode
-// These should be removed, as well as the files in the lib folder, as soon as they have valid module names
-patchModules.config = [
-        "test=fastparse_2.12-1.0.0.jar",
-        "test2=fastparse-utils_2.12-1.0.0.jar",
-        "test3=sourcecode_2.12-0.1.4.jar"
-]
-
 // These are the Java version requirements we will check on each start of JabRef
 ext.minRequiredJavaVersion = "1.8.0_171"
 ext.allowJava9 = true
@@ -183,10 +175,6 @@ dependencies {
     implementation group: 'jakarta.xml.bind', name: 'jakarta.xml.bind-api', version: '2.3.2'
     implementation group: 'org.glassfish.jaxb', name: 'jaxb-runtime', version: '2.3.2'
 
-    implementation ('com.github.tomtung:latex2unicode_2.12:0.2.6') {
-        exclude module: 'fastparse_2.12'
-    }
-
     implementation group: 'com.microsoft.azure', name: 'applicationinsights-core', version: '2.4.1'
     implementation (group: 'com.microsoft.azure', name: 'applicationinsights-logging-log4j2', version: '2.4.1') {
         exclude module: "log4j-core"

diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
@@ -56,7 +56,6 @@
     requires org.apache.pdfbox;
     requires reactfx;
     requires commons.cli;
-    requires com.github.tomtung.latex2unicode;
     requires jbibtex;
     requires citeproc.java;
     requires antlr.runtime;

diff --git a/src/main/java/org/jabref/gui/texparser/CitationsDisplay.java b/src/main/java/org/jabref/gui/texparser/CitationsDisplay.java
@@ -18,7 +18,6 @@
 
 import org.jabref.gui.icon.IconTheme;
 import org.jabref.gui.util.ViewModelListCellFactory;
-import org.jabref.model.strings.LatexToUnicodeAdapter;
 import org.jabref.model.texparser.Citation;
 
 public class CitationsDisplay extends ListView<Citation> {
@@ -44,7 +43,9 @@ private Node getDisplayGraphic(Citation item) {
         }
 
         Node citationIcon = IconTheme.JabRefIcons.LATEX_COMMENT.getGraphicNode();
-        Text contextText = new Text(LatexToUnicodeAdapter.format(item.getContext()));
+        // FIXME: the context is shown unconverted for now; re-enable the LaTeX-to-Unicode formatting below once a replacement for LatexToUnicodeAdapter is available
+        // Text contextText = new Text(LatexToUnicodeAdapter.format(item.getContext()));
+        Text contextText = new Text(item.getContext());
         contextText.wrappingWidthProperty().bind(this.widthProperty().subtract(85));
         HBox contextBox = new HBox(8, citationIcon, contextText);
         contextBox.getStyleClass().add("contextBox");

diff --git a/src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java b/src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java
@@ -11,7 +11,6 @@
 import org.jabref.model.entry.Month;
 import org.jabref.model.entry.field.Field;
 import org.jabref.model.entry.field.StandardField;
-import org.jabref.model.strings.LatexToUnicodeAdapter;
 
 import de.undercouch.citeproc.CSL;
 import de.undercouch.citeproc.DefaultAbbreviationProvider;
@@ -98,7 +97,8 @@ private static CSLItemData bibEntryToCSLItemData(BibEntry bibEntry) {
             for (Field key : bibEntry.getFieldMap().keySet()) {
                 bibEntry.getField(key)
                         .map(removeNewlinesFormatter::format)
-                        .map(LatexToUnicodeAdapter::format)
+                        // FIXME: LaTeX-to-Unicode conversion of the field value is disabled because this change drops LatexToUnicodeAdapter; restore the mapping step below once a replacement exists
+                        // .map(LatexToUnicodeAdapter::format)
                         .ifPresent(value -> {
                             if (StandardField.MONTH.equals(key)) {
                                 // Change month from #mon# to mon because CSL does not support the former format

diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java
@@ -18,7 +18,6 @@ public class UnicodeToLatexFormatter extends Formatter implements LayoutFormatte
     @Override
     public String format(String text) {
         String result = Objects.requireNonNull(text);
-
         if (result.isEmpty()) {
             return result;
         }

diff --git a/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java b/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java
@@ -1,9 +1,12 @@
 package org.jabref.logic.layout.format;
 
+import java.util.Map;
+import java.util.Objects;
+
 import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.layout.LayoutFormatter;
+import org.jabref.logic.util.strings.HTMLUnicodeConversionMaps;
 import org.jabref.model.cleanup.Formatter;
-import org.jabref.model.strings.LatexToUnicodeAdapter;
 
 /**
  * This formatter converts LaTeX character sequences their equivalent unicode characters,
@@ -22,8 +25,20 @@ public String getKey() {
     }
 
     @Override
-    public String format(String inField) {
-        return LatexToUnicodeAdapter.format(inField);
+    public String format(String text) {
+        String result = Objects.requireNonNull(text);
+
+        if (result.isEmpty()) {
+            return result;
+        }
+
+        // Reverse lookup: every occurrence of a map value (the LaTeX form) is replaced by its map key (presumably the Unicode character - confirm against HTMLUnicodeConversionMaps)
+        for (Map.Entry<String, String> unicodeLatexPair : HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP
+                .entrySet()) {
+            result = result.replace(unicodeLatexPair.getValue(), unicodeLatexPair.getKey());
+        }
+
+        return result;
     }
 
     @Override

diff --git a/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java b/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java
@@ -22,12 +22,33 @@ public class HTMLUnicodeConversionMaps {
      conforming SGML systems and applications as defined in
      ISO 8879, provided this notice is included in all copies.
      */
-
     // as well as http://www.w3.org/Math/characters/unicode.xml
-    // An array of arrays of strings in the format:
-    // {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"}
-    // Leaving a field empty is OK as it then will not be included
-    private static final String[][] CONVERSION_LIST = new String[][] {{"160", "nbsp", "{~}"}, // no-break space = non-breaking space,
+
+    /**
+     * We need a lookup table, because the Unicode code points do not follow an easy scheme.
+     * For instance, a precomposed character with a dot below exists for b, but not for c.
+     * See https://www.utf8-chartable.de/unicode-utf8-table.pl
+     *
+     * An array of arrays of strings in the format:
+     * {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"}
+     * Leaving a field empty is OK as it then will not be included.
+     *
+     * The aim of this format is to make it easy for a developer to add data.
+     * It is not possible to create a multi-dimensional array of different content types.
+     * When creating an enum (e.g., <code>Inverted_Exclamation_Mark(161, "iexcl", "{\\textexclamdown}");</code>), one needs to assign a name to each entry. This is unnecessary overhead.
+     *
+     * We need to keep the triple together as HTML encoding closely relates to LaTeX encoding - and we want to support LaTeX to HTML as well as LaTeX to Unicode
+     *
+     * Mappings from unicode to latex, unicode to HTML, HTML to unicode, ... are generated based on these entries.
+     *
+     * Helper scripts to generate entries:
+     *
+     *   - copy table from https://www.utf8-chartable.de/unicode-utf8-table.pl to input.txt
+     *   -  grep "DOT BELOW" input.txt > input-dot-below.txt
+     *   - sed "s#..\(....\)..............\(LATIN SMALL LETTER \(.\).*\)#{\"0x\1\", \"\", \"\\\\\\\\d{\L\3}\"}, // \2#" input-dot-below.txt  | grep {
+     */
+    private static final String[][] CONVERSION_LIST = new String[][] {
+            {"160", "nbsp", "{~}"}, // no-break space = non-breaking space,
             //                                 U+00A0 ISOnum
             {"161", "iexcl", "{\\textexclamdown}"}, // inverted exclamation mark, U+00A1 ISOnum
             {"162", "cent", "{\\textcent}"}, // cent sign, U+00A2 ISOnum
@@ -293,6 +314,70 @@ public class HTMLUnicodeConversionMaps {
             //                                   U+03D2 NEW
             {"982", "piv", "$\\varphi$"}, // greek pi symbol, U+03D6 ISOgrk3
 
+            // Dot Below
+            {"7717", "", "\\d{h}"}, // ḥ, https://unicode-table.com/de/1E25/
+            {"7751", "", "\\d{n}"}, // ṇ, https://unicode-table.com/de/1E47/
+
+            {"0x1E05", "", "\\d{b}"}, // latin small letter b with dot below
+            {"0x1E0D", "", "\\d{d}"}, // latin small letter d with dot below
+            {"0x1E25", "", "\\d{h}"}, // latin small letter h with dot below
+            {"0x1E33", "", "\\d{k}"}, // latin small letter k with dot below
+            {"0x1E37", "", "\\d{l}"}, // latin small letter l with dot below
+            {"0x1E39", "", "\\d{l}"}, // latin small letter l with dot below and macron
+            {"0x1E43", "", "\\d{m}"}, // latin small letter m with dot below
+            {"0x1E47", "", "\\d{n}"}, // latin small letter n with dot below
+            {"0x1E5B", "", "\\d{r}"}, // latin small letter r with dot below
+            {"0x1E5D", "", "\\d{r}"}, // latin small letter r with dot below and macron
+            {"0x1E63", "", "\\d{s}"}, // latin small letter s with dot below
+            {"0x1E69", "", "\\d{s}"}, // latin small letter s with dot below and dot above
+            {"0x1E6D", "", "\\d{t}"}, // latin small letter t with dot below
+            {"0x1E7F", "", "\\d{v}"}, // latin small letter v with dot below
+            {"0x1E89", "", "\\d{w}"}, // latin small letter w with dot below
+            {"0x1E93", "", "\\d{z}"}, // latin small letter z with dot below
+            {"0x1EA1", "", "\\d{a}"}, // latin small letter a with dot below
+            {"0x1EAD", "", "\\d{a}"}, // latin small letter a with circumflex and dot below
+            {"0x1EB7", "", "\\d{a}"}, // latin small letter a with breve and dot below
+            {"0x1EB9", "", "\\d{e}"}, // latin small letter e with dot below
+            {"0x1EC7", "", "\\d{e}"}, // latin small letter e with circumflex and dot below
+            {"0x1ECB", "", "\\d{i}"}, // latin small letter i with dot below
+            {"0x1ECD", "", "\\d{o}"}, // latin small letter o with dot below
+            {"0x1ED9", "", "\\d{o}"}, // latin small letter o with circumflex and dot below
+            {"0x1EE3", "", "\\d{o}"}, // latin small letter o with horn and dot below
+            {"0x1EE5", "", "\\d{u}"}, // latin small letter u with dot below
+            {"0x1EF1", "", "\\d{u}"}, // latin small letter u with horn and dot below
+            {"0x1EF5", "", "\\d{y}"}, // latin small letter y with dot below
+
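+            // TODO: letters combining a dot below with a second diacritic (macron, dot above, circumflex, breve, horn) reuse the plain "\\d{x}" command of their base letter; they are special cases that need distinct LaTeX commands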
+
+            {"0x1E04", "", "\\d{B}"}, // LATIN CAPITAL LETTER B WITH DOT BELOW
+            {"0x1E0C", "", "\\d{D}"}, // LATIN CAPITAL LETTER D WITH DOT BELOW
+            {"0x1E24", "", "\\d{H}"}, // LATIN CAPITAL LETTER H WITH DOT BELOW
+            {"0x1E32", "", "\\d{K}"}, // LATIN CAPITAL LETTER K WITH DOT BELOW
+            {"0x1E36", "", "\\d{L}"}, // LATIN CAPITAL LETTER L WITH DOT BELOW
+            {"0x1E38", "", "\\d{L}"}, // LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON
+            {"0x1E42", "", "\\d{M}"}, // LATIN CAPITAL LETTER M WITH DOT BELOW
+            {"0x1E46", "", "\\d{N}"}, // LATIN CAPITAL LETTER N WITH DOT BELOW
+            {"0x1E5A", "", "\\d{R}"}, // LATIN CAPITAL LETTER R WITH DOT BELOW
+            {"0x1E5C", "", "\\d{R}"}, // LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON
+            {"0x1E62", "", "\\d{S}"}, // LATIN CAPITAL LETTER S WITH DOT BELOW
+            {"0x1E68", "", "\\d{S}"}, // LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE
+            {"0x1E6C", "", "\\d{T}"}, // LATIN CAPITAL LETTER T WITH DOT BELOW
+            {"0x1E7E", "", "\\d{V}"}, // LATIN CAPITAL LETTER V WITH DOT BELOW
+            {"0x1E88", "", "\\d{W}"}, // LATIN CAPITAL LETTER W WITH DOT BELOW
+            {"0x1E92", "", "\\d{Z}"}, // LATIN CAPITAL LETTER Z WITH DOT BELOW
+            {"0x1EA0", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH DOT BELOW
+            {"0x1EAC", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW
+            {"0x1EB6", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW
+            {"0x1EB8", "", "\\d{E}"}, // LATIN CAPITAL LETTER E WITH DOT BELOW
+            {"0x1EC6", "", "\\d{E}"}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
+            {"0x1ECA", "", "\\d{I}"}, // LATIN CAPITAL LETTER I WITH DOT BELOW
+            {"0x1ECC", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH DOT BELOW
+            {"0x1ED8", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW
+            {"0x1EE2", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW
+            {"0x1EE4", "", "\\d{U}"}, // LATIN CAPITAL LETTER U WITH DOT BELOW
+            {"0x1EF0", "", "\\d{U}"}, // LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW
+            {"0x1EF4", "", "\\d{Y}"}, // LATIN CAPITAL LETTER Y WITH DOT BELOW
+
             /* General Punctuation */
             {"8211", "ndash", "$\\textendash$"},
             {"8212", "mdash", "$\\textemdash$"},
@@ -764,8 +849,13 @@ public class HTMLUnicodeConversionMaps {
 
     };
 
-    // List of combining accents
-    private static final String[][] ACCENT_LIST = new String[][] {{"768", "`"}, // Grave
+    /**
+     * List of combining accents
+     *
+     * See https://de.wikibooks.org/wiki/LaTeX/_Akzente_und_Sonderzeichen for the LaTeX commands
+     */
+    private static final String[][] ACCENT_LIST = new String[][] {
+            {"768", "`"}, // Grave
             {"769", "'"}, // Acute
             {"770", "^"}, // Circumflex
             {"771", "~"}, // Tilde

diff --git a/src/main/java/org/jabref/model/cleanup/CleanupJob.java b/src/main/java/org/jabref/model/cleanup/CleanupJob.java
@@ -8,8 +8,10 @@
 @FunctionalInterface
 public interface CleanupJob {
-
     /**
-     * Cleanup the entry.
+     * Cleans up the given entry.
+     *
+     * @param entry the entry to clean up
+     * @return a list of {@link FieldChange}s (presumably the modifications applied to the entry; confirm against the implementations)
      */
     List<FieldChange> cleanup(BibEntry entry);
 }
diff --git a/src/main/java/org/jabref/model/entry/BibEntry.java b/src/main/java/org/jabref/model/entry/BibEntry.java
@@ -35,7 +35,6 @@
 import org.jabref.model.entry.types.EntryType;
 import org.jabref.model.entry.types.IEEETranEntryType;
 import org.jabref.model.entry.types.StandardEntryType;
-import org.jabref.model.strings.LatexToUnicodeAdapter;
 import org.jabref.model.strings.StringUtil;
 import org.jabref.model.util.MultiKeyMap;
 
@@ -883,7 +882,9 @@ public Optional<String> getLatexFreeField(Field field) {
         } else {
             Optional<String> fieldValue = getField(field);
             if (fieldValue.isPresent()) {
-                String latexFreeValue = LatexToUnicodeAdapter.format(fieldValue.get()).intern();
+                // FIXME: the raw field value is cached and returned for now - LaTeX is no longer stripped and the value is no longer interned; restore the line below once a replacement for LatexToUnicodeAdapter exists
+                // String latexFreeValue = LatexToUnicodeAdapter.format(fieldValue.get()).intern();
+                String latexFreeValue = fieldValue.get();
                 latexFreeFields.put(field, latexFreeValue);
                 return Optional.of(latexFreeValue);
             } else {

diff --git a/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java b/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java
diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java
@@ -19,13 +19,24 @@ void formatWithoutUnicodeCharactersReturnsSameString() {
         assertEquals("abc", formatter.format("abc"));
     }
 
+    @Test
+    void formatOfMacronAIsCorrect() {
+        assertEquals("{\\={a}}", formatter.format("ā"));
+    }
+
     @Test
     void formatMultipleUnicodeCharacters() {
         assertEquals("{{\\aa}}{\\\"{a}}{\\\"{o}}", formatter.format("\u00E5\u00E4\u00F6"));
     }
 
+    @Test
+    void testSanskrit() {
+        assertEquals("Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}ṭh{\\={a}}dhi-kṛtaiḥ pr{\\={a}}-ka{{\\'{s}}}yaṃ n{\\i{\\={}}}taḥ", formatter.format("Pu\\d{n}ya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ"));
+    }
+
     @Test
     void formatExample() {
         assertEquals("M{\\\"{o}}nch", formatter.format(formatter.getExampleInput()));
     }
+
 }
diff --git a/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java b/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java
@@ -27,7 +27,7 @@ void preserveUnknownCommand() {
 
     @Test
     void testFormatTextit() {
-        // See #1464
+        // See https://github.com/JabRef/jabref/pull/1464
         assertEquals("\uD835\uDC61\uD835\uDC52\uD835\uDC65\uD835\uDC61", formatter.format("\\textit{text}"));
     }
 
@@ -187,4 +187,9 @@ void testConversionOfOrdinal4th() {
     void testConversionOfOrdinal9th() {
         assertEquals("9ᵗʰ", formatter.format("9\\textsuperscript{th}"));
     }
+
+    @Test
+    void testSanskrit() {
+        assertEquals("Puṇya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ", formatter.format("Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}\\d{t}h{\\={a}}dhi-k\\d{r}tai\\d{h} pr{\\={a}}-ka{{\\'{s}}}ya\\d{m} n{\\i{\\={}}}ta\\d{h}"));
+    }
 }