Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix latex2unicode and unicode2latex #6155

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,6 @@ application {
mainClassName = "$moduleName/org.jabref.JabRefLauncher"
}

// TODO: Ugly workaround to temporarily ignore build errors to dependencies of latex2unicode
// These should be removed, as well as the files in the lib folder, as soon as they have valid module names
patchModules.config = [
"test=fastparse_2.12-1.0.0.jar",
"test2=fastparse-utils_2.12-1.0.0.jar",
"test3=sourcecode_2.12-0.1.4.jar"
]

// These are the Java version requirements we will check on each start of JabRef
ext.minRequiredJavaVersion = "1.8.0_171"
ext.allowJava9 = true
Expand Down Expand Up @@ -183,10 +175,6 @@ dependencies {
implementation group: 'jakarta.xml.bind', name: 'jakarta.xml.bind-api', version: '2.3.2'
implementation group: 'org.glassfish.jaxb', name: 'jaxb-runtime', version: '2.3.2'

implementation ('com.github.tomtung:latex2unicode_2.12:0.2.6') {
exclude module: 'fastparse_2.12'
}

implementation group: 'com.microsoft.azure', name: 'applicationinsights-core', version: '2.4.1'
implementation (group: 'com.microsoft.azure', name: 'applicationinsights-logging-log4j2', version: '2.4.1') {
exclude module: "log4j-core"
Expand Down
1 change: 0 additions & 1 deletion src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
requires org.apache.pdfbox;
requires reactfx;
requires commons.cli;
requires com.github.tomtung.latex2unicode;
requires jbibtex;
requires citeproc.java;
requires antlr.runtime;
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/jabref/gui/texparser/CitationsDisplay.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import org.jabref.gui.icon.IconTheme;
import org.jabref.gui.util.ViewModelListCellFactory;
import org.jabref.model.strings.LatexToUnicodeAdapter;
import org.jabref.model.texparser.Citation;

public class CitationsDisplay extends ListView<Citation> {
Expand All @@ -44,7 +43,9 @@ private Node getDisplayGraphic(Citation item) {
}

Node citationIcon = IconTheme.JabRefIcons.LATEX_COMMENT.getGraphicNode();
Text contextText = new Text(LatexToUnicodeAdapter.format(item.getContext()));
// FIXME
Text contextText = null;
// Text contextText = new Text(LatexToUnicodeAdapter.format(item.getContext()));
contextText.wrappingWidthProperty().bind(this.widthProperty().subtract(85));
HBox contextBox = new HBox(8, citationIcon, contextText);
contextBox.getStyleClass().add("contextBox");
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import org.jabref.model.entry.Month;
import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.strings.LatexToUnicodeAdapter;

import de.undercouch.citeproc.CSL;
import de.undercouch.citeproc.DefaultAbbreviationProvider;
Expand Down Expand Up @@ -98,7 +97,8 @@ private static CSLItemData bibEntryToCSLItemData(BibEntry bibEntry) {
for (Field key : bibEntry.getFieldMap().keySet()) {
bibEntry.getField(key)
.map(removeNewlinesFormatter::format)
.map(LatexToUnicodeAdapter::format)
// FIXME
// .map(LatexToUnicodeAdapter::format)
.ifPresent(value -> {
if (StandardField.MONTH.equals(key)) {
// Change month from #mon# to mon because CSL does not support the former format
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ public class UnicodeToLatexFormatter extends Formatter implements LayoutFormatte
@Override
public String format(String text) {
String result = Objects.requireNonNull(text);

if (result.isEmpty()) {
return result;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
package org.jabref.logic.layout.format;

import java.util.Map;
import java.util.Objects;

import org.jabref.logic.l10n.Localization;
import org.jabref.logic.layout.LayoutFormatter;
import org.jabref.logic.util.strings.HTMLUnicodeConversionMaps;
import org.jabref.model.cleanup.Formatter;
import org.jabref.model.strings.LatexToUnicodeAdapter;

/**
* This formatter converts LaTeX character sequences to their equivalent Unicode characters,
Expand All @@ -22,8 +25,20 @@ public String getKey() {
}

@Override
public String format(String inField) {
return LatexToUnicodeAdapter.format(inField);
public String format(String text) {
String result = Objects.requireNonNull(text);

if (result.isEmpty()) {
return result;
}

// Standard symbols
for (Map.Entry<String, String> unicodeLatexPair : HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP
.entrySet()) {
result = result.replace(unicodeLatexPair.getValue(), unicodeLatexPair.getKey());
}

return result;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,33 @@ public class HTMLUnicodeConversionMaps {
conforming SGML systems and applications as defined in
ISO 8879, provided this notice is included in all copies.
*/

// as well as http://www.w3.org/Math/characters/unicode.xml
// An array of arrays of strings in the format:
// {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"}
// Leaving a field empty is OK as it then will not be included
private static final String[][] CONVERSION_LIST = new String[][] {{"160", "nbsp", "{~}"}, // no-break space = non-breaking space,

/**
* We need a lookup table because the Unicode table does not follow a simple scheme.
* For instance, there is no "a with dot below" next to "b with dot below" (U+1E05); it lives elsewhere in the table (U+1EA1).
* See https://www.utf8-chartable.de/unicode-utf8-table.pl
*
* An array of arrays of strings in the format:
* {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"}
* Leaving a field empty is OK as it then will not be included.
*
* The aim of this format is to make adding data easy for a developer.
* It is not possible to create a multi-dimensional array of different content types.
* When creating an enum (e.g., <code>Inverted_Exclamation_Mark(161, "iexcl", "{\\textexclamdown}");</code>), one needs to assign a name to each entry, which is unnecessary overhead.
*
* We need to keep the triple together as HTML encoding closely relates to LaTeX encoding - and we want to support LaTeX to HTML as well as LaTeX to Unicode
*
* Mappings from unicode to latex, unicode to HTML, HTML to unicode, ... are generated based on these entries.
*
* Helper scripts to generate entries:
*
* - copy table from https://www.utf8-chartable.de/unicode-utf8-table.pl to input.txt
* - grep "DOT BELOW" input.txt > input-dot-below.txt
* - sed "s#..\(....\)..............\(LATIN SMALL LETTER \(.\).*\)#{\"0x\1\", \"\", \"\\\\\\\\d{\L\3}\"}, // \2#" input-dot-below.txt | grep {
*/
private static final String[][] CONVERSION_LIST = new String[][] {
{"160", "nbsp", "{~}"}, // no-break space = non-breaking space,
// U+00A0 ISOnum
{"161", "iexcl", "{\\textexclamdown}"}, // inverted exclamation mark, U+00A1 ISOnum
{"162", "cent", "{\\textcent}"}, // cent sign, U+00A2 ISOnum
Expand Down Expand Up @@ -293,6 +314,70 @@ public class HTMLUnicodeConversionMaps {
// U+03D2 NEW
{"982", "piv", "$\\varphi$"}, // greek pi symbol, U+03D6 ISOgrk3

// Dot Below
{"7717", "", "\\d{h}"}, // ḥ, https://unicode-table.com/de/1E25/
{"7751", "", "\\d{n}"}, // ṇ, https://unicode-table.com/de/1E47/

{"0x1E05", "", "\\d{b}"}, // latin small letter b with dot below
{"0x1E0D", "", "\\d{d}"}, // latin small letter d with dot below
{"0x1E25", "", "\\d{h}"}, // latin small letter h with dot below
{"0x1E33", "", "\\d{k}"}, // latin small letter k with dot below
{"0x1E37", "", "\\d{l}"}, // latin small letter l with dot below
{"0x1E39", "", "\\d{l}"}, // latin small letter l with dot below and macron
{"0x1E43", "", "\\d{m}"}, // latin small letter m with dot below
{"0x1E47", "", "\\d{n}"}, // latin small letter n with dot below
{"0x1E5B", "", "\\d{r}"}, // latin small letter r with dot below
{"0x1E5D", "", "\\d{r}"}, // latin small letter r with dot below and macron
{"0x1E63", "", "\\d{s}"}, // latin small letter s with dot below
{"0x1E69", "", "\\d{s}"}, // latin small letter s with dot below and dot above
{"0x1E6D", "", "\\d{t}"}, // latin small letter t with dot below
{"0x1E7F", "", "\\d{v}"}, // latin small letter v with dot below
{"0x1E89", "", "\\d{w}"}, // latin small letter w with dot below
{"0x1E93", "", "\\d{z}"}, // latin small letter z with dot below
{"0x1EA1", "", "\\d{a}"}, // latin small letter a with dot below
{"0x1EAD", "", "\\d{a}"}, // latin small letter a with circumflex and dot below
{"0x1EB7", "", "\\d{a}"}, // latin small letter a with breve and dot below
{"0x1EB9", "", "\\d{e}"}, // latin small letter e with dot below
{"0x1EC7", "", "\\d{e}"}, // latin small letter e with circumflex and dot below
{"0x1ECB", "", "\\d{i}"}, // latin small letter i with dot below
{"0x1ECD", "", "\\d{o}"}, // latin small letter o with dot below
{"0x1ED9", "", "\\d{o}"}, // latin small letter o with circumflex and dot below
{"0x1EE3", "", "\\d{o}"}, // latin small letter o with horn and dot below
{"0x1EE5", "", "\\d{u}"}, // latin small letter u with dot below
{"0x1EF1", "", "\\d{u}"}, // latin small letter u with horn and dot below
{"0x1EF5", "", "\\d{y}"}, // latin small letter y with dot below

// TODO macrons and dots above --> special cases

{"0x1E04", "", "\\d{B}"}, // LATIN CAPITAL LETTER B WITH DOT BELOW
{"0x1E0C", "", "\\d{D}"}, // LATIN CAPITAL LETTER D WITH DOT BELOW
{"0x1E24", "", "\\d{H}"}, // LATIN CAPITAL LETTER H WITH DOT BELOW
{"0x1E32", "", "\\d{K}"}, // LATIN CAPITAL LETTER K WITH DOT BELOW
{"0x1E36", "", "\\d{L}"}, // LATIN CAPITAL LETTER L WITH DOT BELOW
{"0x1E38", "", "\\d{L}"}, // LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON
{"0x1E42", "", "\\d{M}"}, // LATIN CAPITAL LETTER M WITH DOT BELOW
{"0x1E46", "", "\\d{N}"}, // LATIN CAPITAL LETTER N WITH DOT BELOW
{"0x1E5A", "", "\\d{R}"}, // LATIN CAPITAL LETTER R WITH DOT BELOW
{"0x1E5C", "", "\\d{R}"}, // LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON
{"0x1E62", "", "\\d{S}"}, // LATIN CAPITAL LETTER S WITH DOT BELOW
{"0x1E68", "", "\\d{S}"}, // LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE
{"0x1E6C", "", "\\d{T}"}, // LATIN CAPITAL LETTER T WITH DOT BELOW
{"0x1E7E", "", "\\d{V}"}, // LATIN CAPITAL LETTER V WITH DOT BELOW
{"0x1E88", "", "\\d{W}"}, // LATIN CAPITAL LETTER W WITH DOT BELOW
{"0x1E92", "", "\\d{Z}"}, // LATIN CAPITAL LETTER Z WITH DOT BELOW
{"0x1EA0", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH DOT BELOW
{"0x1EAC", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW
{"0x1EB6", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW
{"0x1EB8", "", "\\d{E}"}, // LATIN CAPITAL LETTER E WITH DOT BELOW
{"0x1EC6", "", "\\d{E}"}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
{"0x1ECA", "", "\\d{I}"}, // LATIN CAPITAL LETTER I WITH DOT BELOW
{"0x1ECC", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH DOT BELOW
{"0x1ED8", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW
{"0x1EE2", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW
{"0x1EE4", "", "\\d{U}"}, // LATIN CAPITAL LETTER U WITH DOT BELOW
{"0x1EF0", "", "\\d{U}"}, // LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW
{"0x1EF4", "", "\\d{Y}"}, // LATIN CAPITAL LETTER Y WITH DOT BELOW

/* General Punctuation */
{"8211", "ndash", "$\\textendash$"},
{"8212", "mdash", "$\\textemdash$"},
Expand Down Expand Up @@ -764,8 +849,13 @@ public class HTMLUnicodeConversionMaps {

};

// List of combining accents
private static final String[][] ACCENT_LIST = new String[][] {{"768", "`"}, // Grave
/**
* List of combining accents
*
* See https://de.wikibooks.org/wiki/LaTeX/_Akzente_und_Sonderzeichen for the LaTeX commands
*/
private static final String[][] ACCENT_LIST = new String[][] {
{"768", "`"}, // Grave
{"769", "'"}, // Acute
{"770", "^"}, // Circumflex
{"771", "~"}, // Tilde
Expand Down
4 changes: 0 additions & 4 deletions src/main/java/org/jabref/model/cleanup/CleanupJob.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,5 @@

/**
 * A single cleanup operation that is applied to one {@link BibEntry}.
 * Declared as a functional interface, so an implementation can be supplied as a lambda or method reference.
 */
@FunctionalInterface
public interface CleanupJob {

/**
 * Cleanup the entry.
 *
 * @param entry the entry to clean up
 * @return the {@link FieldChange}s reported by this job
 *         (presumably one per field the job modified; confirm against the implementations)
 */
List<FieldChange> cleanup(BibEntry entry);
}
5 changes: 3 additions & 2 deletions src/main/java/org/jabref/model/entry/BibEntry.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.IEEETranEntryType;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.model.strings.LatexToUnicodeAdapter;
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.MultiKeyMap;

Expand Down Expand Up @@ -883,7 +882,9 @@ public Optional<String> getLatexFreeField(Field field) {
} else {
Optional<String> fieldValue = getField(field);
if (fieldValue.isPresent()) {
String latexFreeValue = LatexToUnicodeAdapter.format(fieldValue.get()).intern();
// FIXME
// String latexFreeValue = LatexToUnicodeAdapter.format(fieldValue.get()).intern();
String latexFreeValue = fieldValue.get();
latexFreeFields.put(field, latexFreeValue);
return Optional.of(latexFreeValue);
} else {
Expand Down
27 changes: 0 additions & 27 deletions src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,24 @@ void formatWithoutUnicodeCharactersReturnsSameString() {
assertEquals("abc", formatter.format("abc"));
}

/**
 * A lone "a with macron" must be rendered as the braced LaTeX macron accent command.
 */
@Test
void formatOfMacronAIsCorrect() {
    String expectedLatex = "{\\={a}}";
    String actualLatex = formatter.format("ā");
    assertEquals(expectedLatex, actualLatex);
}

/**
 * Several accented characters in a row must each be converted, in order, to their LaTeX commands.
 */
@Test
void formatMultipleUnicodeCharacters() {
    String accentedLetters = "\u00E5\u00E4\u00F6";
    String converted = formatter.format(accentedLetters);
    assertEquals("{{\\aa}}{\\\"{a}}{\\\"{o}}", converted);
}

/**
 * Converts a transliterated Sanskrit phrase that mixes macrons, dotless i and dot-below letters.
 */
@Test
void testSanskrit() {
    // NOTE(review): the input already contains the LaTeX command \d{n} instead of the character ṇ,
    // and the expected value still contains the unconverted characters ṭ, ṛ, ḥ and ṃ.
    // Confirm whether this is intended or whether both strings should be adjusted.
    String input = "Pu\\d{n}ya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ";
    String expected = "Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}ṭh{\\={a}}dhi-kṛtaiḥ pr{\\={a}}-ka{{\\'{s}}}yaṃ n{\\i{\\={}}}taḥ";
    assertEquals(expected, formatter.format(input));
}

/**
 * The formatter's own example input must convert to the documented LaTeX form.
 */
@Test
void formatExample() {
    String exampleInput = formatter.getExampleInput();
    String converted = formatter.format(exampleInput);
    assertEquals("M{\\\"{o}}nch", converted);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void preserveUnknownCommand() {

/**
 * {@code \textit{...}} must be rendered with the expected supplementary-plane characters
 * (written as surrogate pairs below).
 * See https://github.com/JabRef/jabref/pull/1464
 */
@Test
void testFormatTextit() {
    String expectedItalic = "\uD835\uDC61\uD835\uDC52\uD835\uDC65\uD835\uDC61";
    String actualItalic = formatter.format("\\textit{text}");
    assertEquals(expectedItalic, actualItalic);
}

Expand Down Expand Up @@ -187,4 +187,9 @@ void testConversionOfOrdinal4th() {
void testConversionOfOrdinal9th() {
assertEquals("9ᵗʰ", formatter.format("9\\textsuperscript{th}"));
}

/**
 * Converts a LaTeX-encoded Sanskrit phrase (macrons, dotless i, dot-below letters) to Unicode.
 */
@Test
void testSanskrit() {
    String latexInput = "Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}\\d{t}h{\\={a}}dhi-k\\d{r}tai\\d{h} pr{\\={a}}-ka{{\\'{s}}}ya\\d{m} n{\\i{\\={}}}ta\\d{h}";
    String expectedUnicode = "Puṇya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ";
    assertEquals(expectedUnicode, formatter.format(latexInput));
}
}