Skip to content

Commit

Permalink
LUCENE-10400: revise binary dictionaries' constructor in nori (#693)
Browse files Browse the repository at this point in the history
  • Loading branch information
mocobeta committed Feb 20, 2022
1 parent fcc384f commit e3a7d27
Show file tree
Hide file tree
Showing 9 changed files with 292 additions and 123 deletions.
2 changes: 1 addition & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ API Changes
* LUCENE-10368: IntTaxonomyFacets has been deprecated and is no longer a supported extension point
for user-created faceting implementations. (Greg Miller)

* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji:
* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji and Nori:
ConnectionCosts, TokenInfoDictionary, and UnknownDictionary. Old constructors that take resource scheme and
resource path in those classes are deprecated; these are replaced by the new constructors and are planned to be
removed in a future release. (Tomoko Uchida, Uwe Schindler, Mike Sokolov)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,23 @@

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IntsRef;

/** Base class for a binary-encoded in-memory dictionary. */
public abstract class BinaryDictionary implements Dictionary {

/** Used to specify where (dictionary) resources get loaded from. */
@Deprecated(forRemoval = true, since = "9.1")
public enum ResourceScheme {
CLASSPATH,
FILE
Expand All @@ -51,75 +49,36 @@ public enum ResourceScheme {
public static final String POSDICT_HEADER = "ko_dict_pos";
public static final int VERSION = 1;

private final ResourceScheme resourceScheme;
private final String resourcePath;
private final ByteBuffer buffer;
private final int[] targetMapOffsets, targetMap;
private final POS.Tag[] posDict;

protected BinaryDictionary() throws IOException {
this(ResourceScheme.CLASSPATH, null);
}

/**
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
* scheme only, use this class's name as the path.
*/
protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath)
protected BinaryDictionary(
IOSupplier<InputStream> targetMapResource,
IOSupplier<InputStream> posResource,
IOSupplier<InputStream> dictResource)
throws IOException {
this.resourceScheme = resourceScheme;
if (resourcePath == null) {
if (resourceScheme != ResourceScheme.CLASSPATH) {
throw new IllegalArgumentException(
"resourcePath must be supplied with FILE resource scheme");
}
this.resourcePath = getClass().getSimpleName();
} else {
if (resourceScheme == ResourceScheme.CLASSPATH && !resourcePath.startsWith("/")) {
resourcePath = "/".concat(resourcePath);
}
this.resourcePath = resourcePath;
}
int[] targetMapOffsets, targetMap;
ByteBuffer buffer;
try (InputStream mapIS = new BufferedInputStream(getResource(TARGETMAP_FILENAME_SUFFIX));
InputStream posIS = new BufferedInputStream(getResource(POSDICT_FILENAME_SUFFIX));
// no buffering here, as we load in one large buffer
InputStream dictIS = getResource(DICT_FILENAME_SUFFIX)) {
try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
DataInput in = new InputStreamDataInput(mapIS);
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
targetMap = new int[in.readVInt()];
targetMapOffsets = new int[in.readVInt()];
int accum = 0, sourceId = 0;
for (int ofs = 0; ofs < targetMap.length; ofs++) {
final int val = in.readVInt();
if ((val & 0x01) != 0) {
targetMapOffsets[sourceId] = ofs;
sourceId++;
}
accum += val >>> 1;
targetMap[ofs] = accum;
}
if (sourceId + 1 != targetMapOffsets.length)
throw new IOException(
"targetMap file format broken; targetMap.length="
+ targetMap.length
+ ", targetMapOffsets.length="
+ targetMapOffsets.length
+ ", sourceId="
+ sourceId);
targetMapOffsets[sourceId] = targetMap.length;
this.targetMap = new int[in.readVInt()];
this.targetMapOffsets = new int[in.readVInt()];
populateTargetMap(in, this.targetMap, this.targetMapOffsets);
}

in = new InputStreamDataInput(posIS);
try (InputStream posIS = new BufferedInputStream(posResource.get())) {
DataInput in = new InputStreamDataInput(posIS);
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
int posSize = in.readVInt();
posDict = new POS.Tag[posSize];
this.posDict = new POS.Tag[posSize];
for (int j = 0; j < posSize; j++) {
posDict[j] = POS.resolveTag(in.readByte());
}
}

in = new InputStreamDataInput(dictIS);
// no buffering here, as we load in one large buffer
try (InputStream dictIS = dictResource.get()) {
DataInput in = new InputStreamDataInput(dictIS);
CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
final int size = in.readVInt();
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
Expand All @@ -128,48 +87,31 @@ protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath)
if (read != size) {
throw new EOFException("Cannot read whole dictionary");
}
buffer = tmpBuffer.asReadOnlyBuffer();
}

this.targetMap = targetMap;
this.targetMapOffsets = targetMapOffsets;
this.buffer = buffer;
}

protected final InputStream getResource(String suffix) throws IOException {
switch (resourceScheme) {
case CLASSPATH:
return getClassResource(resourcePath + suffix);
case FILE:
return Files.newInputStream(Paths.get(resourcePath + suffix));
default:
throw new IllegalStateException("unknown resource scheme " + resourceScheme);
}
}

public static InputStream getResource(ResourceScheme scheme, String path) throws IOException {
switch (scheme) {
case CLASSPATH:
return getClassResource(path);
case FILE:
return Files.newInputStream(Paths.get(path));
default:
throw new IllegalStateException("unknown resource scheme " + scheme);
this.buffer = tmpBuffer.asReadOnlyBuffer();
}
}

// util, reused by ConnectionCosts and CharacterDefinition
public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
if (is == null) {
throw new FileNotFoundException(
"Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
throws IOException {
int accum = 0, sourceId = 0;
for (int ofs = 0; ofs < targetMap.length; ofs++) {
final int val = in.readVInt();
if ((val & 0x01) != 0) {
targetMapOffsets[sourceId] = ofs;
sourceId++;
}
accum += val >>> 1;
targetMap[ofs] = accum;
}
return is;
}

private static InputStream getClassResource(String path) throws IOException {
return IOUtils.requireResourceNonNull(BinaryDictionary.class.getResourceAsStream(path), path);
if (sourceId + 1 != targetMapOffsets.length)
throw new IOException(
"targetMap file format broken; targetMap.length="
+ targetMap.length
+ ", targetMapOffsets.length="
+ targetMapOffsets.length
+ ", sourceId="
+ sourceId);
targetMapOffsets[sourceId] = targetMap.length;
}

public void lookupWordIds(int sourceId, IntsRef ref) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,7 @@ enum CharacterClass {
public static final byte HANJANUMERIC = (byte) CharacterClass.HANJANUMERIC.ordinal();

private CharacterDefinition() throws IOException {
InputStream is = null;
boolean success = false;
try {
is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
is = new BufferedInputStream(is);
try (InputStream is = new BufferedInputStream(getClassResource())) {
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
Expand All @@ -86,16 +82,15 @@ private CharacterDefinition() throws IOException {
invokeMap[i] = (b & 0x01) != 0;
groupMap[i] = (b & 0x02) != 0;
}
success = true;
} finally {
if (success) {
IOUtils.close(is);
} else {
IOUtils.closeWhileHandlingException(is);
}
}
}

/**
 * Opens the character definition data as a class resource.
 *
 * <p>The resource name is the simple name of {@link CharacterDefinition} followed by {@code
 * FILENAME_SUFFIX}; it is looked up through {@link Class#getResourceAsStream(String)} on {@link
 * CharacterDefinition} itself.
 *
 * @return the stream handed back by {@code IOUtils.requireResourceNonNull}
 * @throws IOException presumably when the resource cannot be found; the check itself lives in
 *     {@code IOUtils.requireResourceNonNull} (TODO confirm against that helper)
 */
private static InputStream getClassResource() throws IOException {
  final Class<CharacterDefinition> owner = CharacterDefinition.class;
  final String resourceName = owner.getSimpleName() + FILENAME_SUFFIX;
  final InputStream stream = owner.getResourceAsStream(resourceName);
  return IOUtils.requireResourceNonNull(stream, resourceName);
}

/**
 * Returns the character class recorded for the given character.
 *
 * @param c character to look up; its value is used directly as the index into {@code
 *     characterCategoryMap}
 * @return the byte stored in {@code characterCategoryMap} at that index
 */
public byte getCharacterClass(char c) {
return characterCategoryMap[c];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,14 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IOUtils;

/** n-gram connection cost data */
public final class ConnectionCosts {
Expand All @@ -38,12 +43,32 @@ public final class ConnectionCosts {
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
* @param resourcePath - where to load resources from, without the ".dat" suffix
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String resourcePath)
throws IOException {
try (InputStream is =
new BufferedInputStream(
BinaryDictionary.getResource(
scheme, "/" + resourcePath.replace('.', '/') + FILENAME_SUFFIX))) {
this(
scheme == BinaryDictionary.ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + FILENAME_SUFFIX))
: ConnectionCosts::getClassResource);
}

/**
 * Create a {@link ConnectionCosts} from an external resource path.
 *
 * <p>The path is wrapped in a supplier that opens it with {@link Files#newInputStream}; the
 * actual reading is done by the private supplier-based constructor this one delegates to.
 *
 * @param connectionCostsFile where to load connection costs resource
 * @throws IOException if resource was not found or broken
 */
public ConnectionCosts(Path connectionCostsFile) throws IOException {
this(() -> Files.newInputStream(connectionCostsFile));
}

/**
 * Creates a {@link ConnectionCosts} from the class resource opened by {@code getClassResource()}.
 *
 * @throws IOException if the delegated supplier-based constructor fails to read the resource
 */
private ConnectionCosts() throws IOException {
this(ConnectionCosts::getClassResource);
}

private ConnectionCosts(IOSupplier<InputStream> connectionCostResource) throws IOException {
try (InputStream is = new BufferedInputStream(connectionCostResource.get())) {
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
this.forwardSize = in.readVInt();
Expand All @@ -63,8 +88,10 @@ public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String resourcePa
}
}

private ConnectionCosts() throws IOException {
this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
/**
 * Opens the connection costs data as a class resource.
 *
 * <p>The resource name is the simple name of {@link ConnectionCosts} followed by {@code
 * FILENAME_SUFFIX}; it is looked up through {@link Class#getResourceAsStream(String)} on {@link
 * ConnectionCosts} itself.
 *
 * @return the stream handed back by {@code IOUtils.requireResourceNonNull}
 * @throws IOException presumably when the resource cannot be found; the check itself lives in
 *     {@code IOUtils.requireResourceNonNull} (TODO confirm against that helper)
 */
private static InputStream getClassResource() throws IOException {
  final Class<ConnectionCosts> owner = ConnectionCosts.class;
  final String resourceName = owner.getSimpleName() + FILENAME_SUFFIX;
  final InputStream stream = owner.getResourceAsStream(resourceName);
  return IOUtils.requireResourceNonNull(stream, resourceName);
}

public int get(int forwardId, int backwardId) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

Expand All @@ -35,25 +40,76 @@ public final class TokenInfoDictionary extends BinaryDictionary {
private final TokenInfoFST fst;

private TokenInfoDictionary() throws IOException {
this(ResourceScheme.CLASSPATH, null);
this(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
() -> getClassResource(POSDICT_FILENAME_SUFFIX),
() -> getClassResource(DICT_FILENAME_SUFFIX),
() -> getClassResource(FST_FILENAME_SUFFIX));
}

/**
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
* scheme only, use this class's name as the path.
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath)
throws IOException {
super(resourceScheme, resourcePath);
this(
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
: () -> getClassResource(DICT_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + FST_FILENAME_SUFFIX))
: () -> getClassResource(FST_FILENAME_SUFFIX));
}

/**
 * Builds a {@link TokenInfoDictionary} from dictionary files located outside the classpath.
 *
 * <p>Each path is wrapped in a supplier that opens it with {@link Files#newInputStream}; the
 * suppliers are then handed to the private supplier-based constructor, which does the reading.
 *
 * @param targetMapFile where to load target map resource
 * @param posDictFile where to load POS dictionary resource
 * @param dictFile where to load dictionary entries resource
 * @param fstFile where to load encoded FST data resource
 * @throws IOException if resource was not found or broken
 */
public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, Path fstFile)
    throws IOException {
  this(
      fileResource(targetMapFile),
      fileResource(posDictFile),
      fileResource(dictFile),
      fileResource(fstFile));
}

/** Wraps {@code file} in a supplier that opens a new input stream on it whenever it is invoked. */
private static IOSupplier<InputStream> fileResource(Path file) {
  return () -> Files.newInputStream(file);
}

private TokenInfoDictionary(
IOSupplier<InputStream> targetMapResource,
IOSupplier<InputStream> posResource,
IOSupplier<InputStream> dictResource,
IOSupplier<InputStream> fstResource)
throws IOException {
super(targetMapResource, posResource, dictResource);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
this.fst = new TokenInfoFST(fst);
}

/**
 * Opens one of the dictionary data files as a class resource.
 *
 * <p>The resource name is the simple name of {@link TokenInfoDictionary} followed by {@code
 * suffix}; it is looked up through {@link Class#getResourceAsStream(String)} on {@link
 * TokenInfoDictionary} itself.
 *
 * @param suffix file name suffix appended to the simple class name
 * @return the stream handed back by {@code IOUtils.requireResourceNonNull}
 * @throws IOException presumably when the resource cannot be found; the check itself lives in
 *     {@code IOUtils.requireResourceNonNull} (TODO confirm against that helper)
 */
private static InputStream getClassResource(String suffix) throws IOException {
  final Class<TokenInfoDictionary> owner = TokenInfoDictionary.class;
  final String resourceName = owner.getSimpleName() + suffix;
  final InputStream stream = owner.getResourceAsStream(resourceName);
  return IOUtils.requireResourceNonNull(stream, resourceName);
}

/**
 * Returns the {@link TokenInfoFST} held by this dictionary.
 *
 * @return the FST wrapper assigned to the {@code fst} field by the constructor
 */
public TokenInfoFST getFST() {
return fst;
}
Expand Down
Loading

0 comments on commit e3a7d27

Please sign in to comment.