Skip to content

Commit

Permalink
src: improve buffer.transcode performance
Browse files Browse the repository at this point in the history
  • Loading branch information
anonrig committed Aug 3, 2024
1 parent 20aff2b commit 63a202d
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 63 deletions.
34 changes: 34 additions & 0 deletions benchmark/buffers/buffer-transcode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
'use strict';
const common = require('../common.js');
const assert = require('node:assert');
const buffer = require('node:buffer');

// Encodings exercised on both the source and the target side.
const encodings = ['latin1', 'ascii', 'ucs2', 'utf8'];

// Parameter matrix handed to the benchmark harness.
const configs = {
  fromEncoding: encodings,
  toEncoding: encodings,
  length: [1, 10, 1000],
  n: [1e5],
};

const options = {
  // Keep only the combinations whose source and target encodings differ:
  // transcoding a buffer to its own encoding is not a realistic use case,
  // so benchmarking it would be pointless.
  combinationFilter({ fromEncoding, toEncoding }) {
    return fromEncoding !== toEncoding;
  },
};

const bench = common.createBenchmark(main, configs, options);

/**
 * Runs a single benchmark configuration: transcodes the same input buffer
 * `n` times from `fromEncoding` to `toEncoding` and reports the elapsed time
 * through `bench`.
 *
 * @param {object} conf
 * @param {number} conf.n Number of `buffer.transcode()` calls to time.
 * @param {string} conf.fromEncoding Encoding of the input buffer.
 * @param {string} conf.toEncoding Encoding to transcode into.
 * @param {number} conf.length Number of 'a' characters in the input.
 */
function main({ n, fromEncoding, toEncoding, length }) {
  // Encode the input in the source encoding so that transcode() receives
  // well-formed data for it (e.g. two bytes per character for 'ucs2').
  // Without the explicit encoding, Buffer.from() would produce UTF-8 bytes
  // that do not represent 'a' characters when read as 'ucs2'.
  const input = Buffer.from('a'.repeat(length), fromEncoding);
  let out = 0;
  bench.start();
  for (let i = 0; i < n; i++) {
    try {
      const dest = buffer.transcode(input, fromEncoding, toEncoding);
      // Accumulate the size of the transcoded output so that the result of
      // every iteration is actually consumed.
      out += dest.byteLength;
    } catch {
      // Deliberately best-effort: a combination that fails to transcode must
      // not abort the remaining benchmark runs, so the error is ignored.
    }
  }
  bench.end(n);
  // Using `out` after the loop keeps the accumulated value observable.
  assert.ok(out >= 0);
}
111 changes: 48 additions & 63 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

#include "node_i18n.h"
#include "node_external_reference.h"
#include "simdutf.h"

#if defined(NODE_HAVE_I18N_SUPPORT)

Expand Down Expand Up @@ -147,7 +148,6 @@ MaybeLocal<Object> Transcode(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeLocal<Object> ret;
MaybeStackBuffer<char> result;
Converter to(toEncoding);
Expand All @@ -170,22 +170,21 @@ MaybeLocal<Object> Transcode(Environment* env,
return ret;
}

MaybeLocal<Object> TranscodeToUcs2(Environment* env,
const char* fromEncoding,
const char* toEncoding,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeLocal<Object> ret;
MaybeLocal<Object> TranscodeLatin1ToUcs2(Environment* env,
const char* fromEncoding,
const char* toEncoding,
const char* source,
const size_t source_length,
UErrorCode* status) {
MaybeStackBuffer<UChar> destbuf(source_length);
Converter from(fromEncoding);
const size_t length_in_chars = source_length * sizeof(UChar);
ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
source, source_length, status);
if (U_SUCCESS(*status))
ret = ToBufferEndian(env, &destbuf);
return ret;
auto actual_length =
simdutf::convert_latin1_to_utf16(source, source_length, destbuf.out());
if (actual_length == 0) {
*status = U_INVALID_CHAR_FOUND;
return {};
}

return Buffer::New(env, &destbuf);
}

MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
Expand All @@ -194,13 +193,11 @@ MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeStackBuffer<UChar> sourcebuf;
MaybeLocal<Object> ret;
Converter to(toEncoding);

size_t sublen = ucnv_getMinCharSize(to.conv());
std::string sub(sublen, '?');
std::string sub(to.min_char_size(), '?');
to.set_subst_chars(sub.c_str());

const size_t length_in_chars = source_length / sizeof(UChar);
Expand All @@ -221,26 +218,20 @@ MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeStackBuffer<UChar> destbuf;
int32_t result_length;
u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
source, source_length, status);
MaybeLocal<Object> ret;
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
*status = U_ZERO_ERROR;
destbuf.AllocateSufficientStorage(result_length);
u_strFromUTF8(*destbuf, result_length, &result_length,
source, source_length, status);
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
}
size_t expected_utf16_length =
simdutf::utf16_length_from_utf8(source, source_length);
MaybeStackBuffer<UChar> destbuf(expected_utf16_length);
auto actual_length =
simdutf::convert_utf8_to_utf16(source, source_length, destbuf.out());

if (actual_length == 0) {
*status = U_INVALID_CHAR_FOUND;
return {};
}
return ret;

CHECK_EQ(actual_length, expected_utf16_length);

return Buffer::New(env, &destbuf);
}

MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
Expand All @@ -249,32 +240,27 @@ MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeLocal<Object> ret;
const size_t length_in_chars = source_length / sizeof(UChar);
int32_t result_length;
MaybeStackBuffer<UChar> sourcebuf;
MaybeStackBuffer<char> destbuf;
CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
*sourcebuf, length_in_chars, status);
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
*status = U_ZERO_ERROR;
destbuf.AllocateSufficientStorage(result_length);
u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
length_in_chars, status);
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
}
size_t expected_utf8_length = simdutf::utf8_length_from_utf16(
reinterpret_cast<const char16_t*>(source), length_in_chars);

MaybeStackBuffer<char> destbuf(expected_utf8_length);
auto actual_length =
simdutf::convert_utf16_to_utf8(reinterpret_cast<const char16_t*>(source),
length_in_chars,
destbuf.out());

if (actual_length == 0) {
*status = U_INVALID_CHAR_FOUND;
return {};
}
return ret;

CHECK_EQ(actual_length, expected_utf8_length);

return Buffer::New(env, &destbuf);
}

const char* EncodingName(const enum encoding encoding) {
constexpr const char* EncodingName(const enum encoding encoding) {
switch (encoding) {
case ASCII: return "us-ascii";
case LATIN1: return "iso8859-1";
Expand All @@ -284,7 +270,7 @@ const char* EncodingName(const enum encoding encoding) {
}
}

bool SupportedEncoding(const enum encoding encoding) {
constexpr bool SupportedEncoding(const enum encoding encoding) {
switch (encoding) {
case ASCII:
case LATIN1:
Expand All @@ -309,8 +295,7 @@ void Transcode(const FunctionCallbackInfo<Value>&args) {
switch (fromEncoding) {
case ASCII:
case LATIN1:
if (toEncoding == UCS2)
tfn = &TranscodeToUcs2;
if (toEncoding == UCS2) tfn = &TranscodeLatin1ToUcs2;
break;
case UTF8:
if (toEncoding == UCS2)
Expand Down

0 comments on commit 63a202d

Please sign in to comment.