From 45b56be83edd111424f0bf01f22d554a8622fde4 Mon Sep 17 00:00:00 2001 From: Dmitry Shirokov Date: Thu, 8 Aug 2024 08:09:34 +1000 Subject: [PATCH] Strings (#103) * Development snapshot * Development snapshot * Development snapshot --------- Co-authored-by: Dmitry Shirokov --- README.md | 19 ++++++++++++++----- src/index.test.ts | 2 -- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 187ab43..21171e6 100644 --- a/README.md +++ b/README.md @@ -55,21 +55,30 @@ chardet.analyse(new Uint8Array([0x68, 0x65, 0x6c, 0x6c, 0x6f])); ## Working with large data sets -Sometimes, when data set is huge and you want to optimize performance (with a tradeoff of less accuracy), +Sometimes, when the data set is huge and you want to optimize performance (with a trade-off of less accuracy), you can sample only the first N bytes of the buffer: ```javascript -const encoding = await chardet - .detectFile('/path/to/file', { sampleSize: 32 }); +const encoding = await chardet.detectFile('/path/to/file', { sampleSize: 32 }); ``` You can also specify where to begin reading from in the buffer: ```javascript -const encoding = await chardet - .detectFile('/path/to/file', { sampleSize: 32, offset: 128 }); +const encoding = await chardet.detectFile('/path/to/file', { + sampleSize: 32, + offset: 128, +}); ``` +## Working with strings + +In both Node.js and browsers, all strings in memory are represented in UTF-16 encoding. This is a fundamental aspect of the JavaScript language specification. Therefore, you cannot use plain strings directly as input for `chardet.analyse()` or `chardet.detect()`. Instead, you need the original string data in the form of a Buffer or Uint8Array. + +In other words, if you receive a piece of data over the network and want to detect its encoding, use the original data payload, not its string representation. By the time you convert data to a string, it will be in UTF-16 encoding. 
+ +Note on [TextEncoder](https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/TextEncoder): It always returns a UTF-8 encoded buffer, which means the buffer will not be in the original encoding of the string. + ## Supported Encodings: - UTF-8 diff --git a/src/index.test.ts b/src/index.test.ts index 9616909..5e9a03c 100644 --- a/src/index.test.ts +++ b/src/index.test.ts @@ -75,6 +75,4 @@ describe('chardet', () => { expect(matches).toEqual(expectedEncodingsFromPath); }); }); - - });