From 45b56be83edd111424f0bf01f22d554a8622fde4 Mon Sep 17 00:00:00 2001
From: Dmitry Shirokov <deadrunk@gmail.com>
Date: Thu, 8 Aug 2024 08:09:34 +1000
Subject: [PATCH] Strings (#103)

* Development snapshot

* Development snapshot

* Development snapshot

---------

Co-authored-by: Dmitry Shirokov <dshirokov@atlassian.com>
---
 README.md         | 19 ++++++++++++++-----
 src/index.test.ts |  2 --
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 187ab43..21171e6 100644
--- a/README.md
+++ b/README.md
@@ -55,21 +55,30 @@ chardet.analyse(new Uint8Array([0x68, 0x65, 0x6c, 0x6c, 0x6f]));
 
 ## Working with large data sets
 
-Sometimes, when data set is huge and you want to optimize performance (with a tradeoff of less accuracy),
+Sometimes, when the data set is huge and you want to optimize performance (with a trade-off of less accuracy),
 you can sample only the first N bytes of the buffer:
 
 ```javascript
-const encoding = await chardet
-  .detectFile('/path/to/file', { sampleSize: 32 });
+const encoding = await chardet.detectFile('/path/to/file', { sampleSize: 32 });
 ```
 
 You can also specify where to begin reading from in the buffer:
 
 ```javascript
-const encoding = await chardet
-  .detectFile('/path/to/file', { sampleSize: 32, offset: 128 });
+const encoding = await chardet.detectFile('/path/to/file', {
+  sampleSize: 32,
+  offset: 128,
+});
 ```
 
+## Working with strings
+
+In both Node.js and browsers, all strings in memory are represented in UTF-16 encoding. This is a fundamental aspect of the JavaScript language specification. Therefore, you cannot use plain strings directly as input for `chardet.analyse()` or `chardet.detect()`. Instead, you need the original string data in the form of a Buffer or Uint8Array.
+
+In other words, if you receive a piece of data over the network and want to detect its encoding, use the original data payload, not its string representation. By the time you convert data to a string, it will be in UTF-16 encoding.
+
+Note on [TextEncoder](https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/TextEncoder): It always returns a UTF-8 encoded buffer (no other output encoding is supported), which means the buffer will not be in the original encoding of the data.
+
 ## Supported Encodings:
 
 - UTF-8
diff --git a/src/index.test.ts b/src/index.test.ts
index 9616909..5e9a03c 100644
--- a/src/index.test.ts
+++ b/src/index.test.ts
@@ -75,6 +75,4 @@ describe('chardet', () => {
       expect(matches).toEqual(expectedEncodingsFromPath);
     });
   });
-
-
 });