Merge pull request #374 from SpaceCowboy850/bugfix_train_vocab

Set an explicit UTF-8 encoding when writing the temporary tiny_file in train_vocab.
karpathy · Sep 1, 2023 · 0776f86
2 parents 7325bab + ab19aa0
commit 0776f86
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/tinystories.py b/tinystories.py
@@ -88,7 +88,7 @@ def train_vocab(vocab_size):
     shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
 
     print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
-    with open(tiny_file, "w") as of:
+    with open(tiny_file, "w", encoding="utf-8") as of:
         for shard in tqdm(shard_filenames[:num_shards]):
             with open(shard, "r") as f:
                 data = json.load(f)