Skip to content

Commit

Permalink
Merge pull request #374 from SpaceCowboy850/bugfix_train_vocab
Browse files Browse the repository at this point in the history
Setting an encoding for tiny_file tokenizer file.
  • Loading branch information
karpathy committed Sep 1, 2023
2 parents 7325bab + ab19aa0 commit 0776f86
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion tinystories.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def train_vocab(vocab_size):
shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))

print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
- with open(tiny_file, "w") as of:
+ with open(tiny_file, "w", encoding="utf-8") as of:
for shard in tqdm(shard_filenames[:num_shards]):
with open(shard, "r") as f:
data = json.load(f)
Expand Down

0 comments on commit 0776f86

Please sign in to comment.