Skip to content

Commit

Permalink
convert : fix Baichuan2 models by using vocab size in config.json (#3299)
Browse files Browse the repository at this point in the history

Use local GGUF package when possible in Baichuan converter
  • Loading branch information
KerfuffleV2 committed Oct 4, 2023
1 parent beabc8c commit 019ba1d
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions convert-baichuan-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,14 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


if TYPE_CHECKING:
from typing import TypeAlias
Expand Down Expand Up @@ -174,8 +177,11 @@ def parse_args() -> argparse.Namespace:
print("gguf: get sentencepiece tokenizer vocab, scores and token types")

tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
vocab_size = hparams.get('vocab_size')
if vocab_size is None:
vocab_size = tokenizer.vocab_size()

for i in range(tokenizer.vocab_size()):
for i in range(vocab_size):
text: bytes
score: float

Expand Down

0 comments on commit 019ba1d

Please sign in to comment.