Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,3 @@ example/testdata/
*.dat
*.log

*.json
8 changes: 6 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
module github.com/sugarme/tokenizer

go 1.13
go 1.18

require (
github.com/emirpasic/gods v1.12.0
github.com/rivo/uniseg v0.1.0
github.com/schollz/progressbar/v2 v2.15.0
github.com/stretchr/testify v1.4.0 // indirect
github.com/sugarme/regexpset v0.0.0-20200920021344-4d4ec8eaf93c
golang.org/x/text v0.3.3
)

require (
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
github.com/stretchr/testify v1.4.0 // indirect
gopkg.in/yaml.v2 v2.2.7 // indirect
)
3 changes: 0 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ
github.com/schollz/progressbar/v2 v2.15.0 h1:dVzHQ8fHRmtPjD3K10jT3Qgn/+H+92jhPrhmxIJfDz8=
github.com/schollz/progressbar/v2 v2.15.0/go.mod h1:UdPq3prGkfQ7MOzZKlDRpYKcFqEMczbD7YmbPgpzKMI=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
Expand All @@ -21,9 +20,7 @@ github.com/sugarme/regexpset v0.0.0-20200920021344-4d4ec8eaf93c/go.mod h1:2gwkXL
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.7 h1:VUgggvou5XRW9mHwD/yXxIYSMtY0zoKQf/v226p2nyo=
gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
45 changes: 43 additions & 2 deletions model/bpe/bpe.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@ package bpe

import (
"bufio"
"embed"
"encoding/json"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"sort"

// "strconv"
"log"
"strings"
Expand All @@ -35,6 +38,7 @@ type Config struct {
unkToken *string
continuingSubwordPrefix *string
endOfWordSuffix *string
fs *embed.FS
}

// BpeBuilder can be used to create a `BPE` model with
Expand All @@ -58,6 +62,7 @@ func NewBpeBuilder() *BpeBuilder {
unkToken: nil,
continuingSubwordPrefix: nil,
endOfWordSuffix: nil,
fs: nil,
},
}
}
Expand All @@ -67,6 +72,11 @@ func (bb *BpeBuilder) Files(vocab string, merges string) {
bb.config.files = &configFiles{vocab, merges}
}

// FS registers an embedded file system from which the vocab and
// merges files will be read instead of the local filesystem.
func (bb *BpeBuilder) FS(fs embed.FS) {
	// Keep a pointer to a local copy; a nil config.fs means
	// "use the local filesystem".
	embedded := fs
	bb.config.fs = &embedded
}

// VocabAndMerges sets vocab and merges
func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges) {
bb.config.vocab = &vocab
Expand Down Expand Up @@ -123,6 +133,11 @@ func (bb *BpeBuilder) Build() (*BPE, error) {
}
}

// Have the BPE read the files from |bb.config.fs| if set.
if bb.config.fs != nil {
bpe.fs = bb.config.fs
}

// Read files if provided
if bb.config.files != nil {
vocab, merges, err = bpe.ReadFiles(bb.config.files.vocab, bb.config.files.merges)
Expand Down Expand Up @@ -190,6 +205,10 @@ type BPE struct {
// EndOfWordSuffix is an optional suffix
to characterize an end-of-word subword
EndOfWordSuffix *string

// fs is an embedded file system. It allows you to import vocab/merges files using go:embed.
// If |fs| is not nil, it is preferred over the local filesystem.
fs *embed.FS
}

func (b *BPE) builder() *BpeBuilder {
Expand Down Expand Up @@ -230,6 +249,14 @@ func NewBpeFromFiles(vocab, merges string) (*BPE, error) {
return b.Build()
}

// NewBPEFromFS creates a BPE model from the |vocab| and |merges| files
// stored in the embedded file system |fs|.
func NewBPEFromFS(fs embed.FS, vocab, merges string) (*BPE, error) {
	builder := NewBpeBuilder()
	builder.Files(vocab, merges)
	builder.FS(fs)
	return builder.Build()
}

// NewBPE creates new BPE model with given vocab and merges
func NewBPE(vocab model.Vocab, merges Merges) *BPE {
b, err := newBPE()
Expand Down Expand Up @@ -260,7 +287,15 @@ func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder {
func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error) {
var err error
// read json file
vocabBytes, err := ioutil.ReadFile(vocabF)

var vocabBytes []byte

if b.fs != nil {
vocabBytes, err = b.fs.ReadFile(vocabF)
} else {
vocabBytes, err = ioutil.ReadFile(vocabF)
}

if err != nil {
return nil, nil, err
}
Expand All @@ -275,9 +310,15 @@ func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, e
return nil, nil, err
}

var mFile io.ReadCloser
// Read merges file. Each line defines one merge rule (a token pair with its rank).
// Recall: Merges is map[Pair]PairVal (rank int, newId int)
mFile, err := os.Open(mergesF)
if b.fs != nil {
mFile, err = b.fs.Open(mergesF)
} else {
mFile, err = os.Open(mergesF)
}

if err != nil {
return nil, nil, err
}
Expand Down
22 changes: 10 additions & 12 deletions pretrained/gpt2.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
package pretrained

import (
"embed"
"log"
"os"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/decoder"
"github.com/sugarme/tokenizer/model/bpe"
"github.com/sugarme/tokenizer/pretokenizer"
"github.com/sugarme/tokenizer/processor"
"github.com/sugarme/tokenizer/util"
)

// GPT2 loads GPT2 (small) tokenizer from vocab and merges files.
Expand All @@ -29,19 +28,18 @@ import (
// Source:
// "https://cdn.huggingface.co/gpt2-merges.txt"
// "https://cdn.huggingface.co/gpt2-vocab.json"
func GPT2(addPrefixSpace bool, trimOffsets bool) *tokenizer.Tokenizer {

currDir, err := os.Getwd()
if err != nil {
log.Fatal(err)
}
util.CdToThis()
defer util.CdBack(currDir)
// The GPT2 vocab and merges files are compiled into the binary via
// go:embed so loading them does not depend on the process's working
// directory.
//go:embed model/gpt2-merges.txt
//go:embed model/gpt2-vocab.json
var fs embed.FS

// Paths of the embedded tokenizer data files within |fs|.
const (
	vocabFilename = "model/gpt2-vocab.json"
	mergeFilename = "model/gpt2-merges.txt"
)

model, err := bpe.NewBpeFromFiles(vocabFile, mergeFile)
func GPT2(addPrefixSpace bool, trimOffsets bool) *tokenizer.Tokenizer {
var model, err = bpe.NewBPEFromFS(fs, vocabFilename, mergeFilename)
if err != nil {
log.Fatal(err)
}
Expand Down
1 change: 1 addition & 0 deletions pretrained/model/gpt2-vocab.json

Large diffs are not rendered by default.