Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(vector):Similar_To Function Support & Progress on Vector Indexing in Dgraph #9048

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions dql/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@ import (
)

const (
uidFunc = "uid"
valueFunc = "val"
typFunc = "type"
lenFunc = "len"
countFunc = "count"
uidInFunc = "uid_in"
uidFunc = "uid"
valueFunc = "val"
typFunc = "type"
lenFunc = "len"
countFunc = "count"
uidInFunc = "uid_in"
similarToFn = "similar_to"
)

var (
Expand Down Expand Up @@ -1621,7 +1622,7 @@ func validFuncName(name string) bool {

switch name {
case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext",
"has", "uid", "uid_in", "anyof", "allof", "type", "match":
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to":
return true
}
return false
Expand Down Expand Up @@ -1794,7 +1795,7 @@ L:
case IsInequalityFn(function.Name):
err = parseFuncArgs(it, function)

case function.Name == "uid_in":
case function.Name == "uid_in" || function.Name == "similar_to":
err = parseFuncArgs(it, function)

default:
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ require (
github.com/dgraph-io/graphql-transport-ws v0.0.0-20210511143556-2cef522f1f15
github.com/dgraph-io/ristretto v0.1.1
github.com/dgraph-io/simdjson-go v0.3.0
github.com/dgraph-io/vector_indexer v0.0.7-beta
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/dgrijalva/jwt-go/v4 v4.0.0-preview1
github.com/dgryski/go-farm v0.0.0-20200201041132-a6ae2369ad13
github.com/dgryski/go-groupvarint v0.0.0-20190318181831-5ce5df8ca4e1
github.com/docker/docker v24.0.5+incompatible
Expand Down
52 changes: 52 additions & 0 deletions posting/heap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package posting

import (
"container/heap"
)

// minBadgerHeapElement pairs a float64 score with a uint64 identifier so the
// two stay together while elements are reordered.
type minBadgerHeapElement struct {
	// value is the score elements are compared by.
	value float64
	// index is carried alongside value and never compared.
	// NOTE(review): presumably the uid of the scored node — confirm against callers.
	index uint64
}

// initBadgerHeapElement allocates a new element holding val and i and returns
// a pointer to it.
func initBadgerHeapElement(val float64, i uint64) *minBadgerHeapElement {
	elem := new(minBadgerHeapElement)
	elem.value = val
	elem.index = i
	return elem
}

// minBadgerTupleHeap is a slice of minBadgerHeapElement carrying the Len, Less,
// Swap, Push and Pop methods that container/heap operates on. Less compares by
// value, so the element with the smallest value ends up at the root.
type minBadgerTupleHeap []minBadgerHeapElement

// Len reports the number of elements currently stored.
func (h minBadgerTupleHeap) Len() int { return len(h) }

// Less reports whether element i has a strictly smaller value than element j.
func (h minBadgerTupleHeap) Less(i, j int) bool {
	return h[j].value > h[i].value
}

// Swap exchanges the elements at positions i and j.
func (h minBadgerTupleHeap) Swap(i, j int) {
	tmp := h[i]
	h[i] = h[j]
	h[j] = tmp
}

// Push appends x to the end of the underlying slice. x must be a
// minBadgerHeapElement; any other dynamic type makes the assertion panic.
func (h *minBadgerTupleHeap) Push(x interface{}) {
	elem := x.(minBadgerHeapElement)
	*h = append(*h, elem)
}

// Pop removes the last element of the underlying slice and returns it.
// It panics when the slice is empty.
func (h *minBadgerTupleHeap) Pop() interface{} {
	last := len(*h) - 1
	tail := (*h)[last]
	*h = (*h)[:last]
	return tail
}

// buildBadgerHeapByInit turns array into a min-heap ordered by value and
// returns a pointer to it. The heap is built in place: it reuses array's
// backing storage, so the caller's slice is reordered rather than copied.
//
// Time: O(n)
func buildBadgerHeapByInit(array []minBadgerHeapElement) *minBadgerTupleHeap {
	// View the caller's slice as the heap type, which implements heap.Interface.
	h := minBadgerTupleHeap(array)
	heap.Init(&h)
	return &h
}
100 changes: 100 additions & 0 deletions posting/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package posting

import (
"errors"
"math"
"sort"
)

// norm returns the Euclidean length of v: the square root of v's dot product
// with itself. An empty or nil vector has norm 0.
func norm(v []float64) float64 {
	// dotProduct only fails on a length mismatch, which cannot happen when
	// both operands are the same slice, so the error is safely ignored.
	sumOfSquares, _ := dotProduct(v, v)
	length := math.Sqrt(sumOfSquares)
	return length
}

// dotProduct returns the sum of the element-wise products of a and b.
// It returns 0 and an error when the two vectors differ in length; two empty
// vectors yield 0 with no error.
func dotProduct(a, b []float64) (float64, error) {
	if len(a) != len(b) {
		return 0, errors.New("can not compute dot product on vectors of different lengths")
	}
	var sum float64
	for i, x := range a {
		sum += x * b[i]
	}
	return sum, nil
}

// euclidianDistance returns the Euclidean distance between a and b, i.e. the
// norm of their element-wise difference. When vectorSubtract rejects the
// operands, the distance is reported as 0 together with that error.
func euclidianDistance(a, b []float64) (float64, error) {
	diff := make([]float64, len(a))
	if err := vectorSubtract(a, b, diff); err != nil {
		// diff is still all zeros here, so its norm would be 0 anyway.
		return 0, err
	}
	return norm(diff), nil
}

// cosineSimilarity returns the cosine of the angle between a and b: their dot
// product divided by the product of their norms.
//
// It returns 0 and an error when the vectors differ in length (the error comes
// from dotProduct) or when either vector has zero norm, since the similarity
// is undefined for a zero vector.
func cosineSimilarity(a, b []float64) (float64, error) {
	dotProd, err := dotProduct(a, b)
	if err != nil {
		return 0, err
	}
	// Each norm is a full pass over its vector; compute it once and reuse it
	// for both the zero check and the final division.
	normA := norm(a)
	normB := norm(b)
	if normA == 0 || normB == 0 {
		return 0, errors.New("can not compute cosine similarity on zero vector")
	}
	return dotProd / (normA * normB), nil
}

// max returns the larger of a and b; when they are equal it returns a.
// Within this package the name takes precedence over the max builtin that
// Go 1.21 introduced.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}

// min returns the smaller of a and b; when they are equal it returns b.
// Within this package the name takes precedence over the min builtin that
// Go 1.21 introduced.
func min(a, b int) int {
	if a >= b {
		return b
	}
	return a
}

// vectorAdd stores the element-wise sum a[i] + b[i] into result[i].
// All three slices must have the same length; otherwise an error is returned
// and result is left untouched. The operand-length check runs first, so its
// message wins when both checks would fail.
func vectorAdd(a, b, result []float64) error {
	switch n := len(a); {
	case n != len(b):
		return errors.New("can not add vectors of different lengths")
	case n != len(result):
		return errors.New("result and operand vectors must be same length")
	}
	for i, x := range a {
		result[i] = x + b[i]
	}
	return nil
}

// vectorSubtract stores the element-wise difference a[i] - b[i] into
// result[i]. All three slices must have the same length; otherwise an error is
// returned and result is left untouched. The operand-length check runs first,
// so its message wins when both checks would fail.
func vectorSubtract(a, b, result []float64) error {
	switch n := len(a); {
	case n != len(b):
		return errors.New("can not subtract vectors of different lengths")
	case n != len(result):
		return errors.New("result and operand vectors must be same length")
	}
	for i, x := range a {
		result[i] = x - b[i]
	}
	return nil
}

// insortBadgerHeapAscending inserts val into slice, which must already be
// sorted by ascending value, and returns the grown slice with the order kept.
// val is placed after any existing elements with an equal value. The append
// may write into the caller's backing array when it has spare capacity.
//
// Used for distance, since shorter distance is better.
func insortBadgerHeapAscending(slice []minBadgerHeapElement, val minBadgerHeapElement) []minBadgerHeapElement {
	// First position whose value is strictly greater than val's.
	pos := sort.Search(len(slice), func(k int) bool {
		return val.value < slice[k].value
	})
	// Grow by one zero-valued slot, shift the tail right, then drop val in.
	slice = append(slice, minBadgerHeapElement{})
	copy(slice[pos+1:], slice[pos:])
	slice[pos] = val
	return slice
}

// insortBadgerHeapDescending inserts val into slice, which must already be
// sorted by descending value, and returns the grown slice with the order kept.
// val is placed after any existing elements with an equal value. The append
// may write into the caller's backing array when it has spare capacity.
//
// Used for cosine similarity, since higher similarity score is better.
func insortBadgerHeapDescending(slice []minBadgerHeapElement, val minBadgerHeapElement) []minBadgerHeapElement {
	// sort.Search needs a predicate that is false for a prefix and true for
	// the rest. On a descending slice that is "element is smaller than val";
	// the ascending variant's ">" test is not monotonic here and would put
	// val in the wrong place.
	pos := sort.Search(len(slice), func(k int) bool {
		return slice[k].value < val.value
	})
	// Grow by one zero-valued slot, shift the tail right, then drop val in.
	slice = append(slice, minBadgerHeapElement{})
	copy(slice[pos+1:], slice[pos:])
	slice[pos] = val
	return slice
}
6 changes: 6 additions & 0 deletions posting/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,12 @@ func (txn *Txn) addIndexMutation(ctx context.Context, edge *pb.DirectedEdge, tok
if err = plist.addMutation(ctx, txn, edge); err != nil {
return err
}
if edge.Attr == "0-profile" { // change to checking for vector type do get on attr
InsertToBadger(ctx, txn, edge.ValueId, edge.Attr, 5, 3, 12) // use ctx.Value to access current vector GENIUS
}
// if edge.ValueType == pb.Posting_VFLOAT {
// InsertToBadger(ctx, plist, txn, edge.ValueId, edge.Attr, 5, 3, 12)
// }
ostats.Record(ctx, x.NumEdges.M(1))
return nil
}
Expand Down
Loading
Loading