the-algorithm/ann/src/main/scala/com/twitter/ann/hnsw/TypedHnswIndex.scala
2023-03-31 22:16:43 -04:00

174 lines
7.7 KiB
Scala

package com.twitter.ann.hnsw
import com.twitter.ann.common._
import com.twitter.bijection.Injection
import com.twitter.search.common.file.AbstractFile
// Factory object providing HNSW-based approximate nearest neighbour (ANN) indexes.
object TypedHnswIndex {

  /**
   * Builds an in-memory HNSW index over entity embeddings that can be queried, appended to
   * and updated.
   *
   * The id/embedding store handed to [[Hnsw]] is obtained from
   * `JMapBasedIdEmbeddingMap.applyInMemory` using `expectedElements`.
   * Further information about the arguments: https://docbird.twitter.biz/ann/hnsw.html
   *
   * @param dimension           Dimension of the embedding to be indexed.
   * @param metric              Distance metric (InnerProduct/Cosine/L2).
   * @param efConstruction      Same meaning as ef, but controls the index_time/index_accuracy
   *                            ratio. A bigger ef_construction leads to longer construction but
   *                            better index quality; past some point, increasing it no longer
   *                            improves the quality of the index. One way to check whether the
   *                            chosen ef_construction is ok is to measure recall for M nearest
   *                            neighbour search when ef = ef_construction: if the recall is
   *                            lower than 0.9, then there is room for improvement.
   * @param maxM                Number of bi-directional links created for every new element
   *                            during construction. A reasonable range for M is 2-100. Higher M
   *                            works better on datasets with high intrinsic dimensionality
   *                            and/or high recall, while low M works better for datasets with
   *                            low intrinsic dimensionality and/or low recall. The parameter
   *                            also determines the algorithm's memory consumption: the bigger
   *                            the value, the larger the memory requirement. For high
   *                            dimensional datasets (word embeddings, good face descriptors),
   *                            higher M is required (e.g. M=48, 64) for optimal performance at
   *                            high recall. The range M=12-48 is ok for most use cases.
   * @param expectedElements    Approximate number of elements to be indexed.
   * @param readWriteFuturePool Future pool for performing read (query) and write
   *                            (addition/updates) operations.
   * @tparam T Type of item to index
   * @tparam D Type of distance
   * @return an index that is appendable, queryable and updatable
   */
  def index[T, D <: Distance[D]](
    dimension: Int,
    metric: Metric[D],
    efConstruction: Int,
    maxM: Int,
    expectedElements: Int,
    readWriteFuturePool: ReadWriteFuturePool
  ): Appendable[T, HnswParams, D] with Queryable[T, HnswParams, D] with Updatable[T] = {
    val idEmbeddingMap = JMapBasedIdEmbeddingMap.applyInMemory[T](expectedElements)

    Hnsw[T, D](
      dimension,
      metric,
      efConstruction,
      maxM,
      expectedElements,
      readWriteFuturePool,
      idEmbeddingMap
    )
  }

  /**
   * Builds an in-memory HNSW index over entity embeddings that can be queried, appended to
   * and updated, and that can additionally be serialized to a directory
   * (HDFS/Local file system).
   *
   * The underlying [[Hnsw]] index uses the id/embedding store obtained from
   * `JMapBasedIdEmbeddingMap.applyInMemoryWithSerialization` and is then wrapped in a
   * [[SerializableHnsw]] together with `injection`.
   * Further information about the arguments: https://docbird.twitter.biz/ann/hnsw.html
   *
   * @param dimension           Dimension of the embedding to be indexed.
   * @param metric              Distance metric (InnerProduct/Cosine/L2).
   * @param efConstruction      Same meaning as ef, but controls the index_time/index_accuracy
   *                            ratio. A bigger ef_construction leads to longer construction but
   *                            better index quality; past some point, increasing it no longer
   *                            improves the quality of the index. One way to check whether the
   *                            chosen ef_construction is ok is to measure recall for M nearest
   *                            neighbour search when ef = ef_construction: if the recall is
   *                            lower than 0.9, then there is room for improvement.
   * @param maxM                Number of bi-directional links created for every new element
   *                            during construction. A reasonable range for M is 2-100. Higher M
   *                            works better on datasets with high intrinsic dimensionality
   *                            and/or high recall, while low M works better for datasets with
   *                            low intrinsic dimensionality and/or low recall. The parameter
   *                            also determines the algorithm's memory consumption: the bigger
   *                            the value, the larger the memory requirement. For high
   *                            dimensional datasets (word embeddings, good face descriptors),
   *                            higher M is required (e.g. M=48, 64) for optimal performance at
   *                            high recall. The range M=12-48 is ok for most use cases.
   * @param expectedElements    Approximate number of elements to be indexed.
   * @param injection           Injection for typed Id T to Array[Byte].
   * @param readWriteFuturePool Future pool for performing read (query) and write
   *                            (addition/updates) operations.
   * @tparam T Type of item to index
   * @tparam D Type of distance
   * @return an index that is appendable, queryable, updatable and serializable
   */
  def serializableIndex[T, D <: Distance[D]](
    dimension: Int,
    metric: Metric[D],
    efConstruction: Int,
    maxM: Int,
    expectedElements: Int,
    injection: Injection[T, Array[Byte]],
    readWriteFuturePool: ReadWriteFuturePool
  ): Appendable[T, HnswParams, D]
    with Queryable[T, HnswParams, D]
    with Updatable[T]
    with Serialization = {
    val serializableIdEmbeddingMap =
      JMapBasedIdEmbeddingMap.applyInMemoryWithSerialization[T](expectedElements, injection)

    val inMemoryIndex = Hnsw[T, D](
      dimension,
      metric,
      efConstruction,
      maxM,
      expectedElements,
      readWriteFuturePool,
      serializableIdEmbeddingMap
    )

    SerializableHnsw[T, D](inMemoryIndex, injection)
  }

  /**
   * Loads an HNSW index from a directory into memory.
   *
   * Delegates to `SerializableHnsw.loadMapBasedQueryableIndex`.
   *
   * @param dimension           Dimension of the embedding to be indexed.
   * @param metric              Distance metric.
   * @param injection           Injection for typed Id T to Array[Byte].
   * @param readWriteFuturePool Future pool for performing read (query) and write
   *                            (addition/updates) operations.
   * @param directory           Directory (HDFS/Local file system) where the hnsw index is stored.
   * @tparam T Type of item to index
   * @tparam D Type of distance
   * @return the loaded index, which is appendable, queryable, updatable and serializable
   */
  def loadIndex[T, D <: Distance[D]](
    dimension: Int,
    metric: Metric[D],
    injection: Injection[T, Array[Byte]],
    readWriteFuturePool: ReadWriteFuturePool,
    directory: AbstractFile
  ): Appendable[T, HnswParams, D]
    with Queryable[T, HnswParams, D]
    with Updatable[T]
    with Serialization =
    SerializableHnsw.loadMapBasedQueryableIndex[T, D](
      dimension,
      metric,
      injection,
      readWriteFuturePool,
      directory
    )

  /**
   * Loads an HNSW index from a directory and memory-maps it.
   *
   * This takes less memory but relies more on disk, since it leverages a memory mapped file
   * backed by disk. Latency will go up considerably (could be by a factor of > 10x) when used
   * on an instance with low memory, because a lot of page faults may occur. The best use case
   * is scalding jobs where mapper/reducer instances are limited to 8gb of memory.
   *
   * Delegates to `SerializableHnsw.loadMMappedBasedQueryableIndex`.
   *
   * @param dimension           Dimension of the embedding to be indexed.
   * @param metric              Distance metric.
   * @param injection           Injection for typed Id T to Array[Byte].
   * @param readWriteFuturePool Future pool for performing read (query) and write
   *                            (addition/updates) operations.
   * @param directory           Directory (HDFS/Local file system) where the hnsw index is stored.
   * @tparam T Type of item to index
   * @tparam D Type of distance
   * @return the loaded index, which is appendable, queryable, updatable and serializable
   */
  def loadMMappedIndex[T, D <: Distance[D]](
    dimension: Int,
    metric: Metric[D],
    injection: Injection[T, Array[Byte]],
    readWriteFuturePool: ReadWriteFuturePool,
    directory: AbstractFile
  ): Appendable[T, HnswParams, D]
    with Queryable[T, HnswParams, D]
    with Updatable[T]
    with Serialization =
    SerializableHnsw.loadMMappedBasedQueryableIndex[T, D](
      dimension,
      metric,
      injection,
      readWriteFuturePool,
      directory
    )
}