superduperdb.components package#

Submodules#

superduperdb.components.component module#

The component module provides the base class for all components in SuperDuperDB.

class superduperdb.components.component.Component(identifier: str, artifacts: dataclasses.InitVar[Optional[Dict]] = None)[source]#

Bases: Serializable, Leaf

Parameters:

identifier – A unique identifier for the component

property artifact_schema#
artifacts: dc.InitVar[t.Optional[t.Dict]] = None#
create_validation_job(validation_set: t.Union[str, Dataset], metrics: t.Sequence[str]) → ComponentJob[source]#
property db#
classmethod decode(r, db: Any | None = None, reference: bool = False)[source]#
dict() → Document[source]#
encode(bytes_encoding: BytesEncoding | None = None, leaf_types_to_keep: Sequence = ())[source]#
export()[source]#
identifier: str#
static import_(path: str)[source]#
leaf_type: t.ClassVar[str] = 'component'#
classmethod make_unique_id(type_id: str, identifier: str, version: int) → str[source]#
on_load(db: Datalayer) → None[source]#

Called when this component is loaded from the data store

Parameters:

db – the db that loaded the component

post_create(db: Datalayer) → None[source]#

Called after the first time this component is created. Generally used when the logic depends on self.version.

Parameters:

db – the db that creates the component

pre_create(db: Datalayer) → None[source]#

Called the first time this component is created

Parameters:

db – the db that creates the component

schedule_jobs(db: Datalayer, dependencies: t.Sequence[Job] = (), verbose: bool = False) → t.Sequence[t.Any][source]#

Schedule jobs for this component

Parameters:
  • db – The db to process

  • dependencies – A sequence of dependencies

  • verbose – If True, print more information

set_post_init: t.ClassVar[t.Sequence] = ('version',)#
type_id: t.ClassVar[str]#
property unique_id: str#
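
For orientation, a minimal sketch of a custom component built on this base class; the class name, its type_id value, and my_setting are all hypothetical:

    import dataclasses as dc
    import typing as t

    from superduperdb.components.component import Component

    @dc.dataclass
    class MyComponent(Component):
        # hypothetical type_id under which this class is registered
        type_id: t.ClassVar[str] = 'my_component'
        my_setting: int = 1

    c = MyComponent(identifier='example')
    # c.unique_id becomes available once the component is saved and versioned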

superduperdb.components.dataset module#

class superduperdb.components.dataset.Dataset(identifier: str, artifacts: dataclasses.InitVar[Optional[Dict]] = None, *, select: Select | None = None, sample_size: int | None = None, random_seed: int | None = None, creation_date: str | None = None, raw_data: Sequence[Any] | None = None)[source]#

Bases: Component

A dataset is an immutable collection of documents.

Parameters:
  • identifier – A unique identifier for the component

  • select – A query to select the documents for the dataset

  • sample_size – The number of documents to sample from the query

  • random_seed – The random seed to use for sampling

  • creation_date – The date the dataset was created

  • raw_data – The raw data for the dataset

creation_date: t.Optional[str] = None#
identifier: str#
on_load(db: Datalayer) → None[source]#

Called when this component is loaded from the data store

Parameters:

db – the db that loaded the component

post_create(db: Datalayer) → None[source]#

Called after the first time this component is created. Generally used when the logic depends on self.version.

Parameters:

db – the db that creates the component

pre_create(db: Datalayer) → None[source]#

Called the first time this component is created

Parameters:

db – the db that creates the component

property random#
random_seed: t.Optional[int] = None#
raw_data: t.Optional[t.Sequence[t.Any]] = None#
sample_size: t.Optional[int] = None#
select: t.Optional[Select] = None#
type_id: t.ClassVar[str] = 'dataset'#
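
A hedged example of capturing an immutable validation set from a query; the Collection import path and the 'documents' collection are assumptions based on the MongoDB backend:

    from superduperdb.backends.mongodb import Collection
    from superduperdb.components.dataset import Dataset

    valid_set = Dataset(
        identifier='my-valid-set',
        select=Collection('documents').find({'_fold': 'valid'}),  # hypothetical filter
        sample_size=100,
        random_seed=42,
    )
    # db.add(valid_set) snapshots the selected documents into raw_data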

superduperdb.components.datatype module#

class superduperdb.components.datatype.DataType(identifier: str, artifacts: dataclasses.InitVar[typing.Optional[typing.Dict]] = None, *, encoder: ~typing.Callable = <function dill_encode>, decoder: ~typing.Callable = <function dill_decode>, info: ~typing.Dict | None = None, shape: ~typing.Sequence | None = None, artifact: bool = False, reference: bool = False, directory: str | None = None)[source]#

Bases: Component

Parameters:
  • identifier – A unique identifier for the component

  • decoder – Callable converting a bytes string to an Encodable of this DataType

  • encoder – Callable converting an Encodable of this DataType to bytes

  • shape – Shape of the data

artifact: bool = False#
decoder(info: Dict | None = None) → Any#
directory: str | None = None#
encoder(info: Dict | None = None) → bytes#
identifier: str#
info: Dict | None = None#
reference: bool = False#
shape: Sequence | None = None#
type_id: ClassVar[str] = 'datatype'#
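
A sketch of a custom DataType built from the pickle codecs defined later in this module; the identifier is hypothetical:

    from superduperdb.components.datatype import DataType, pickle_decode, pickle_encode

    pickle_datatype = DataType(
        identifier='my-pickle',
        encoder=pickle_encode,
        decoder=pickle_decode,
    )
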
class superduperdb.components.datatype.DecodeTorchStateDict(cls)[source]#

Bases: object

class superduperdb.components.datatype.Encodable(datatype: DataType, x: Any | None = None, uri: str | None = None)[source]#

Bases: Leaf

Data variable wrapping an encodable item. Encoding is controlled by the referenced DataType instance.

Parameters:
  • datatype – Instance of DataType controlling encoding

  • x – Wrapped content

  • uri – URI of the content, if any

property artifact#
datatype: DataType#
classmethod decode(r, db, reference: bool = False)[source]#
encode(bytes_encoding: BytesEncoding | None = None, leaf_types_to_keep: Sequence = ()) → str | None | Dict[str, Any][source]#
leaf_type: ClassVar[str] = 'encodable'#
property reference#
property unique_id#
uri: str | None = None#
x: Any | None = None#
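
A sketch of wrapping a raw value in an Encodable so its serialization is delegated to a DataType; pickle_datatype is the hypothetical instance from the example above:

    from superduperdb.components.datatype import Encodable

    e = Encodable(datatype=pickle_datatype, x={'a': 1})
    r = e.encode()  # a serializable representation embedding the encoded bytes
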
superduperdb.components.datatype.Encoder#

alias of DataType

class superduperdb.components.datatype.LazyLoader(info: Dict)[source]#

Bases: object

info: Dict#
superduperdb.components.datatype.build_torch_state_serializer(module, info)[source]#
superduperdb.components.datatype.dill_decode(b: bytes, info: Dict | None = None) → Any[source]#
superduperdb.components.datatype.dill_encode(object: Any, info: Dict | None = None) → bytes[source]#
superduperdb.components.datatype.encode_torch_state_dict(module, info)[source]#
superduperdb.components.datatype.from_base64(encoded)[source]#
superduperdb.components.datatype.pickle_decode(b: bytes, info: Dict | None = None) → Any[source]#
superduperdb.components.datatype.pickle_encode(object: Any, info: Dict | None = None) → bytes[source]#
superduperdb.components.datatype.to_base64(bytes)[source]#
superduperdb.components.datatype.torch_decode(b: bytes, info: Dict | None = None) → Any[source]#
superduperdb.components.datatype.torch_encode(object: Any, info: Dict | None = None) → bytes[source]#
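
A small round-trip showing how the paired codecs above compose, assuming to_base64 and from_base64 invert each other as their names suggest:

    from superduperdb.components.datatype import (
        from_base64,
        pickle_decode,
        pickle_encode,
        to_base64,
    )

    obj = {'a': 1}
    b = pickle_encode(obj)   # bytes
    s = to_base64(b)         # text-safe representation
    assert pickle_decode(from_base64(s)) == obj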

superduperdb.components.listener module#

class superduperdb.components.listener.Listener(artifacts: dataclasses.InitVar[typing.Optional[typing.Dict]] = None, *, identifier: str | None = None, key: str, model: str | ~superduperdb.components.model.Model, select: ~superduperdb.backends.base.query.CompoundSelect, active: bool = True, predict_kwargs: ~typing.Dict | None = <factory>)[source]#

Bases: Component

Listener object which is used to process a column or key of a collection or table, and store the outputs.

Parameters:
  • identifier – A unique identifier for the component

  • key – Key to be bound to model

  • model – Model for processing data

  • select – Object for selecting which data is processed

  • active – Toggle to False to deactivate change data triggering

  • predict_kwargs – Keyword arguments to self.model.predict

active: bool = True#
cleanup(database: Datalayer) → None[source]#

Clean up when the listener is deleted

Parameters:

database – The DB instance to process

property dependencies: List[str]#
property id_key: str#
identifier: str | None = None#
key: str#
model: str | Model#
property outputs#
post_create(db: Datalayer) → None[source]#

Called after the first time this component is created. Generally used when the logic depends on self.version.

Parameters:

db – the db that creates the component

pre_create(db: Datalayer) → None[source]#

Called the first time this component is created

Parameters:

db – the db that creates the component

predict_kwargs: Dict | None#
schedule_jobs(db: Datalayer, dependencies: Sequence[Job] = (), verbose: bool = False) → Sequence[Any][source]#

Schedule jobs for the listener

Parameters:
  • db – The DB instance to process

  • dependencies – A list of dependencies

  • verbose – Whether to print verbose output

select: CompoundSelect#
type_id: ClassVar[str] = 'listener'#
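
A sketch of binding a model to a key; 'my-model' is assumed to be the identifier of a model already added to the db, and the Collection query is a MongoDB-backend assumption:

    from superduperdb.backends.mongodb import Collection
    from superduperdb.components.listener import Listener

    listener = Listener(
        model='my-model',  # hypothetical model identifier
        key='txt',         # hypothetical document field to process
        select=Collection('documents').find(),
    )
    # db.add(listener) computes outputs for existing data and, while
    # active=True, for future inserts as well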

superduperdb.components.metric module#

class superduperdb.components.metric.Metric(identifier: str, artifacts: dataclasses.InitVar[Optional[Dict]] = None, *, object: Callable)[source]#

Bases: Component

Metric base object with which to evaluate performance on a dataset. These objects are callable; they are applied row-wise to the data and the results are averaged.

Parameters:
  • identifier – A unique identifier for the component

  • object – callable or Artifact to be applied to the data

public_api(beta): This API is in beta and may change before becoming stable.

identifier: str#
object: Callable#
type_id: ClassVar[str] = 'metric'#
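
Since metrics are applied row-wise and averaged, a plain callable suffices; a minimal sketch:

    from superduperdb.components.metric import Metric

    accuracy = Metric(
        identifier='accuracy',
        object=lambda x, y: x == y,  # applied per row, then averaged
    )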

superduperdb.components.model module#

class superduperdb.components.model.APIModel(identifier: str, artifacts: dc.InitVar[t.Optional[t.Dict]] = None, *, datatype: EncoderArg = None, output_schema: t.Optional[Schema] = None, flatten: bool = False, preprocess: t.Optional[t.Callable] = None, postprocess: t.Optional[t.Callable] = None, collate_fn: t.Optional[t.Callable] = None, batch_predict: bool = False, takes_context: bool = False, metrics: t.Sequence[t.Union[str, Metric, None]] = (), model_update_kwargs: t.Dict = <factory>, validation_sets: t.Optional[t.Sequence[t.Union[str, Dataset]]] = None, predict_X: t.Optional[str] = None, predict_select: t.Optional[CompoundSelect] = None, predict_max_chunk_size: t.Optional[int] = None, predict_kwargs: t.Optional[t.Dict] = None, model: t.Optional[str] = None)[source]#

Bases: Component, _Predictor

Parameters:
  • identifier – A unique identifier for the component

  • datatype – DataType instance

  • output_schema – Output schema (mapping of encoders)

  • flatten – Flatten the model outputs

  • preprocess – Preprocess function

  • postprocess – Postprocess function

  • collate_fn – Collate function

  • batch_predict – Whether to batch predict

  • takes_context – Whether the model takes context into account

  • metrics – The metrics to evaluate on

  • model_update_kwargs – The kwargs to use for model update

  • validation_sets – The validation Dataset instances to use

  • predict_X – The key of the input data to use for .predict

  • predict_select – The select to use for .predict

  • predict_max_chunk_size – The max chunk size to use for .predict

  • predict_kwargs – The kwargs to use for .predict

  • model – The model to use, e.g. 'text-embedding-ada-002'

public_api(beta): This API is in beta and may change before becoming stable.

model: t.Optional[str] = None#
post_create(db: Datalayer) → None[source]#

Called after the first time this component is created. Generally used when the logic depends on self.version.

Parameters:

db – the db that creates the component

schedule_jobs(db: Datalayer, dependencies: t.Sequence[Job] = (), verbose: bool = False) → t.Sequence[t.Any][source]#

Schedule jobs for this model

Parameters:
  • db – The db to process

  • dependencies – A sequence of dependencies

  • verbose – If True, print more information
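
A hedged sketch of constructing an API-backed model directly; in practice the provider integrations subclass APIModel, and the model name follows the example in the parameter docs:

    from superduperdb.components.model import APIModel

    embedder = APIModel(
        identifier='my-embedding',
        model='text-embedding-ada-002',
    )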

class superduperdb.components.model.Model(identifier: str, artifacts: dc.InitVar[t.Optional[t.Dict]] = None, *, datatype: EncoderArg = None, output_schema: t.Optional[Schema] = None, flatten: bool = False, preprocess: t.Optional[t.Callable] = None, postprocess: t.Optional[t.Callable] = None, collate_fn: t.Optional[t.Callable] = None, batch_predict: bool = False, takes_context: bool = False, metrics: t.Sequence[t.Union[str, Metric, None]] = (), model_update_kwargs: dict = <factory>, validation_sets: t.Optional[t.Sequence[t.Union[str, Dataset]]] = None, predict_X: t.Optional[str] = None, predict_select: t.Optional[CompoundSelect] = None, predict_max_chunk_size: t.Optional[int] = None, predict_kwargs: t.Optional[t.Dict] = None, object: t.Any, model_to_device_method: t.Optional[str] = None, metric_values: t.Optional[t.Dict] = <factory>, predict_method: t.Optional[str] = None, device: str = 'cpu', preferred_devices: t.Union[None, t.Sequence[str]] = ('cuda', 'mps', 'cpu'), training_configuration: t.Union[str, _TrainingConfiguration, None] = None, train_X: t.Optional[str] = None, train_y: t.Optional[str] = None, train_select: t.Optional[CompoundSelect] = None)[source]#

Bases: _Predictor, Component

Model component which wraps a model to make it serializable

Parameters:
  • identifier – A unique identifier for the component

  • datatype – DataType instance

  • output_schema – Output schema (mapping of encoders)

  • flatten – Flatten the model outputs

  • preprocess – Preprocess function

  • postprocess – Postprocess function

  • collate_fn – Collate function

  • batch_predict – Whether to batch predict

  • takes_context – Whether the model takes context into account

  • metrics – The metrics to evaluate on

  • model_update_kwargs – The kwargs to use for model update

  • validation_sets – The validation Dataset instances to use

  • predict_X – The key of the input data to use for .predict

  • predict_select – The select to use for .predict

  • predict_max_chunk_size – The max chunk size to use for .predict

  • predict_kwargs – The kwargs to use for .predict

  • object – Model object, e.g. a scikit-learn estimator

  • model_to_device_method – The method to transfer the model to a device

  • metric_values – The metric values

  • predict_method – The method to use for prediction

  • device – The device to use

  • preferred_devices – The preferred devices to use

  • training_configuration – The training configuration

  • train_X – The key of the input data to use for training

  • train_y – The key of the target data to use for training

  • train_select – The select to use for training

append_metrics(d: Dict[str, float]) → None[source]#
create_fit_job(X: str | Sequence[str], select: Select | None = None, y: str | None = None, **kwargs)[source]#
device: str = 'cpu'#
fit(X: t.Any, y: t.Optional[t.Any] = None, configuration: t.Optional[_TrainingConfiguration] = None, data_prefetch: bool = False, db: t.Optional[Datalayer] = None, dependencies: t.Sequence[Job] = (), metrics: t.Optional[t.Sequence[Metric]] = None, select: t.Optional[Select] = None, validation_sets: t.Optional[t.Sequence[t.Union[str, Dataset]]] = None, **kwargs) → t.Optional[Pipeline][source]#

Fit the model on the given data.

Parameters:
  • X – The key of the input data to use for training

  • y – The key of the target data to use for training

  • configuration – The training configuration (optional)

  • data_prefetch – Whether to prefetch the data (optional)

  • db – The datalayer (optional)

  • dependencies – The dependencies (optional)

  • metrics – The metrics to evaluate on (optional)

  • select – The select to use for training (optional)

  • validation_sets – The validation Dataset instances to use (optional)

identifier: str#
metric_values: t.Optional[t.Dict]#
model_to_device_method: t.Optional[str] = None#
model_update_kwargs: dict#
object: t.Any#
on_load(db: Datalayer) → None[source]#

Called when this component is loaded from the data store

Parameters:

db – the db that loaded the component

post_create(db: Datalayer) → None[source]#

Called after the first time this component is created. Generally used when the logic depends on self.version.

Parameters:

db – the db that creates the component

pre_create(db: Datalayer)[source]#

Called the first time this component is created

Parameters:

db – the db that creates the component

predict_method: t.Optional[str] = None#
preferred_devices: t.Union[None, t.Sequence[str]] = ('cuda', 'mps', 'cpu')#
schedule_jobs(db: Datalayer, dependencies: t.Sequence[Job] = (), verbose: bool = False) → t.Sequence[t.Any][source]#

Schedule jobs for this model

Parameters:
  • db – The db to process

  • dependencies – A sequence of dependencies

  • verbose – If True, print more information

to_call(X, *args, **kwargs)[source]#

The method to use to call prediction. Should be implemented by the child class.

train_X: t.Optional[str] = None#
train_select: t.Optional[CompoundSelect] = None#
train_y: t.Optional[str] = None#
training_configuration: t.Union[str, _TrainingConfiguration, None] = None#
property training_keys: List#
type_id: t.ClassVar[str] = 'model'#
validate(db, validation_set: t.Union[Dataset, str], metrics: t.Sequence[Metric])[source]#
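
A sketch wrapping a scikit-learn estimator, per the object parameter's example; predict_method names the attribute invoked on the wrapped object, and db/select are assumed to exist:

    from sklearn.svm import SVC
    from superduperdb.components.model import Model

    model = Model(
        identifier='my-svc',
        object=SVC(),
        predict_method='predict',
    )
    # model.fit('X', y='y', db=db, select=...) trains on selected documents;
    # model.predict('X', db=db, select=...) stores outputs back in the db
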
class superduperdb.components.model.QueryModel(identifier: str, artifacts: dc.InitVar[t.Optional[t.Dict]] = None, *, datatype: EncoderArg = None, output_schema: t.Optional[Schema] = None, flatten: bool = False, preprocess: t.Optional[t.Callable] = None, postprocess: t.Optional[t.Callable] = None, collate_fn: t.Optional[t.Callable] = None, batch_predict: bool = False, takes_context: bool = False, metrics: t.Sequence[t.Union[str, Metric, None]] = (), model_update_kwargs: t.Dict = <factory>, validation_sets: t.Optional[t.Sequence[t.Union[str, Dataset]]] = None, predict_X: t.Optional[str] = None, predict_select: t.Optional[CompoundSelect] = None, predict_max_chunk_size: t.Optional[int] = None, predict_kwargs: t.Optional[t.Dict] = None, select: CompoundSelect)[source]#

Bases: Component, _Predictor

Model which can be used to query data and return those results as pre-computed queries.

Parameters:

select – query used to find data (can include like)

schedule_jobs(db: Datalayer, dependencies: t.Sequence[Job] = (), verbose: bool = False) → t.Sequence[t.Any][source]#

Schedule jobs for this model

Parameters:
  • db – The db to process

  • dependencies – A sequence of dependencies

  • verbose – If True, print more information

select: CompoundSelect#
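
A sketch wrapping a fixed query so its results are returned as predictions; the collection name and filter are hypothetical:

    from superduperdb.backends.mongodb import Collection
    from superduperdb.components.model import QueryModel

    query_model = QueryModel(
        identifier='my-query-model',
        select=Collection('documents').find({'split': 'train'}),
    )
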
class superduperdb.components.model.SequentialModel(identifier: str, artifacts: dc.InitVar[t.Optional[t.Dict]] = None, *, datatype: EncoderArg = None, output_schema: t.Optional[Schema] = None, flatten: bool = False, preprocess: t.Optional[t.Callable] = None, postprocess: t.Optional[t.Callable] = None, collate_fn: t.Optional[t.Callable] = None, batch_predict: bool = False, takes_context: bool = False, metrics: t.Sequence[t.Union[str, Metric, None]] = (), model_update_kwargs: t.Dict = <factory>, validation_sets: t.Optional[t.Sequence[t.Union[str, Dataset]]] = None, predict_X: t.Optional[str] = None, predict_select: t.Optional[CompoundSelect] = None, predict_max_chunk_size: t.Optional[int] = None, predict_kwargs: t.Optional[t.Dict] = None, predictors: t.List[t.Union[str, Model, APIModel]])[source]#

Bases: Component, _Predictor

Sequential model component which wraps a sequence of models to make them serializable

Parameters:
  • identifier – A unique identifier for the component

  • datatype – DataType instance

  • output_schema – Output schema (mapping of encoders)

  • flatten – Flatten the model outputs

  • preprocess – Preprocess function

  • postprocess – Postprocess function

  • collate_fn – Collate function

  • batch_predict – Whether to batch predict

  • takes_context – Whether the model takes context into account

  • metrics – The metrics to evaluate on

  • model_update_kwargs – The kwargs to use for model update

  • validation_sets – The validation Dataset instances to use

  • predict_X – The key of the input data to use for .predict

  • predict_select – The select to use for .predict

  • predict_max_chunk_size – The max chunk size to use for .predict

  • predict_kwargs – The kwargs to use for .predict

  • predictors – A list of predictors to use

on_load(db: Datalayer)[source]#

Called when this component is loaded from the data store

Parameters:

db – the db that loaded the component

post_create(db: Datalayer)[source]#

Called after the first time this component is created. Generally used when the logic depends on self.version.

Parameters:

db – the db that creates the component

predictors: t.List[t.Union[str, Model, APIModel]]#
schedule_jobs(db: Datalayer, dependencies: t.Sequence[Job] = (), verbose: bool = False) → t.Sequence[t.Any][source]#

Schedule jobs for this model

Parameters:
  • db – The db to process

  • dependencies – A sequence of dependencies

  • verbose – If True, print more information
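
A sketch chaining previously-added predictors by identifier; both identifiers are hypothetical:

    from superduperdb.components.model import SequentialModel

    pipeline = SequentialModel(
        identifier='my-pipeline',
        predictors=['preprocess-model', 'embed-model'],
    )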

superduperdb.components.model.TrainingConfiguration(identifier: str, artifacts: dc.InitVar[t.Optional[t.Dict]] = None, *, kwargs: t.Optional[t.Dict] = None) → None[source]#

Training configuration object, containing all settings necessary for a particular learning-task use-case to be serialized and initiated. The object is callable and returns a class which may be invoked to apply training.

Parameters:

**kwargs – Key-value pairs, the variables which configure training.

class superduperdb.components.model._Predictor(*, datatype: EncoderArg = None, output_schema: t.Optional[Schema] = None, flatten: bool = False, preprocess: t.Optional[t.Callable] = None, postprocess: t.Optional[t.Callable] = None, collate_fn: t.Optional[t.Callable] = None, batch_predict: bool = False, takes_context: bool = False, metrics: t.Sequence[t.Union[str, Metric, None]] = (), model_update_kwargs: t.Dict = <factory>, validation_sets: t.Optional[t.Sequence[t.Union[str, Dataset]]] = None, predict_X: t.Optional[str] = None, predict_select: t.Optional[CompoundSelect] = None, predict_max_chunk_size: t.Optional[int] = None, predict_kwargs: t.Optional[t.Dict] = None)[source]#

Bases: object

Parameters:
  • datatype – DataType instance

  • output_schema – Output schema (mapping of encoders)

  • flatten – Flatten the model outputs

  • preprocess – Preprocess function

  • postprocess – Postprocess function

  • collate_fn – Collate function

  • batch_predict – Whether to batch predict

  • takes_context – Whether the model takes context into account

  • metrics – The metrics to evaluate on

  • model_update_kwargs – The kwargs to use for model update

  • validation_sets – The validation Dataset instances to use

  • predict_X – The key of the input data to use for .predict

  • predict_select – The select to use for .predict

  • predict_max_chunk_size – The max chunk size to use for .predict

  • predict_kwargs – The kwargs to use for .predict

async apredict(X: Any, context: Dict | None = None, one: bool = False, **kwargs)[source]#
batch_predict: bool = False#
collate_fn: t.Optional[t.Callable] = None#
create_predict_job(X: str, select: Select | None = None, ids: Sequence[str] | None = None, max_chunk_size: int | None = None, **kwargs)[source]#
datatype: EncoderArg = None#
flatten: bool = False#
metrics: t.Sequence[t.Union[str, Metric, None]] = ()#
model_update_kwargs: t.Dict#
output_schema: t.Optional[Schema] = None#
postprocess: t.Optional[t.Callable] = None#
predict(X: t.Any, db: t.Optional[Datalayer] = None, select: t.Optional[CompoundSelect] = None, ids: t.Optional[t.List[str]] = None, max_chunk_size: t.Optional[int] = None, dependencies: t.Sequence[Job] = (), listen: bool = False, one: bool = False, context: t.Optional[t.Dict] = None, insert_to: t.Optional[t.Union[TableOrCollection, str]] = None, key: t.Optional[str] = None, in_memory: bool = True, overwrite: bool = False, **kwargs) → t.Any[source]#
predict_X: t.Optional[str] = None#
predict_kwargs: t.Optional[t.Dict] = None#
predict_max_chunk_size: t.Optional[int] = None#
predict_select: t.Optional[CompoundSelect] = None#
preprocess: t.Optional[t.Callable] = None#
takes_context: bool = False#
abstract to_call(X, *args, **kwargs)[source]#

The method to use to call prediction. Should be implemented by the child class.

type_id: t.ClassVar[str] = 'model'#
validation_sets: t.Optional[t.Sequence[t.Union[str, Dataset]]] = None#
class superduperdb.components.model._TrainingConfiguration(identifier: str, artifacts: dataclasses.InitVar[Optional[Dict]] = None, *, kwargs: Dict | None = None)[source]#

Bases: Component

Training configuration object, containing all settings necessary for a particular learning-task use-case to be serialized and initiated. The object is callable and returns a class which may be invoked to apply training.

Parameters:

**kwargs – Key-value pairs, the variables which configure training.

get(k, default=None)[source]#
kwargs: t.Optional[t.Dict] = None#
type_id: t.ClassVar[str] = 'training_configuration'#
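
A sketch bundling training settings; the keys are hypothetical and meaningful only to the training routine that consumes them:

    from superduperdb.components.model import TrainingConfiguration

    config = TrainingConfiguration(
        identifier='my-training-config',
        kwargs={'lr': 1e-4, 'epochs': 3},
    )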

superduperdb.components.schema module#

class superduperdb.components.schema.Schema(identifier: str, artifacts: dataclasses.InitVar[Optional[Dict]] = None, *, fields: Mapping[str, DataType | str])[source]#

Bases: Component

A component carrying the information about the types or Encoders of a Table

Parameters:
  • identifier – A unique identifier for the component

  • fields – A mapping of field names to types or Encoders

public_api(beta): This API is in beta and may change before becoming stable.

__call__(data: Mapping[str, Any])[source]#

Encode data using the schema’s encoders

Parameters:

data – data to encode

decode_data(data: Mapping[str, Any]) → Mapping[str, Any][source]#

Decode data using the schema’s encoders

Parameters:

data – data to decode

property encoded_types#
property encoders#
fields: Mapping[str, DataType | str]#
identifier: str#
pre_create(db) → None[source]#

Called the first time this component is created

Parameters:

db – the db that creates the component

property raw#
property trivial#
type_id: ClassVar[str] = 'schema'#
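
A sketch mapping field names to DataTypes or their string identifiers; pickle_datatype reuses the hypothetical DataType from the datatype module examples:

    from superduperdb.components.schema import Schema

    schema = Schema(
        identifier='my-schema',
        fields={
            'title': 'str',              # referenced by identifier (assumption)
            'payload': pickle_datatype,  # encoded via the DataType
        },
    )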

superduperdb.components.stack module#

class superduperdb.components.stack.Stack(identifier: str, artifacts: dataclasses.InitVar[Optional[Dict]] = None, *, components: Sequence[Component] = ())[source]#

Bases: Component

A placeholder that holds a list of components under a namespace and packages them as a tarball. This tarball can be restored to a Stack instance with the load method.

Parameters:
  • identifier – A unique identifier for the component

  • components – List of components to stack together and add to database.

public_api(alpha): This API is in alpha and may change before becoming stable.

components: Sequence[Component] = ()#
type_id: ClassVar[str] = 'stack'#
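
A sketch packaging related components under one namespace; model and listener refer to components constructed as in the earlier examples:

    from superduperdb.components.stack import Stack

    stack = Stack(
        identifier='my-stack',
        components=[model, listener],
    )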

superduperdb.components.vector_index module#

class superduperdb.components.vector_index.DecodeArray(dtype)[source]#

Bases: object

class superduperdb.components.vector_index.EncodeArray(dtype)[source]#

Bases: object

class superduperdb.components.vector_index.VectorIndex(identifier: str, artifacts: dataclasses.InitVar[typing.Optional[typing.Dict]] = None, *, indexing_listener: ~superduperdb.components.listener.Listener | str, compatible_listener: None | ~superduperdb.components.listener.Listener | str = None, measure: ~superduperdb.vector_search.base.VectorIndexMeasureType = VectorIndexMeasureType.cosine, metric_values: ~typing.Dict | None = <factory>)[source]#

Bases: Component

A component carrying the information to apply a vector index to a DB instance

Parameters:
  • identifier – A unique identifier for the component

  • indexing_listener – Listener which creates the vectors to be indexed

  • compatible_listener – Listener which is applied to vectors to be compared

  • measure – Measure to use for comparison

  • metric_values – Metric values for this index

compatible_listener: None | Listener | str = None#
property dimensions: int#
get_nearest(like: Document, db: Any, id_field: str = '_id', outputs: Dict | None = None, ids: Sequence[str] | None = None, n: int = 100) → Tuple[List[str], List[float]][source]#

Given a document, find the nearest results in this vector index, returned as two parallel lists of result IDs and scores

Parameters:
  • like – The document to compare against

  • db – The datastore to use

  • outputs – An optional dictionary

  • ids – A list of ids to match

  • n – Number of items to return

get_vector(like: Document, models: List[str], keys: List[str], db: Any = None, outputs: Dict | None = None)[source]#
identifier: str#
indexing_listener: Listener | str#
measure: VectorIndexMeasureType = 'cosine'#
metric_values: Dict | None#
property models_keys: Tuple[List[str], List[str]]#

Return the models and keys of the listeners as two parallel lists

on_load(db: Datalayer) → None[source]#

Called when this component is loaded from the data store

Parameters:

db – the db that loaded the component

type_id: ClassVar[str] = 'vector_index'#
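
A sketch of indexing a listener's vector outputs; 'my-listener' is hypothetical, and passing measure as the string 'cosine' mirrors the default shown above:

    from superduperdb.components.vector_index import VectorIndex

    index = VectorIndex(
        identifier='my-index',
        indexing_listener='my-listener',
        measure='cosine',
    )
    # db.add(index) enables nearest-neighbour queries over the listener's outputs
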
superduperdb.components.vector_index.sqlvector(shape)[source]#

Create an encoder for a vector (a list of ints/floats) of a given shape, compatible with SQL databases.

Parameters:

shape – The shape of the vector

superduperdb.components.vector_index.vector(shape)[source]#

Create an encoder for a vector (a list of ints/floats) of a given shape

Parameters:

shape – The shape of the vector
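
A sketch creating a datatype for fixed-shape embeddings, e.g. to pass as a model's datatype; the dimension is hypothetical:

    from superduperdb.components.vector_index import vector

    vec = vector(shape=(768,))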

Module contents#

The core package provides the core functionality of SuperDuperDB. This includes the main wrappers and classes for communicating with the database and for defining AI functionality.