superduperdb.backends.base package#

Submodules#

superduperdb.backends.base.artifact module#

exception superduperdb.backends.base.artifact.ArtifactSavingError[source]#

Bases: Exception

class superduperdb.backends.base.artifact.ArtifactStore(conn: Any, name: str | None = None)[source]#

Bases: ABC

Abstraction for storing large artifacts separately from primary data.

Parameters:
  • conn – connection to the meta-data store

  • name – Name to identify DB using the connection

abstract _delete_bytes(file_id: str)[source]#

Delete artifact from artifact store.

Parameters:

file_id – File id used to identify artifact in store

abstract _load_bytes(file_id: str) bytes[source]#

Load bytes from artifact store.

Parameters:

file_id – Identifier of artifact in the store

delete(r: Dict)[source]#
abstract disconnect()[source]#

Disconnect the client

abstract drop(force: bool = False)[source]#

Drop the artifact store.

Parameters:

force – If True, don’t ask for confirmation

exists(file_id: str | None = None, datatype: str | None = None, uri: str | None = None)[source]#
load_artifact(r)[source]#

Load artifact from artifact store, and deserialize.

Parameters:
  • file_id – Identifier of artifact in the store

  • serializer – Serializer to use for deserialization

save(r: Dict) Dict[source]#

Save list of artifacts and replace the artifacts with file reference.

Parameters:

artifacts – List of Artifact instances

save_artifact(r: Dict)[source]#

Save serialized object in the artifact store.

Parameters:

r – dictionary with mandatory fields {'bytes', 'datatype'} and optional fields {'file_id', 'uri'}

property serializers#
abstract url()[source]#

Artifact store connection url

superduperdb.backends.base.backends module#

superduperdb.backends.base.compute module#

class superduperdb.backends.base.compute.ComputeBackend[source]#

Bases: ABC

Abstraction for sending jobs to a distributed compute platform.

abstract disconnect() None[source]#

Disconnect the client.

get_local_client()[source]#

Returns a local version of self

abstract property name: str#

Return the name of current compute engine

abstract result(identifier: str) Any[source]#

Retrieves the result of a previously submitted task. Note: This will block until the future is completed.

Parameters:

identifier – The identifier of the submitted task.

abstract shutdown() None[source]#

Shuts down the compute cluster.

abstract submit(function: Callable, **kwargs) Any[source]#

Submits a function to the Dask server for execution.

Parameters:
  • function – The function to be executed.

  • kwargs – Additional keyword arguments to be passed to the function.

abstract property tasks: Any#

List of all tasks

abstract property type: str#

Return the type of compute engine

abstract wait_all() None[source]#

Waits for all pending tasks to complete.

superduperdb.backends.base.data_backend module#

class superduperdb.backends.base.data_backend.BaseDataBackend(conn: Any, name: str)[source]#

Bases: ABC

abstract build_artifact_store()[source]#

Build a default artifact store based on current connection.

abstract build_metadata()[source]#

Build a default metadata store based on current connection.

create_model_table_or_collection(model: Model | APIModel)[source]#
property db#
abstract disconnect()[source]#

Disconnect the client

abstract drop(force: bool = False)[source]#

Drop the databackend.

abstract get_table_or_collection(identifier)[source]#
set_content_bytes(r, key, bytes_)[source]#
abstract url()[source]#

Databackend connection url

superduperdb.backends.base.metadata module#

class superduperdb.backends.base.metadata.MetaDataStore(conn: Any, name: str | None = None)[source]#

Bases: ABC

Abstraction for storing meta-data separately from primary data.

Parameters:
  • conn – connection to the meta-data store

  • name – Name to identify DB using the connection

add_query(query: Select, model: str)[source]#

Add query id to query table

abstract component_version_has_parents(type_id: str, identifier: str, version: int)[source]#

Check if a component version has parents.

Parameters:
  • type_id – type of component

  • identifier – identifier of component

  • version – version of component

abstract create_component(info: Dict)[source]#

Create a component in the metadata store.

Parameters:

info – dictionary containing information about the component.

abstract create_job(info: Dict)[source]#

Create a job in the metadata store.

Parameters:

info – dictionary containing information about the job.

abstract create_parent_child(parent: str, child: str)[source]#

Create a parent-child relationship between two components.

Parameters:
  • parent – parent component

  • child – child component

abstract delete_component_version(type_id: str, identifier: str, version: int)[source]#

Delete a component version from the metadata store.

Parameters:
  • type_id – type of component

  • identifier – identifier of component

  • version – version of component

abstract disconnect()[source]#

Disconnect the client

abstract drop(force: bool = False)[source]#

Drop the metadata store.

Parameters:

force – whether to force the drop (without confirmation)

get_component(type_id: str, identifier: str, version: int | None = None, allow_hidden: bool = False) Dict[str, Any][source]#

Get a component from the metadata store.

Parameters:
  • type_id – type of component

  • identifier – identifier of component

  • version – version of component

  • allow_hidden – whether to allow hidden components

abstract get_component_version_parents(unique_id: str)[source]#

Get the parents of a component version.

Parameters:

unique_id – unique identifier of component version

get_indexing_listener_of_vector_index(identifier: str, version: int | None = None)[source]#
abstract get_job(job_id: str)[source]#

Get a job from the metadata store.

Parameters:

job_id – job identifier

abstract get_latest_version(type_id: str, identifier: str, allow_hidden: bool = False)[source]#

Get the latest version of a component.

Parameters:
  • type_id – type of component

  • identifier – identifier of component

  • allow_hidden – whether to allow hidden components

abstract get_metadata(key)[source]#

Get metadata from the metadata store.

Parameters:

key – key of metadata

get_queries(model: str)[source]#

Get all queries from query table corresponding to the model.

abstract hide_component_version(type_id: str, identifier: str, version: int)[source]#

Hide a component version.

Parameters:
  • type_id – type of component

  • identifier – identifier of component

  • version – version of component

replace_object(info: Dict[str, Any], identifier: str, type_id: str, version: int | None = None) None[source]#

Replace an object in the metadata store.

Parameters:
  • info – dictionary containing information about the object

  • identifier – identifier of object

  • type_id – type of object

  • version – version of object

abstract show_component_versions(type_id: str, identifier: str)[source]#

Show all versions of a component in the metadata store.

Parameters:
  • type_id – type of component

  • identifier – identifier of component

abstract show_components(type_id: str, **kwargs)[source]#

Show all components in the metadata store.

Parameters:
  • type_id – type of component

  • **kwargs

    additional arguments

abstract show_jobs()[source]#

Show all jobs in the metadata store.

abstract update_job(job_id: str, key: str, value: Any)[source]#

Update a job in the metadata store.

Parameters:
  • job_id – job identifier

  • key – key to be updated

  • value – value to be updated

abstract update_metadata(key, value)[source]#

Update metadata in the metadata store.

Parameters:
  • key – Key of metadata

  • value – Value of metadata

update_object(identifier: str, type_id: str, key: str, value: str, version: int | None = None)[source]#

Update an object in the metadata store.

Parameters:
  • identifier – identifier of object

  • type_id – type of object

  • key – key to be updated

  • value – value to be updated

  • version – version of object

abstract url()[source]#

Metadata store connection url

watch_job(identifier: str)[source]#

Listen to a job.

Parameters:

identifier – job identifier

abstract write_output_to_job(identifier: str, msg: str, stream: str)[source]#

Write output to a job in the metadata store.

Parameters:
  • identifier – identifier of job

  • msg – message to be written

  • stream – stream to be written to

exception superduperdb.backends.base.metadata.NonExistentMetadataError[source]#

Bases: Exception

superduperdb.backends.base.query module#

class superduperdb.backends.base.query.CompoundSelect(table_or_collection: TableOrCollection, pre_like: Like | None = None, post_like: Like | None = None, query_linker: QueryLinker | None = None)[source]#

Bases: _ReprMixin, Select, ABC

A query with multiple parts.

like --> select --> like

Parameters:
  • table_or_collection – The table or collection that this query is linked to

  • pre_like – The pre_like part of the query (e.g. table.like(...)...)

  • post_like – The post_like part of the query (e.g. table.filter(...)....like(...))

  • query_linker – The query linker that is responsible for linking the query chain. E.g. table.filter(...).select(...).

  • i – The index of the query in the query chain

add_fold(fold: str)[source]#
abstract execute(db, load_hybrid: bool = True)[source]#

Execute the compound query on the DB instance.

Parameters:

db – The DB instance to use

property id_field#
like(r: Document, vector_index: str, n: int = 10)[source]#
abstract property output_fields#
post_like: Like | None = None#
pre_like: Like | None = None#
property primary_id#
query_linker: QueryLinker | None = None#
repr_()[source]#

String representation of the query.

property select_ids#

Query which selects the same documents/rows but only ids.

select_ids_of_missing_outputs(key: str, model: str, version: int)[source]#

Query which selects ids where outputs are missing.

select_single_id(id: str)[source]#

Query which selects a single id.

Parameters:

id – The id to select.

select_using_ids(ids)[source]#

Subset a query to only these ids.

Parameters:

ids – The ids to subset to.

table_or_collection: TableOrCollection#
class superduperdb.backends.base.query.Delete(table_or_collection: ~superduperdb.backends.base.query.TableOrCollection, args: ~typing.Sequence = <factory>, kwargs: ~typing.Dict = <factory>)[source]#

Bases: Serializable, ABC

Base class for all deletion queries

Parameters:

table_or_collection – The table or collection that this query is linked to

args: Sequence#
abstract execute(db)[source]#
kwargs: Dict#
table_or_collection: TableOrCollection#
class superduperdb.backends.base.query.Insert(table_or_collection: ~superduperdb.backends.base.query.TableOrCollection, documents: ~typing.Sequence[~superduperdb.base.document.Document] = <factory>, verbose: bool = True, kwargs: ~typing.Dict = <factory>)[source]#

Bases: _ReprMixin, Serializable, ABC

Base class for all insert queries.

Parameters:
  • table_or_collection – The table or collection that this query is linked to

  • documents – The documents to insert

  • refresh – Whether to refresh the task-graph after inserting

  • verbose – Whether to print the progress of the insert

  • kwargs – Any additional keyword arguments to pass to the insert method

  • encoders – The encoders to use to encode the documents

documents: Sequence[Document]#
abstract execute(parent: Any)[source]#

Insert the data.

Parameters:

parent – The parent instance to use for insertion

kwargs: Dict#
repr_()[source]#
abstract select_table()[source]#
table_or_collection: TableOrCollection#
to_select(ids=None)[source]#
verbose: bool = True#
class superduperdb.backends.base.query.Like(r: Dict | Document, vector_index: str, n: int = 10)[source]#

Bases: Serializable

Base class for all like (vector-search) queries.

Parameters:
  • r – The item to be converted to a vector, to search with.

  • vector_index – The vector index to use

  • n – The number of results to return

execute(db, ids: Sequence[str] | None = None)[source]#
n: int = 10#
r: Dict | Document#
vector_index: str#
class superduperdb.backends.base.query.QueryComponent(name: str, type: str = QueryType.ATTR, args: ~typing.Sequence = <factory>, kwargs: ~typing.Dict = <factory>)[source]#

Bases: Serializable

This is a representation of a single query object in an ibis query chain. This is used to build a query chain that can be executed on a database. Queries will be executed in the order they are added to the chain.

If we have a query chain like this:

query = t.select(['id', 'name']).limit(10)

here we have 2 query objects, select and limit.

select will be wrapped with this class and added to the chain.

Parameters:
  • name – The name of the query

  • type – The type of the query, either query or attr

  • args – The arguments to pass to the query

  • kwargs – The keyword arguments to pass to the query

args: Sequence#
execute(parent: Any)[source]#
kwargs: Dict#
name: str#
repr_() str[source]#
type: str = 'attr'#
class superduperdb.backends.base.query.QueryLinker(table_or_collection: ~superduperdb.backends.base.query.TableOrCollection, members: ~typing.List = <factory>)[source]#

Bases: _ReprMixin, Serializable, ABC

This class is responsible for linking together a query using getattr and __call__.

This allows superduperdb to serialize queries from a range of APIs. Intuitively this allows us to do something like this:

>>> collection.find({}).limit(10).sort('name')
-->
[
    ('<NAME>', <ARGS>, <KWARGS>),
    ('find', {}, None),
    ('limit', 10, None),
    ('sort', 'name', None),
]

table.filter(t.select('id') == '1')

Parameters:
  • table_or_collection – The table or collection that this query is linked to.

  • members – The members of the query chain.

abstract execute(db)[source]#
members: List#
property query_components#
repr_() str[source]#
abstract property select_ids#
abstract select_single_id(id)[source]#
abstract select_using_ids(ids)[source]#
table_or_collection: TableOrCollection#
class superduperdb.backends.base.query.QueryType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]#

Bases: str, Enum

The type of a query. Either query or attr.

ATTR = 'attr'#
QUERY = 'query'#
_generate_next_value_(start, count, last_values)#

Generate the next value when not given.

name: the name of the member
start: the initial start value or None
count: the number of existing members
last_values: the list of values assigned

class superduperdb.backends.base.query.RawQuery(query: Any)[source]#

Bases: object

abstract execute(db)[source]#

A raw query method which executes the query and returns the result

query: Any#
class superduperdb.backends.base.query.Select[source]#

Bases: Serializable, ABC

Base class for all select queries.

abstract add_fold(fold: str) Select[source]#
abstract execute(db, reference: bool = True)[source]#

Execute the query on the DB instance.

abstract property id_field#
model_update(db, ids: List[str], key: str, model: str, version: int, outputs: Sequence[Any], **kwargs)[source]#

Update model outputs for a set of ids.

Parameters:
  • db – The DB instance to use

  • ids – The ids to update

  • key – The key to update

  • model – The model to update

  • outputs – The outputs to update

property query_components#
abstract property select_ids: Select#
abstract select_ids_of_missing_outputs(key: str, model: str, version: int) Select[source]#
abstract select_single_id(id: str) Select[source]#
abstract property select_table#
abstract select_using_ids(ids: Sequence[str]) Select[source]#
class superduperdb.backends.base.query.TableOrCollection(identifier: str | Variable)[source]#

Bases: Serializable, ABC

This is a representation of an SQL table in ibis.

Parameters:

identifier – The name of the table

identifier: str | Variable#
abstract insert(documents: Sequence[Document], **kwargs) Insert[source]#
like(r: Document, vector_index: str, n: int = 10)[source]#

This method appends a query to the query chain where the query is responsible for performing a vector search on the parent query chain inputs.

Parameters:
  • r – The vector to search for

  • vector_index – The vector index to use

  • n – The number of results to return

abstract model_update(db, ids: List[Any], key: str, model: str, version: int, outputs: Sequence[Any], flatten: bool = False, **kwargs)[source]#
query_components: ClassVar[Dict] = {}#
type_id: ClassVar[str] = 'table_or_collection'#
class superduperdb.backends.base.query.Update(table_or_collection: TableOrCollection)[source]#

Bases: Serializable, ABC

Base class for all update queries

Parameters:

table_or_collection – The table or collection that this query is linked to

abstract execute(db)[source]#
abstract select_table()[source]#
table_or_collection: TableOrCollection#

Module contents#