Transfer learning
Configure your production system
If you would like to use the production features of SuperDuperDB, then you should set the relevant connections and configurations in a configuration file. Otherwise you are welcome to use "development" mode to get going with SuperDuperDB quickly.
import os
os.makedirs('.superduperdb', exist_ok=True)
os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'
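SuperDuperDB reads the SUPERDUPERDB_CONFIG environment variable to locate this file, so set it before importing superduperdb.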
- MongoDB Community
- MongoDB Atlas
- SQLite
- MySQL
- MSSQL
- PostgreSQL
- Snowflake
- Clickhouse
CFG = '''
data_backend: mongodb://127.0.0.1:27017/documents
artifact_store: filesystem://./artifact_store
cluster:
  cdc:
    strategy: null
    uri: http://127.0.0.1:20000
  compute:
    uri: ray://127.0.0.1:10001
  vector_search:
    backfill_batch_size: 100
    type: in_memory
    uri: http://127.0.0.1:21000
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
    type: native
data_backend: mongodb+srv://<user>:<password>@<mongo-host>/documents
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: sqlite://<path-to-db>.db
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: mysql://<user>:<password>@<host>:<port>/<database>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: mssql://<user>:<password>@<host>:<port>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: postgres://<user>:<password>@<host>:<port>/<database>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: snowflake://<user>:<password>@<account>/<database>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: clickhouse://<user>:<password>@<host>:<port>
'''
with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:
    f.write(CFG)
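As a quick sanity check, you can load the configuration back and confirm that SuperDuperDB picked it up. This is a minimal sketch: it assumes superduperdb is installed and that the YAML above parses cleanly.
from superduperdb import CFG as cfg  # aliased so the CFG string above isn't overwritten

# The data backend should match what was written to the config file
print(cfg.data_backend)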
Start your cluster
Starting a SuperDuperDB cluster is useful in production, and during model development when you want scalable compute, shared access to models for collaboration, and monitoring.
If you don't need this, then it is simpler to start in development mode.
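In development mode everything runs in a single local process, so there are no separate compute, change-data-capture, or vector-search services to manage.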
- Experimental Cluster
- Docker-Compose
!python -m superduperdb local-cluster up
!make testenv_image
!make testenv_init
Connect to SuperDuperDB
Note that this is only relevant if you are running SuperDuperDB in development mode. Otherwise refer to "Configure your production system" above.
- MongoDB
- SQLite
- MySQL
- MSSQL
- PostgreSQL
- Snowflake
- Clickhouse
- DuckDB
- Pandas
- MongoMock
from superduperdb import superduper
db = superduper('mongodb://localhost:27017/documents')
from superduperdb import superduper
db = superduper('sqlite://my_db.db')
from superduperdb import superduper
user = 'superduper'
password = 'superduper'
port = 3306
host = 'localhost'
database = 'test_db'
db = superduper(f"mysql://{user}:{password}@{host}:{port}/{database}")
from superduperdb import superduper
user = 'sa'
password = 'Superduper#1'
port = 1433
host = 'localhost'
db = superduper(f"mssql://{user}:{password}@{host}:{port}")
!pip install psycopg2
from superduperdb import superduper
user = 'postgres'
password = 'postgres'
port = 5432
host = 'localhost'
database = 'test_db'
db_uri = f"postgres://{user}:{password}@{host}:{port}/{database}"
db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))
from superduperdb import superduper
user = "superduperuser"
password = "superduperpassword"
account = "XXXX-XXXX"  # Snowflake account identifier, i.e. <organization>-<account-name>
database = "FREE_COMPANY_DATASET/PUBLIC"
snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"
db = superduper(
    snowflake_uri,
    metadata_store='sqlite:///your_database_name.db',
)
from superduperdb import superduper
user = 'default'
password = ''
port = 8123
host = 'localhost'
db = superduper(f"clickhouse://{user}:{password}@{host}:{port}", metadata_store=f'mongomock://meta')
from superduperdb import superduper
db = superduper('duckdb://mydb.duckdb')
from superduperdb import superduper
db = superduper(['my.csv'], metadata_store='mongomock://meta')
from superduperdb import superduper
db = superduper('mongomock:///test_db')
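Whichever backend you choose, superduper() returns the same db handle. Note that the analytical backends above (Snowflake, ClickHouse) pass a separate lightweight metadata_store, because SuperDuperDB keeps its component metadata in a store suited to frequent small reads and writes. As a quick sanity check that the connection works:
# Lists the models registered so far (an empty list on a fresh database)
print(db.show('model'))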
Get useful sample data
from superduperdb import dtype
- Text
- Image
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text.json
import json
with open('text.json', 'r') as f:
    data = json.load(f)
sample_datapoint = "What is mongodb?"
chunked_model_datatype = dtype('str')
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images.zip && unzip images.zip
import os
from PIL import Image
data = [f'images/{x}' for x in os.listdir('./images')]
data = [Image.open(path) for path in data]
sample_datapoint = data[-1]
from superduperdb.ext.pillow import pil_image
chunked_model_datatype = pil_image
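In both cases, data is now a list of raw datapoints (text strings or PIL images), ready to be inserted below.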
Setup tables or collections
- MongoDB
- SQL
# Note this is an optional step for MongoDB
# Users can also work directly with `DataType` if they want to add
# custom data
from superduperdb import Schema, DataType
from superduperdb.backends.mongodb import Collection
table_or_collection = Collection('documents')
USE_SCHEMA = False
if USE_SCHEMA and isinstance(datatype, DataType):
    # `datatype` is assumed to be defined in the sample-data step above
    schema = Schema(identifier='my_schema', fields={'x': datatype})
    db.apply(schema)
from superduperdb.backends.ibis import Table
from superduperdb import Schema, DataType
from superduperdb.backends.ibis.field_types import dtype
datatype = "str"
if isinstance(datatype, DataType):
    schema = Schema(identifier="schema", fields={"id": dtype("str"), "x": datatype})
else:
    schema = Schema(
        identifier="schema", fields={"id": dtype("str"), "x": dtype(datatype)}
    )
table_or_collection = Table('documents', schema=schema)
db.apply(table_or_collection)
Insert data
In order to insert data, we need to create a Schema for encoding our special Datatype column(s) in the databackend.
- MongoDB
- SQL
from superduperdb import Document
def do_insert(data):
    schema = None

    if schema is None and (datatype is None or isinstance(datatype, str)):
        data = [Document({'x': x}) for x in data]
        db.execute(table_or_collection.insert_many(data))
    elif schema is None and datatype is not None and isinstance(datatype, DataType):
        data = [Document({'x': datatype(x)}) for x in data]
        db.execute(table_or_collection.insert_many(data))
    else:
        data = [Document({'x': x}) for x in data]
        db.execute(table_or_collection.insert_many(data, schema='my_schema'))
from superduperdb import Document
def do_insert(data):
    db.execute(table_or_collection.insert([Document({'id': str(idx), 'x': x}) for idx, x in enumerate(data)]))
do_insert(data[:-len(data) // 4])
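Note that the slice data[:-len(data) // 4] inserts only the first three quarters of the datapoints; the held-back quarter can be inserted later, for example to confirm that listeners compute features for newly arriving data.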
Compute features
- Text
- Image
key = 'x'  # the field under which the sample text was inserted above

import sentence_transformers
from superduperdb import vector, Listener
from superduperdb.ext.sentence_transformers import SentenceTransformer

superdupermodel = SentenceTransformer(
    identifier="embedding",
    object=sentence_transformers.SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    datatype=vector(shape=(384,)),
    postprocess=lambda x: x.tolist(),
)

# `select` is the query over the inserted data,
# e.g. `table_or_collection.find()` on MongoDB
jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=select,
        key=key,
        identifier="features",
    )
)
key = 'x'  # the field under which the sample images were inserted above

import numpy as np
import torchvision.models as models
from torchvision import transforms
from superduperdb.ext.torch import TorchModel
from superduperdb import Listener
from PIL import Image

class TorchVisionEmbedding:
    def __init__(self):
        # Load the pre-trained ResNet-18 model
        self.resnet = models.resnet18(pretrained=True)

        # Set the model to evaluation mode
        self.resnet.eval()

    def preprocess(self, image_array):
        # Convert the raw array to a PIL image and apply standard ImageNet preprocessing
        image = Image.fromarray(image_array.astype(np.uint8))
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        tensor_image = preprocess(image)
        return tensor_image

model = TorchVisionEmbedding()
superdupermodel = TorchModel(
    identifier='my-vision-model-torch',
    object=model.resnet,
    preprocess=model.preprocess,
    postprocess=lambda x: x.numpy().tolist(),
)

# `select` is the query over the inserted data,
# e.g. `table_or_collection.find()` on MongoDB
jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=select,
        key=key,
        identifier="features",
    )
)
Choose input key
The input key to the fine-tuning model is the output of the previous listener:
input_key = listener.outputs
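You can inspect the resolved name directly; the exact format of the auto-generated output field varies between superduperdb versions:
# Auto-generated output field populated by the 'features' listener above
print(input_key)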
Build and train classifier
- Scikit-Learn
- Torch
from sklearn.linear_model import LogisticRegression
from superduperdb.backends.mongodb import Collection
from superduperdb.ext.sklearn.model import SklearnTrainer, Estimator

# Create a Logistic Regression model
model = LogisticRegression()
model = Estimator(
    object=model,
    identifier='my-model',
    trainer=SklearnTrainer(
        key=(input_key, 'y'),
        select=Collection('documents').find(),
    ),
)
import torch
from torch import nn
from superduperdb.backends.mongodb import Collection
from superduperdb.ext.torch.model import TorchModel
from superduperdb.ext.torch.training import TorchTrainer

class SimpleModel(nn.Module):
    def __init__(self, input_size=16, hidden_size=32, num_classes=3):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Loss function
def my_loss(X, y):
    return torch.nn.functional.binary_cross_entropy_with_logits(
        X[:, 0], y.type(torch.float)
    )

# Create the model and wrap it with a trainer
model = SimpleModel()
model = TorchModel(
    identifier='my-model',
    object=model,
    trainer=TorchTrainer(
        key=(input_key, 'y'),
        identifier='my_trainer',
        objective=my_loss,
        loader_kwargs={'batch_size': 10},
        max_iterations=100,
        validation_interval=10,
        select=Collection('documents').find(),
    ),
)
The following command adds the model to the system and trains it in one step.
db.apply(model)
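Once applied, the trained model (including its learned parameters) is persisted in the artifact store and can be fetched back by identifier. A minimal sketch, assuming training completed successfully:
# Reload the trained component by type ('model') and identifier
trained_model = db.load('model', 'my-model')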