Transfer learning

Configure your production system

note

If you would like to use the production features of SuperDuperDB, set the relevant connections and configurations in a configuration file. Otherwise, you can use "development" mode to get going with SuperDuperDB quickly.

import os

# Point SuperDuperDB at the config file before the library is first imported.
os.makedirs('.superduperdb', exist_ok=True)
os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'

# MongoDB data backend, filesystem artifact store, Ray compute,
# and in-memory vector search.
CFG = '''
data_backend: mongodb://127.0.0.1:27017/documents
artifact_store: filesystem://./artifact_store
cluster:
  cdc:
    strategy: null
    uri: ray://127.0.0.1:20000
  compute:
    uri: ray://127.0.0.1:10001
  vector_search:
    backfill_batch_size: 100
    type: in_memory
    uri: http://127.0.0.1:21000
'''

with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:
    f.write(CFG)
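To check that the file will be picked up, you can inspect the resolved settings after importing the library. A minimal sketch, assuming `superduperdb.CFG` reflects the file pointed to by `SUPERDUPERDB_CONFIG` (the environment variable must be set before the first import):

import superduperdb

# Note: `superduperdb.CFG` (the resolved settings object) is distinct from the
# local `CFG` string above. Assumes the environment variable was set first.
print(superduperdb.CFG.data_backend)  # should echo the MongoDB URI from the file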

Start your cluster

note

Starting a SuperDuperDB cluster is useful in production and during model development if you want scalable compute, multi-user access to models for collaboration, and monitoring.

If you don't need this, then it is simpler to start in development mode.

!python -m superduperdb local-cluster up        

Connect to SuperDuperDB

note

This is only relevant if you are running SuperDuperDB in development mode. Otherwise, refer to "Configure your production system" above.

from superduperdb import superduper

db = superduper('mongodb://localhost:27017/documents')
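A quick way to verify the connection is to list what is already registered in the system. A minimal sketch using `db.show`:

# Sanity check: list registered models (empty on a fresh database).
print(db.show('model'))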

Get useful sample data

!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text_classification.json

import json

with open("text_classification.json", "r") as f:
    data = json.load(f)

sample_datapoint = data[-1]
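Before setting up tables, it helps to peek at one record. Each record in this dataset stores the raw text under 'x' and the label under 'y', which is what the insert helper below assumes:

# Inspect the structure the rest of the notebook relies on.
print(sample_datapoint['x'][:100])  # raw text
print('label:', sample_datapoint['y'])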

Setup tables or collections

# Note: this is an optional step for MongoDB.
# Users can also work directly with `DataType` if they want to add custom data.
from superduperdb import Schema, DataType
from superduperdb.backends.mongodb import Collection

table_or_collection = Collection('documents')

# The sample data is plain text, so no custom `DataType` is needed here;
# `datatype` is referenced again by the insert helper below.
datatype = None
USE_SCHEMA = False

if USE_SCHEMA and isinstance(datatype, DataType):
    schema = Schema(identifier='my-schema', fields={'x': datatype})
    db.apply(schema)

Insert data

In order to insert data, we may need a Schema for encoding our special DataType column(s) in the data backend. The helper below wraps each raw record in a Document, applying `datatype` when one is set.

from superduperdb import Document, DataType

def do_insert(data, schema=None):
    # Wrap each raw record in a `Document`, encoding with `datatype` when one is set.
    if schema is None and (datatype is None or isinstance(datatype, str)):
        data = [Document({'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': x}) for x in data]
        db.execute(table_or_collection.insert_many(data))
    elif schema is None and isinstance(datatype, DataType):
        data = [Document({'x': datatype(x['x']), 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': datatype(x)}) for x in data]
        db.execute(table_or_collection.insert_many(data))
    else:
        data = [Document({'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': x}) for x in data]
        db.execute(table_or_collection.insert_many(data, schema=schema))

do_insert(data[:-len(data) // 4])
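As a quick sanity check, you can read the rows back and count them; roughly three quarters of the dataset should now be stored:

# Sanity check: count the documents just inserted.
inserted = list(db.execute(table_or_collection.find()))
print(len(inserted))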

Compute features


key = 'x'  # the documents store the raw text under 'x'
select = table_or_collection.find()  # needed by the Listener below

import sentence_transformers
from superduperdb import vector, Listener
from superduperdb.ext.sentence_transformers import SentenceTransformer

superdupermodel = SentenceTransformer(
    identifier="embedding",
    object=sentence_transformers.SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    datatype=vector(shape=(384,)),
    postprocess=lambda x: x.tolist(),
)
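Before wiring the model into a Listener, it can be worth sanity-checking the underlying encoder directly. This uses the plain sentence-transformers API on the wrapped object, nothing SuperDuperDB-specific:

# The raw encoder should emit 384-dimensional vectors,
# matching `vector(shape=(384,))` above.
emb = superdupermodel.object.encode('a quick sanity check')
print(emb.shape)  # expected: (384,)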

jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=select,
        key=key,
        identifier="features",
    )
)
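Once the listener's jobs have finished, the computed embeddings are stored alongside each document. A minimal sketch of inspecting one document, assuming `find_one` and `Document.unpack` behave as in the other examples in these docs:

# The unpacked document should contain an '_outputs...' entry with the features.
r = db.execute(table_or_collection.find_one())
print(list(r.unpack().keys()))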

Choose input key from listener outputs

note

This is useful if you have performed a first step, such as pre-computing features, or chunking your data. You can use this query to choose the input key for further models such as classification models.

input_key = listener.outputs
select = table_or_collection.find()

Build and train classifier

from sklearn.linear_model import LogisticRegression
from superduperdb.ext.sklearn.model import SklearnTrainer, Estimator

# Create a logistic regression classifier and wrap it as an Estimator,
# trained on the listener's output features against the label field 'y'.
model = LogisticRegression()
model = Estimator(
    object=model,
    identifier='my-model',
    trainer=SklearnTrainer(
        key=(input_key, 'y'),
        select=select,
    ),
)

The following command adds the model to the system and trains it in one step.

db.apply(model)
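To try the trained classifier on a new text, one option is to go through the same two steps by hand: embed with the encoder, then classify. A minimal sketch, assuming `model.object` holds the fitted scikit-learn estimator after `db.apply` (an assumption about the wrapper internals, not an official inference API):

# Hypothetical example text; both the `model.object` access and the reuse of
# `superdupermodel.object` here are assumptions, not official API.
new_text = 'An absolutely wonderful experience.'
embedding = superdupermodel.object.encode([new_text])  # shape (1, 384)
print(model.object.predict(embedding))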