Transfer learning
Configure your production system
If you would like to use the production features of SuperDuperDB, then you should set the relevant connections and configurations in a configuration file. Otherwise you are welcome to use "development" mode to get going with SuperDuperDB quickly.
import os
os.makedirs('.superduperdb', exist_ok=True)
os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'
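SuperDuperDB reads the SUPERDUPERDB_CONFIG environment variable to locate this file, so set it before importing superduperdb.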
- MongoDB Community
- MongoDB Atlas
- SQLite
- MySQL
- MSSQL
- PostgreSQL
- Snowflake
- Clickhouse
CFG = '''
data_backend: mongodb://127.0.0.1:27017/documents
artifact_store: filesystem://./artifact_store
cluster:
  cdc:
    strategy: null
    uri: http://127.0.0.1:20000
  compute:
    uri: ray://127.0.0.1:10001
  vector_search:
    backfill_batch_size: 100
    type: in_memory
    uri: http://127.0.0.1:21000
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
    type: native
data_backend: mongodb+srv://<user>:<password>@<mongo-host>/documents
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: sqlite://<path-to-db>.db
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: mysql://<user>:<password>@<host>:<port>/<database>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: mssql://<user>:<password>@<host>:<port>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: postgres://<user>:<password>@<host>:<port>/<database>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: snowflake://<user>:<password>@<account>/<database>
'''
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
  compute: ray://<ray-host>
  cdc:
    uri: http://<cdc-host>:<cdc-port>
  vector_search:
    uri: http://<vector-search-host>:<vector-search-port>
data_backend: clickhouse://<user>:<password>@<host>:<port>
'''
with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:
    f.write(CFG)
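As a quick sanity check, you can load the configuration back and confirm that SuperDuperDB picked it up. This is a minimal sketch: it assumes superduperdb is installed and that the YAML above parses cleanly.
from superduperdb import CFG as cfg  # aliased so the CFG string above isn't overwritten

# The data backend should match what was written to the config file
print(cfg.data_backend)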
Start your cluster
Starting a SuperDuperDB cluster is useful in production, and during model development when you want scalable compute, shared access to models for collaboration, and monitoring.
If you don't need this, then it is simpler to start in development mode.
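In development mode everything runs in a single local process, so there are no separate compute, change-data-capture, or vector-search services to manage.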
- Experimental Cluster
- Docker-Compose
!python -m superduperdb local-cluster up
!make testenv_image
!make testenv_init
Connect to SuperDuperDB
Note that this is only relevant if you are running SuperDuperDB in development mode. Otherwise refer to "Configure your production system" above.
- MongoDB
- SQLite
- MySQL
- MSSQL
- PostgreSQL
- Snowflake
- Clickhouse
- DuckDB
- Pandas
- MongoMock
from superduperdb import superduper
db = superduper('mongodb://localhost:27017/documents')
from superduperdb import superduper
db = superduper('sqlite://my_db.db')
from superduperdb import superduper
user = 'superduper'
password = 'superduper'
port = 3306
host = 'localhost'
database = 'test_db'
db = superduper(f"mysql://{user}:{password}@{host}:{port}/{database}")
from superduperdb import superduper
user = 'sa'
password = 'Superduper#1'
port = 1433
host = 'localhost'
db = superduper(f"mssql://{user}:{password}@{host}:{port}")
!pip install psycopg2
from superduperdb import superduper
user = 'postgres'
password = 'postgres'
port = 5432
host = 'localhost'
database = 'test_db'
db_uri = f"postgres://{user}:{password}@{host}:{port}/{database}"
db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))
from superduperdb import superduper
user = "superduperuser"
password = "superduperpassword"
account = "XXXX-XXXX"  # Snowflake account identifier, i.e. <organization>-<account-name>
database = "FREE_COMPANY_DATASET/PUBLIC"
snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"
db = superduper(
    snowflake_uri,
    metadata_store='sqlite:///your_database_name.db',
)
from superduperdb import superduper
user = 'default'
password = ''
port = 8123
host = 'localhost'
db = superduper(f"clickhouse://{user}:{password}@{host}:{port}", metadata_store=f'mongomock://meta')
from superduperdb import superduper
db = superduper('duckdb://mydb.duckdb')
from superduperdb import superduper
db = superduper(['my.csv'], metadata_store='mongomock://meta')
from superduperdb import superduper
db = superduper('mongomock:///test_db')
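Whichever backend you choose, superduper() returns the same db handle. Note that the analytical backends above (Snowflake, ClickHouse) pass a separate lightweight metadata_store, because SuperDuperDB keeps its component metadata in a store suited to frequent small reads and writes. As a quick sanity check that the connection works:
# Lists the models registered so far (an empty list on a fresh database)
print(db.show('model'))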
Get useful sample data
from superduperdb import dtype
- Text
- Image
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text.json
import json
with open('text.json', 'r') as f:
    data = json.load(f)
sample_datapoint = "What is mongodb?"
chunked_model_datatype = dtype('str')
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images.zip && unzip images.zip
import os
from PIL import Image
data = [f'images/{x}' for x in os.listdir('./images')]
data = [Image.open(path) for path in data]
sample_datapoint = data[-1]
from superduperdb.ext.pillow import pil_image
chunked_model_datatype = pil_image
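In both cases, data is now a list of raw datapoints (text strings or PIL images), ready to be inserted below.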
Setup tables or collections
- MongoDB
- SQL
# Note this is an optional step for MongoDB
# Users can also work directly with `DataType` if they want to add
# custom data
from superduperdb import Schema, DataType
from superduperdb.backends.mongodb import Collection
table_or_collection = Collection('documents')
USE_SCHEMA = False
if USE_SCHEMA and isinstance(datatype, DataType):
    # `datatype` is assumed to be defined in the sample-data step above
    schema = Schema(identifier='my_schema', fields={'x': datatype})
    db.apply(schema)
from superduperdb.backends.ibis import Table
from superduperdb import Schema, DataType
from superduperdb.backends.ibis.field_types import dtype
datatype = "str"
if isinstance(datatype, DataType):
    schema = Schema(identifier="schema", fields={"id": dtype("str"), "x": datatype})
else:
    schema = Schema(
        identifier="schema", fields={"id": dtype("str"), "x": dtype(datatype)}
    )
table_or_collection = Table('documents', schema=schema)
db.apply(table_or_collection)
Insert data
In order to insert data, we need to create a Schema for encoding our special Datatype column(s) in the databackend.
- MongoDB
- SQL
from superduperdb import Document
def do_insert(data):
    schema = None

    if schema is None and (datatype is None or isinstance(datatype, str)):
        data = [Document({'x': x}) for x in data]
        db.execute(table_or_collection.insert_many(data))
    elif schema is None and datatype is not None and isinstance(datatype, DataType):
        data = [Document({'x': datatype(x)}) for x in data]
        db.execute(table_or_collection.insert_many(data))
    else:
        data = [Document({'x': x}) for x in data]
        db.execute(table_or_collection.insert_many(data, schema='my_schema'))
from superduperdb import Document
def do_insert(data):
    db.execute(table_or_collection.insert([Document({'id': str(idx), 'x': x}) for idx, x in enumerate(data)]))
do_insert(data[:-len(data) // 4])
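Note that the slice data[:-len(data) // 4] inserts only the first three quarters of the datapoints; the held-back quarter can be inserted later, for example to confirm that listeners compute features for newly arriving data.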
Compute features
- Text
- Image
key = 'x'  # the field under which the sample text was inserted above

import sentence_transformers
from superduperdb import vector, Listener
from superduperdb.ext.sentence_transformers import SentenceTransformer

superdupermodel = SentenceTransformer(
    identifier="embedding",
    object=sentence_transformers.SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    datatype=vector(shape=(384,)),
    postprocess=lambda x: x.tolist(),
)

# `select` is the query over the inserted data,
# e.g. `table_or_collection.find()` on MongoDB
jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=select,
        key=key,
        identifier="features",
    )
)
key = 'x'  # the field under which the sample images were inserted above

import numpy as np
import torchvision.models as models
from torchvision import transforms
from superduperdb.ext.torch import TorchModel
from superduperdb import Listener
from PIL import Image

class TorchVisionEmbedding:
    def __init__(self):
        # Load the pre-trained ResNet-18 model
        self.resnet = models.resnet18(pretrained=True)

        # Set the model to evaluation mode
        self.resnet.eval()

    def preprocess(self, image_array):
        # Convert the raw array to a PIL image and apply standard ImageNet preprocessing
        image = Image.fromarray(image_array.astype(np.uint8))
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        tensor_image = preprocess(image)
        return tensor_image

model = TorchVisionEmbedding()
superdupermodel = TorchModel(
    identifier='my-vision-model-torch',
    object=model.resnet,
    preprocess=model.preprocess,
    postprocess=lambda x: x.numpy().tolist(),
)

# `select` is the query over the inserted data,
# e.g. `table_or_collection.find()` on MongoDB
jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=select,
        key=key,
        identifier="features",
    )
)
Choose input key
The input key to the fine-tuning model is the output of the previous listener:
input_key = listener.outputs
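You can inspect the resolved name directly; the exact format of the auto-generated output field varies between superduperdb versions:
# Auto-generated output field populated by the 'features' listener above
print(input_key)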
Build and train classifier
- Scikit-Learn
- Torch
from sklearn.linear_model import LogisticRegression
from superduperdb.backends.mongodb import Collection
from superduperdb.ext.sklearn.model import SklearnTrainer, Estimator

# Create a Logistic Regression model
model = LogisticRegression()
model = Estimator(
    object=model,
    identifier='my-model',
    trainer=SklearnTrainer(
        key=(input_key, 'y'),
        select=Collection('documents').find(),
    ),
)
import torch
from torch import nn
from superduperdb.backends.mongodb import Collection
from superduperdb.ext.torch.model import TorchModel
from superduperdb.ext.torch.training import TorchTrainer

class SimpleModel(nn.Module):
    def __init__(self, input_size=16, hidden_size=32, num_classes=3):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Loss function
def my_loss(X, y):
    return torch.nn.functional.binary_cross_entropy_with_logits(
        X[:, 0], y.type(torch.float)
    )

# Create the model and wrap it with a trainer
model = SimpleModel()
model = TorchModel(
    identifier='my-model',
    object=model,
    trainer=TorchTrainer(
        key=(input_key, 'y'),
        identifier='my_trainer',
        objective=my_loss,
        loader_kwargs={'batch_size': 10},
        max_iterations=100,
        validation_interval=10,
        select=Collection('documents').find(),
    ),
)
The following command adds the model to the system and trains it in one step.
db.apply(model)
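Once applied, the trained model (including its learned parameters) is persisted in the artifact store and can be fetched back by identifier. A minimal sketch, assuming training completed successfully:
# Reload the trained component by type ('model') and identifier
trained_model = db.load('model', 'my-model')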