Implement an in-memory Embedbase

⚠️ This is not recommended for production. ⚠️

To get started quickly, you can implement an in-memory Embedbase database. This is not recommended for production, but it is useful for testing and prototyping.

Installation

Install the required dependencies in a virtual environment:

virtualenv env
source env/bin/activate
pip install embedbase numpy

Start Embedbase

Create a new file main.py with the following code:

main.py

import uvicorn
import numpy as np
from embedbase import get_app
from embedbase.database.base import VectorDatabase
from embedbase.embedding.openai import OpenAI
 
class InMemoryDatabase(VectorDatabase):
    """Dictionary-backed vector store meant for testing and prototyping.

    Every document is kept in ``self.storage`` keyed by its id, so nothing
    survives a process restart. Search is a brute-force cosine-similarity
    scan over all stored embeddings.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and start with no documents."""
        super().__init__(*args, **kwargs)
        # doc_id -> record with keys: data, embedding (np.ndarray), metadata,
        # dataset_id, user_id, hash.
        self.storage = {}

    @staticmethod
    def _matches(record, dataset_id, user_id):
        """Return True when ``record`` passes the optional dataset/user filters.

        A filter set to ``None`` matches every record.
        """
        return (dataset_id is None or record["dataset_id"] == dataset_id) and (
            user_id is None or record["user_id"] == user_id
        )

    @staticmethod
    def _to_document(doc_id, record):
        """Build the public view of a stored record.

        A fresh dict is returned (with the embedding converted to a plain
        list) so callers can neither mutate internal storage nor receive a
        NumPy array.
        """
        return {
            "id": doc_id,
            "data": record["data"],
            "embedding": record["embedding"].tolist(),
            "metadata": record["metadata"],
            "hash": record["hash"],
        }

    async def update(
        self, df, dataset_id, user_id=None, store_data=True, batch_size=100
    ):
        """Insert or overwrite one record per row of ``df``.

        Args:
            df: Table iterated with ``iterrows()``; each row must provide
                ``id``, ``data``, ``embedding``, ``metadata`` and ``hash``.
            dataset_id: Dataset the rows are stored under.
            user_id: Optional owner recorded on every row.
            store_data: When false, ``data`` is stored as ``None``.
            batch_size: Not used by this implementation.

        Returns:
            Always ``True``.
        """
        for _, row in df.iterrows():
            # Key access (not attribute access) so a column name can never be
            # shadowed by an attribute or method of the row object itself.
            self.storage[row["id"]] = {
                "data": row["data"] if store_data else None,
                "embedding": np.array(row["embedding"]),
                "metadata": row["metadata"],
                "dataset_id": dataset_id,
                "user_id": user_id,
                "hash": row["hash"],
            }
        return True

    async def select(self, ids=None, hashes=None, dataset_id=None, user_id=None):
        """Fetch documents by id or, when no ids are given, by content hash.

        ``ids`` takes precedence over ``hashes``. ``dataset_id`` and
        ``user_id`` are optional filters; ``None`` disables the filter.

        Returns:
            A list of dicts with the keys ``id``, ``data``, ``embedding``
            (as a list), ``metadata`` and ``hash``. The list is empty when
            neither ``ids`` nor ``hashes`` is provided.
        """
        if ids:
            return [
                self._to_document(doc_id, self.storage[doc_id])
                for doc_id in ids
                if doc_id in self.storage
                and self._matches(self.storage[doc_id], dataset_id, user_id)
            ]
        if hashes:
            return [
                self._to_document(doc_id, record)
                for doc_id, record in self.storage.items()
                if record["hash"] in hashes
                and self._matches(record, dataset_id, user_id)
            ]
        return []

    async def search(self, vector, top_k, dataset_ids, user_id=None):
        """Return the ``top_k`` documents most similar to ``vector``.

        Only documents whose dataset is in ``dataset_ids`` (and, when
        ``user_id`` is given, owned by that user) are considered. Results are
        ordered by descending cosine similarity and carry it under ``score``.
        """
        query_embedding = np.array(vector)
        query_norm = np.linalg.norm(query_embedding)

        scored = []
        for doc_id, record in self.storage.items():
            if record["dataset_id"] not in dataset_ids:
                continue
            if user_id is not None and record["user_id"] != user_id:
                continue
            denominator = np.linalg.norm(record["embedding"]) * query_norm
            # A zero-length vector has no direction: score it 0.0 instead of
            # dividing by zero, which would yield NaN and break the ordering.
            if denominator:
                score = float(
                    np.dot(record["embedding"], query_embedding) / denominator
                )
            else:
                score = 0.0
            scored.append((doc_id, score))

        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [
            {
                "id": doc_id,
                "score": score,
                "data": self.storage[doc_id]["data"],
                "metadata": self.storage[doc_id]["metadata"],
                "embedding": self.storage[doc_id]["embedding"].tolist(),
                "hash": self.storage[doc_id]["hash"],
            }
            for doc_id, score in scored[:top_k]
        ]

    async def delete(self, ids, dataset_id, user_id=None):
        """Remove the given ids that pass the dataset/user filters.

        Unknown ids and ids filtered out by ``dataset_id``/``user_id`` are
        silently skipped. Returns ``None``.
        """
        for doc_id in ids:
            record = self.storage.get(doc_id)
            if record is not None and self._matches(record, dataset_id, user_id):
                del self.storage[doc_id]
        return None

    async def get_datasets(self, user_id=None):
        """Count stored documents per dataset, optionally for one user only.

        Returns:
            A list of ``{"dataset_id": ..., "documents_count": ...}`` dicts.
        """
        counts = {}
        for record in self.storage.values():
            if user_id is None or record["user_id"] == user_id:
                dataset_id = record["dataset_id"]
                counts[dataset_id] = counts.get(dataset_id, 0) + 1
        return [{"dataset_id": k, "documents_count": v} for k, v in counts.items()]

    async def clear(self, dataset_id, user_id=None):
        """Delete every document of ``dataset_id`` (restricted to ``user_id`` if given).

        Returns ``None``.
        """
        # Collect the ids first: a dict must not change size while iterated.
        doc_ids_to_remove = [
            doc_id
            for doc_id, record in self.storage.items()
            if record["dataset_id"] == dataset_id
            and (user_id is None or record["user_id"] == user_id)
        ]
        for doc_id in doc_ids_to_remove:
            del self.storage[doc_id]
        return None
 
 
# Build the application: plug in the OpenAI embedder and the in-memory
# database defined above, then finalize the app with run().
app = (
    get_app()
    .use_embedder(OpenAI("<your-openai-api-key>"))
    .use_db(InMemoryDatabase())
    .run()
)

if __name__ == "__main__":
    # "main:app" is the `app` object of this file (main.py).
    uvicorn.run("main:app")

Start the Embedbase application with the following command:

python3 main.py

Test the endpoint

TypeScript

import { createClient } from 'embedbase-js'

// Client for the Embedbase instance at http://localhost:8000.
const embedbase = createClient("http://localhost:8000")

const DATASET_ID = "animals"
const SENTENCES = [
    "The lion is the king of the savannah.",
    "The chimpanzee is a great ape.",
    "The elephant is the largest land animal.",
];

// Add every sentence to the dataset in one batch and log the response.
async function add() {
    const documents = SENTENCES.map((sentence) => ({ data: sentence }))
    const response = await embedbase.dataset(DATASET_ID).batchAdd(documents)
    console.log(response)
}
add()

You should get a similar response:

{
  "results": [
        {
            "data": "The lion is the king of the savannah.",
            "embedding": [...],
            "hash": ...,
            "metadata": null
        },
        {
            "data": "The chimpanzee is a great ape.",
            "embedding": [...],
            "hash": ...,
            "metadata": null
        },
        {
            "data": "The elephant is the largest land animal.",
            "embedding": [...],
            "hash": ...,
            "metadata": null
        }
    ]
}

Now let's try a search:

TypeScript

// Search the dataset, asking for a single result, and log the response.
async function search() {
    const dataset = embedbase.dataset(DATASET_ID)
    const results = await dataset.search("Animal that lives in the savannah", { limit: 1})
    console.log(results)
}
search()

You should get a similar response:

"The lion is the king of the savannah."

When you want to move to production, you can use Embedbase Cloud, or use Supabase or Postgres as the database.

Adding a GitHub repository's content to Embedbase | Run a privacy-first local Embedbase instance