Setting up Supabase
We need to create tables, functions and indexes to make Embedbase work with Supabase.
- Go to SQL Editor
- Run the following SQL commands
-- This is a postgres extension that allows us to store vectors and
-- perform similarity search on them
create extension vector
with
schema extensions;
-- This is the main table that stores documents and embeddings
create table documents (
id text primary key,
data text,
embedding vector (1536),
hash text,
dataset_id text,
user_id text,
metadata json,
created_date TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Make sure to prevent clients from accessing this table directly
alter table documents
enable row level security;
-- We are creating a function to run a similarity search on the documents table
create or replace function match_documents (
query_embedding vector(1536),
similarity_threshold float,
match_count int,
query_dataset_ids text[],
query_user_id text default null,
metadata_field text default null,
metadata_value text default null
)
returns table (
id text,
data text,
score float,
hash text,
embedding vector(1536),
metadata json
)
language plpgsql
as $$
begin
return query
select
documents.id,
documents.data,
(1 - (documents.embedding <=> query_embedding)) as similarity,
documents.hash,
documents.embedding,
documents.metadata
from documents
where 1 - (documents.embedding <=> query_embedding) > similarity_threshold
and documents.dataset_id = any(query_dataset_ids)
and (query_user_id is null or query_user_id = documents.user_id)
and (metadata_field is null or documents.metadata->>metadata_field = metadata_value) -- filter by metadata
order by documents.embedding <=> query_embedding
limit match_count;
end;
$$;
-- This index will allow us to perform similarity search on the documents table
create index on documents
using ivfflat (embedding vector_cosine_ops)
with (lists = 100);
-- This is a view that will allow us to get an overview of the datasets
CREATE OR REPLACE VIEW distinct_datasets AS
SELECT dataset_id, user_id, COUNT(*) AS documents_count
FROM documents
GROUP BY dataset_id, user_id;