From 0489f1111c1c1967b9d30fd87020652dfffed941 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Mon, 4 May 2026 15:41:39 -0700
Subject: [PATCH] feat(vector): add LocalSQLiteVecStore backed by sqlite-vec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements the VectorStore ABC using sqlite-vec virtual tables. Two-table
design (vec0 virtual + companion meta) supports upsert, top-k KNN query
with optional metadata post-filter, delete by ID, and bulk delete_where.

Table names are validated as plain identifiers because they are
interpolated into SQL.

Also renames VectorMatch.id → entry_id to avoid shadowing the Python
builtin, updating base.py and all tests.

Installed: sqlite-vec 0.1.9
Tests: 16 passed (7 base + 9 integration)
---
 circuitforge_core/vector/__init__.py   |   3 +-
 circuitforge_core/vector/base.py       |   2 +-
 circuitforge_core/vector/sqlite_vec.py | 224 +++++++++++++++++++++++++
 tests/test_vector/test_base.py         |  10 +-
 tests/test_vector/test_sqlite_vec.py   |  80 +++++++++++
 5 files changed, 312 insertions(+), 7 deletions(-)
 create mode 100644 circuitforge_core/vector/sqlite_vec.py
 create mode 100644 tests/test_vector/test_sqlite_vec.py

diff --git a/circuitforge_core/vector/__init__.py b/circuitforge_core/vector/__init__.py
index f6e5e4b..0559dbf 100644
--- a/circuitforge_core/vector/__init__.py
+++ b/circuitforge_core/vector/__init__.py
@@ -1,3 +1,4 @@
 from .base import VectorMatch, VectorStore
+from .sqlite_vec import LocalSQLiteVecStore
 
-__all__ = ["VectorMatch", "VectorStore"]
+__all__ = ["VectorMatch", "VectorStore", "LocalSQLiteVecStore"]
diff --git a/circuitforge_core/vector/base.py b/circuitforge_core/vector/base.py
index 737beaa..ffbb203 100644
--- a/circuitforge_core/vector/base.py
+++ b/circuitforge_core/vector/base.py
@@ -15,7 +15,7 @@ from typing import Any
 class VectorMatch:
     """A single result from a vector similarity search."""
 
-    id: str
+    entry_id: str
     score: float  # lower is better (L2 / cosine distance)
     metadata: dict[str, Any] = field(default_factory=dict)
 
diff --git a/circuitforge_core/vector/sqlite_vec.py b/circuitforge_core/vector/sqlite_vec.py
new file mode 100644
index 0000000..d88ca94
--- /dev/null
+++ b/circuitforge_core/vector/sqlite_vec.py
@@ -0,0 +1,224 @@
+# circuitforge_core/vector/sqlite_vec.py
+"""
+circuitforge_core.vector.sqlite_vec -- sqlite-vec backed VectorStore.
+
+Suitable for single-user local deployments. Cloud Paid tier replaces
+this with QdrantStore via the same VectorStore ABC.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+import sqlite3
+import struct
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any, Generator
+
+import sqlite_vec
+
+from .base import VectorMatch, VectorStore
+
+logger = logging.getLogger(__name__)
+
+# Table names are interpolated into SQL (identifiers cannot be bound as
+# parameters), so only plain identifiers are accepted.
+_TABLE_NAME_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")
+
+
+def _serialize(vector: list[float]) -> bytes:
+    """Pack *vector* as little-endian float32 bytes."""
+    return struct.pack(f"<{len(vector)}f", *vector)
+
+
+def _matches(metadata: dict[str, Any], filter_metadata: dict[str, Any]) -> bool:
+    """Return True if every filter key maps to an equal value in *metadata*."""
+    return all(metadata.get(k) == v for k, v in filter_metadata.items())
+
+
+class LocalSQLiteVecStore(VectorStore):
+    """
+    VectorStore backed by sqlite-vec virtual tables.
+
+    Uses two tables per logical store:
+    - ``{table}_vecs``: vec0 virtual table (rowid-indexed float vectors)
+    - ``{table}_meta``: companion table mapping rowid to string ID + JSON metadata
+
+    Args:
+        db_path: Path to SQLite database file.
+        table: Logical name prefix (default ``"vecs"``). Must be a plain
+            identifier because it is interpolated into SQL.
+        dimensions: Vector length; must match the embedding model (default 768).
+
+    Raises:
+        ValueError: If ``table`` is not a plain identifier or ``dimensions``
+            is not a positive integer.
+    """
+
+    def __init__(
+        self,
+        db_path: str | Path,
+        table: str = "vecs",
+        dimensions: int = 768,
+    ) -> None:
+        if not _TABLE_NAME_RE.fullmatch(table):
+            raise ValueError(f"invalid table name: {table!r}")
+        if not isinstance(dimensions, int) or dimensions < 1:
+            raise ValueError(
+                f"dimensions must be a positive integer, got {dimensions!r}"
+            )
+        self.db_path = str(db_path)
+        self.table = table
+        self.dimensions = dimensions
+        self._init_tables()
+
+    @contextmanager
+    def _conn(self) -> Generator[sqlite3.Connection, None, None]:
+        """
+        Yield a connection with sqlite-vec loaded.
+
+        Commits when the ``with`` body finishes normally. On an exception the
+        commit is skipped and the connection is still closed, so pending
+        changes are not persisted.
+        """
+        conn = sqlite3.connect(self.db_path)
+        try:
+            # Extension setup lives inside the try so a failed load does not
+            # leak the open connection.
+            conn.enable_load_extension(True)
+            sqlite_vec.load(conn)
+            conn.enable_load_extension(False)
+            conn.row_factory = sqlite3.Row
+            yield conn
+            conn.commit()
+        finally:
+            conn.close()
+
+    def _init_tables(self) -> None:
+        """Create the vec0 table and its metadata companion if missing."""
+        with self._conn() as conn:
+            conn.execute(f"""
+                CREATE VIRTUAL TABLE IF NOT EXISTS {self.table}_vecs
+                USING vec0(embedding float[{self.dimensions}])
+            """)
+            conn.execute(f"""
+                CREATE TABLE IF NOT EXISTS {self.table}_meta (
+                    rowid INTEGER PRIMARY KEY,
+                    entry_id TEXT NOT NULL UNIQUE,
+                    metadata TEXT NOT NULL DEFAULT '{{}}'
+                )
+            """)
+
+    def upsert(
+        self, entry_id: str, vector: list[float], metadata: dict[str, Any]
+    ) -> None:
+        """Insert *entry_id*, or replace its vector and metadata if present."""
+        with self._conn() as conn:
+            row = conn.execute(
+                f"SELECT rowid FROM {self.table}_meta WHERE entry_id = ?", [entry_id]
+            ).fetchone()
+
+            if row is not None:
+                rowid = row["rowid"]
+                conn.execute(
+                    f"UPDATE {self.table}_vecs SET embedding = ? WHERE rowid = ?",
+                    [_serialize(vector), rowid],
+                )
+                conn.execute(
+                    f"UPDATE {self.table}_meta SET metadata = ? WHERE rowid = ?",
+                    [json.dumps(metadata), rowid],
+                )
+            else:
+                # Insert metadata first: its auto-assigned rowid becomes the
+                # key that ties the vec0 row to this entry.
+                cursor = conn.execute(
+                    f"INSERT INTO {self.table}_meta(entry_id, metadata) VALUES (?, ?)",
+                    [entry_id, json.dumps(metadata)],
+                )
+                rowid = cursor.lastrowid
+                conn.execute(
+                    f"INSERT INTO {self.table}_vecs(rowid, embedding) VALUES (?, ?)",
+                    [rowid, _serialize(vector)],
+                )
+
+    def query(
+        self,
+        vector: list[float],
+        top_k: int = 10,
+        filter_metadata: dict[str, Any] | None = None,
+    ) -> list[VectorMatch]:
+        """
+        Return up to *top_k* nearest entries, closest first.
+
+        ``filter_metadata`` is applied after the KNN search, so a filtered
+        query may return fewer than *top_k* matches.
+        """
+        with self._conn() as conn:
+            rows = conn.execute(
+                f"""
+                SELECT m.entry_id, v.distance, m.metadata
+                FROM {self.table}_vecs v
+                JOIN {self.table}_meta m ON m.rowid = v.rowid
+                WHERE v.embedding MATCH ? AND k = ?
+                ORDER BY v.distance
+                """,
+                [_serialize(vector), top_k],
+            ).fetchall()
+
+        results = [
+            VectorMatch(
+                entry_id=r["entry_id"],
+                score=r["distance"],
+                metadata=json.loads(r["metadata"]),
+            )
+            for r in rows
+        ]
+
+        if filter_metadata:
+            results = [r for r in results if _matches(r.metadata, filter_metadata)]
+        return results
+
+    def delete(self, entry_id: str) -> None:
+        """Remove *entry_id* from both tables; unknown IDs are a no-op."""
+        with self._conn() as conn:
+            row = conn.execute(
+                f"SELECT rowid FROM {self.table}_meta WHERE entry_id = ?", [entry_id]
+            ).fetchone()
+            if row is not None:
+                rowid = row["rowid"]
+                conn.execute(f"DELETE FROM {self.table}_vecs WHERE rowid = ?", [rowid])
+                conn.execute(f"DELETE FROM {self.table}_meta WHERE rowid = ?", [rowid])
+
+    def delete_where(self, filter_metadata: dict[str, Any]) -> int:
+        """
+        Delete every entry whose metadata matches *filter_metadata*.
+
+        Returns:
+            Number of entries deleted.
+
+        Raises:
+            ValueError: If *filter_metadata* is empty.
+        """
+        if not filter_metadata:
+            raise ValueError(
+                "delete_where requires a non-empty filter; refusing to delete entire store"
+            )
+        with self._conn() as conn:
+            rows = conn.execute(
+                f"SELECT rowid, metadata FROM {self.table}_meta"
+            ).fetchall()
+            # Decode each row's JSON once, not once per filter key.
+            to_delete = [
+                (r["rowid"],)
+                for r in rows
+                if _matches(json.loads(r["metadata"]), filter_metadata)
+            ]
+            conn.executemany(
+                f"DELETE FROM {self.table}_vecs WHERE rowid = ?", to_delete
+            )
+            conn.executemany(
+                f"DELETE FROM {self.table}_meta WHERE rowid = ?", to_delete
+            )
+        return len(to_delete)
diff --git a/tests/test_vector/test_base.py b/tests/test_vector/test_base.py
index 077eef4..21709a6 100644
--- a/tests/test_vector/test_base.py
+++ b/tests/test_vector/test_base.py
@@ -25,7 +25,7 @@ class _ConcreteStore(VectorStore):
         filter_metadata: dict | None = None,
     ) -> list[VectorMatch]:
         results = [
-            VectorMatch(id=k, score=0.0, metadata=v[1])
+            VectorMatch(entry_id=k, score=0.0, metadata=v[1])
             for k, v in self._data.items()
         ]
         if filter_metadata:
@@ -51,13 +51,13 @@ class _ConcreteStore(VectorStore):
 
 
 def test_vector_match_is_frozen():
-    match = VectorMatch(id="a", score=0.1, metadata={})
+    match = VectorMatch(entry_id="a", score=0.1, metadata={})
     with pytest.raises(FrozenInstanceError):
         match.score = 0.5  # type: ignore[misc]
 
 
 def test_vector_match_metadata_is_dict():
-    match = VectorMatch(id="a", score=0.1, metadata={"k": "v"})
+    match = VectorMatch(entry_id="a", score=0.1, metadata={"k": "v"})
     assert isinstance(match.metadata, dict)
     assert match.metadata["k"] == "v"
 
@@ -67,7 +67,7 @@ def test_upsert_and_query():
     store.upsert("chunk-1", [0.1, 0.2], {"doc_id": "book-a", "page": 1})
     results = store.query([0.1, 0.2])
     assert len(results) == 1
-    assert results[0].id == "chunk-1"
+    assert results[0].entry_id == "chunk-1"
     assert results[0].metadata["page"] == 1
 
 
@@ -77,7 +77,7 @@ def test_query_filter_metadata():
     store.upsert("c2", [0.2], {"doc_id": "book-b"})
     results = store.query([0.1], filter_metadata={"doc_id": "book-a"})
     assert len(results) == 1
-    assert results[0].id == "c1"
+    assert results[0].entry_id == "c1"
 
 
 def test_delete():
diff --git a/tests/test_vector/test_sqlite_vec.py b/tests/test_vector/test_sqlite_vec.py
new file mode 100644
index 0000000..5c9820e
--- /dev/null
+++ b/tests/test_vector/test_sqlite_vec.py
@@ -0,0 +1,80 @@
+# tests/test_vector/test_sqlite_vec.py
+"""Integration tests for LocalSQLiteVecStore (real sqlite-vec DB under tmp_path)."""
+
+from __future__ import annotations
+
+import pytest
+
+from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore
+
+DIMS = 4  # small dimension for tests
+
+
+@pytest.fixture
+def store(tmp_path) -> LocalSQLiteVecStore:
+    return LocalSQLiteVecStore(db_path=tmp_path / "vecs.db", dimensions=DIMS)
+
+
+def _vec(val: float) -> list[float]:
+    return [val] * DIMS
+
+
+def test_upsert_and_query_returns_match(store):
+    store.upsert("doc-1::p1", _vec(0.1), {"doc_id": "doc-1", "page": 1})
+    results = store.query(_vec(0.1), top_k=5)
+    assert len(results) == 1
+    assert results[0].entry_id == "doc-1::p1"
+    assert results[0].metadata["page"] == 1
+
+
+def test_upsert_replaces_existing(store):
+    store.upsert("chunk-1", _vec(0.1), {"page": 1})
+    store.upsert("chunk-1", _vec(0.2), {"page": 99})
+    results = store.query(_vec(0.2), top_k=5)
+    assert len(results) == 1  # replaced, not duplicated
+    assert results[0].metadata["page"] == 99
+    assert results[0].score == pytest.approx(0.0, abs=1e-6)  # vector replaced too
+
+
+def test_query_respects_top_k(store):
+    for i in range(5):
+        store.upsert(f"chunk-{i}", _vec(float(i) * 0.1), {"i": i})
+    results = store.query(_vec(0.0), top_k=2)
+    assert [r.entry_id for r in results] == ["chunk-0", "chunk-1"]
+
+
+def test_filter_metadata(store):
+    store.upsert("c1", _vec(0.1), {"doc_id": "book-a"})
+    store.upsert("c2", _vec(0.2), {"doc_id": "book-b"})
+    results = store.query(_vec(0.1), filter_metadata={"doc_id": "book-a"})
+    # Compare IDs directly: all() over an empty result would pass vacuously.
+    assert [r.entry_id for r in results] == ["c1"]
+
+
+def test_delete(store):
+    store.upsert("x", _vec(0.5), {})
+    store.delete("x")
+    assert store.query(_vec(0.5)) == []
+
+
+def test_delete_where(store):
+    store.upsert("c1", _vec(0.1), {"doc_id": "book-a"})
+    store.upsert("c2", _vec(0.2), {"doc_id": "book-a"})
+    store.upsert("c3", _vec(0.3), {"doc_id": "book-b"})
+    count = store.delete_where({"doc_id": "book-a"})
+    assert count == 2
+    assert [r.entry_id for r in store.query(_vec(0.1))] == ["c3"]
+
+
+def test_delete_nonexistent_is_noop(store):
+    store.delete("does-not-exist")  # should not raise
+
+
+def test_empty_query_returns_empty(store):
+    assert store.query(_vec(0.1)) == []
+
+
+def test_delete_where_raises_on_empty_filter(store):
+    store.upsert("c1", _vec(0.1), {"doc_id": "book-a"})
+    with pytest.raises(ValueError, match="empty"):
+        store.delete_where({})