feat(vector): add LocalSQLiteVecStore backed by sqlite-vec
Implements the VectorStore ABC using sqlite-vec virtual tables. The two-table design (a vec0 virtual table plus a companion metadata table) supports upsert, top-k nearest-neighbour (KNN) query with an optional metadata post-filter, delete by ID, and bulk delete_where. Also renames VectorMatch.id → entry_id to avoid shadowing the Python builtin, updating base.py and all tests. Installed: sqlite-vec 0.1.9. Tests: 16 passed (7 base + 9 integration).
This commit is contained in:
parent
e6c69f25ae
commit
0489f1111c
5 changed files with 261 additions and 7 deletions
|
|
@ -1,3 +1,4 @@
|
||||||
from .base import VectorMatch, VectorStore
|
from .base import VectorMatch, VectorStore
|
||||||
|
from .sqlite_vec import LocalSQLiteVecStore
|
||||||
|
|
||||||
__all__ = ["VectorMatch", "VectorStore"]
|
__all__ = ["VectorMatch", "VectorStore", "LocalSQLiteVecStore"]
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ from typing import Any
|
||||||
class VectorMatch:
|
class VectorMatch:
|
||||||
"""A single result from a vector similarity search."""
|
"""A single result from a vector similarity search."""
|
||||||
|
|
||||||
id: str
|
entry_id: str
|
||||||
score: float # lower is better (L2 / cosine distance)
|
score: float # lower is better (L2 / cosine distance)
|
||||||
metadata: dict[str, Any] = field(default_factory=dict)
|
metadata: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
|
||||||
176
circuitforge_core/vector/sqlite_vec.py
Normal file
176
circuitforge_core/vector/sqlite_vec.py
Normal file
|
|
@ -0,0 +1,176 @@
|
||||||
|
# circuitforge_core/vector/sqlite_vec.py
|
||||||
|
"""
|
||||||
|
circuitforge_core.vector.sqlite_vec -- sqlite-vec backed VectorStore.
|
||||||
|
|
||||||
|
Suitable for single-user local deployments. Cloud Paid tier replaces
|
||||||
|
this with QdrantStore via the same VectorStore ABC.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sqlite3
|
||||||
|
import struct
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Generator
|
||||||
|
|
||||||
|
import sqlite_vec
|
||||||
|
|
||||||
|
from .base import VectorMatch, VectorStore
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _serialize(vector: list[float]) -> bytes:
|
||||||
|
return struct.pack(f"<{len(vector)}f", *vector)
|
||||||
|
|
||||||
|
|
||||||
|
class LocalSQLiteVecStore(VectorStore):
    """
    VectorStore backed by sqlite-vec virtual tables.

    Uses two tables per logical store:
    - ``<table>_vecs``: vec0 virtual table (rowid-indexed float vectors)
    - ``<table>_meta``: companion table mapping rowid to string ID + JSON metadata

    A new SQLite connection is opened for every operation, so ``db_path``
    must name a real file: with ``":memory:"`` each connection would get its
    own empty database.

    Args:
        db_path: Path to SQLite database file.
        table: Logical name prefix (default ``"vecs"``). Must be an ASCII
            identifier because it is interpolated into SQL statements.
        dimensions: Vector length; must match the embedding model (default 768).

    Raises:
        ValueError: If ``table`` is not an ASCII identifier or ``dimensions``
            is not a positive integer.
    """

    def __init__(
        self,
        db_path: str | Path,
        table: str = "vecs",
        dimensions: int = 768,
    ) -> None:
        from numbers import Integral  # local import keeps the file's import block untouched

        # Table names and dimensions cannot be bound as SQL parameters, so
        # they are validated here before being formatted into statements.
        if not (isinstance(table, str) and table.isascii() and table.isidentifier()):
            raise ValueError(
                f"table must be an ASCII identifier (letters, digits, underscore); got {table!r}"
            )
        if not isinstance(dimensions, Integral) or dimensions < 1:
            raise ValueError(f"dimensions must be a positive integer; got {dimensions!r}")

        self.db_path = str(db_path)
        self.table = table
        self.dimensions = int(dimensions)
        self._init_tables()

    @contextmanager
    def _conn(self) -> Generator[sqlite3.Connection, None, None]:
        """Yield a connection with sqlite-vec loaded; commit on success, always close.

        If the body raises, the connection is closed without committing, so
        the pending transaction is discarded.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # Extension setup lives inside the try so a failed load still
            # closes the connection instead of leaking it.
            conn.enable_load_extension(True)
            sqlite_vec.load(conn)
            conn.enable_load_extension(False)
            conn.row_factory = sqlite3.Row
            yield conn
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def _matches(metadata: dict[str, Any], filter_metadata: dict[str, Any]) -> bool:
        """Return True when every filter key equals the stored metadata value."""
        return all(metadata.get(k) == v for k, v in filter_metadata.items())

    def _init_tables(self) -> None:
        """Create the vec0 virtual table and the companion metadata table if missing."""
        with self._conn() as conn:
            conn.execute(f"""
                CREATE VIRTUAL TABLE IF NOT EXISTS {self.table}_vecs
                USING vec0(embedding float[{self.dimensions}])
            """)
            conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {self.table}_meta (
                    rowid INTEGER PRIMARY KEY,
                    entry_id TEXT NOT NULL UNIQUE,
                    metadata TEXT NOT NULL DEFAULT '{{}}'
                )
            """)

    def upsert(
        self, entry_id: str, vector: list[float], metadata: dict[str, Any]
    ) -> None:
        """Insert a new entry, or overwrite the vector and metadata of an existing one.

        Args:
            entry_id: Caller-chosen unique string ID.
            vector: Embedding to store; serialized as little-endian float32.
            metadata: JSON-serializable dict stored alongside the vector.
        """
        with self._conn() as conn:
            row = conn.execute(
                f"SELECT rowid FROM {self.table}_meta WHERE entry_id = ?", [entry_id]
            ).fetchone()

            if row is not None:
                rowid = row["rowid"]
                conn.execute(
                    f"UPDATE {self.table}_vecs SET embedding = ? WHERE rowid = ?",
                    [_serialize(vector), rowid],
                )
                conn.execute(
                    f"UPDATE {self.table}_meta SET metadata = ? WHERE rowid = ?",
                    [json.dumps(metadata), rowid],
                )
            else:
                # The meta row is inserted first so its generated rowid can
                # key the matching vec0 row.
                cursor = conn.execute(
                    f"INSERT INTO {self.table}_meta(entry_id, metadata) VALUES (?, ?)",
                    [entry_id, json.dumps(metadata)],
                )
                rowid = cursor.lastrowid
                conn.execute(
                    f"INSERT INTO {self.table}_vecs(rowid, embedding) VALUES (?, ?)",
                    [rowid, _serialize(vector)],
                )

    def query(
        self,
        vector: list[float],
        top_k: int = 10,
        filter_metadata: dict[str, Any] | None = None,
    ) -> list[VectorMatch]:
        """Return up to ``top_k`` nearest entries, ordered by ascending distance.

        ``filter_metadata`` is applied in Python *after* the nearest-neighbour
        search, so a filtered query may return fewer than ``top_k`` matches
        even when more matching entries exist further away.

        Args:
            vector: Query embedding.
            top_k: Maximum number of neighbours to fetch; values below 1
                yield an empty list.
            filter_metadata: Optional exact-match constraints on metadata keys.

        Returns:
            Matches sorted by distance (lower is closer).
        """
        if top_k < 1:
            return []

        with self._conn() as conn:
            rows = conn.execute(
                f"""
                SELECT m.entry_id, v.distance, m.metadata
                FROM {self.table}_vecs v
                JOIN {self.table}_meta m ON m.rowid = v.rowid
                WHERE v.embedding MATCH ? AND k = ?
                ORDER BY v.distance
                """,
                [_serialize(vector), top_k],
            ).fetchall()

        results = [
            VectorMatch(
                entry_id=r["entry_id"],
                score=r["distance"],
                metadata=json.loads(r["metadata"]),
            )
            for r in rows
        ]

        if filter_metadata:
            results = [r for r in results if self._matches(r.metadata, filter_metadata)]
        return results

    def delete(self, entry_id: str) -> None:
        """Remove the entry with ``entry_id``; a missing ID is a no-op."""
        with self._conn() as conn:
            row = conn.execute(
                f"SELECT rowid FROM {self.table}_meta WHERE entry_id = ?", [entry_id]
            ).fetchone()
            if row is not None:
                rowid = row["rowid"]
                conn.execute(f"DELETE FROM {self.table}_vecs WHERE rowid = ?", [rowid])
                conn.execute(f"DELETE FROM {self.table}_meta WHERE rowid = ?", [rowid])

    def delete_where(self, filter_metadata: dict[str, Any]) -> int:
        """Delete every entry whose metadata matches all pairs in ``filter_metadata``.

        Args:
            filter_metadata: Non-empty dict of exact-match constraints.

        Returns:
            Number of entries deleted.

        Raises:
            ValueError: If ``filter_metadata`` is empty.
        """
        if not filter_metadata:
            raise ValueError(
                "delete_where requires a non-empty filter; refusing to delete entire store"
            )
        with self._conn() as conn:
            rows = conn.execute(
                f"SELECT rowid, metadata FROM {self.table}_meta"
            ).fetchall()
            # Each row's JSON is decoded once, not once per filter key.
            to_delete = [
                (r["rowid"],)
                for r in rows
                if self._matches(json.loads(r["metadata"]), filter_metadata)
            ]
            conn.executemany(
                f"DELETE FROM {self.table}_vecs WHERE rowid = ?", to_delete
            )
            conn.executemany(
                f"DELETE FROM {self.table}_meta WHERE rowid = ?", to_delete
            )
            return len(to_delete)
|
||||||
|
|
@ -25,7 +25,7 @@ class _ConcreteStore(VectorStore):
|
||||||
filter_metadata: dict | None = None,
|
filter_metadata: dict | None = None,
|
||||||
) -> list[VectorMatch]:
|
) -> list[VectorMatch]:
|
||||||
results = [
|
results = [
|
||||||
VectorMatch(id=k, score=0.0, metadata=v[1])
|
VectorMatch(entry_id=k, score=0.0, metadata=v[1])
|
||||||
for k, v in self._data.items()
|
for k, v in self._data.items()
|
||||||
]
|
]
|
||||||
if filter_metadata:
|
if filter_metadata:
|
||||||
|
|
@ -51,13 +51,13 @@ class _ConcreteStore(VectorStore):
|
||||||
|
|
||||||
|
|
||||||
def test_vector_match_is_frozen():
|
def test_vector_match_is_frozen():
|
||||||
match = VectorMatch(id="a", score=0.1, metadata={})
|
match = VectorMatch(entry_id="a", score=0.1, metadata={})
|
||||||
with pytest.raises(FrozenInstanceError):
|
with pytest.raises(FrozenInstanceError):
|
||||||
match.score = 0.5 # type: ignore[misc]
|
match.score = 0.5 # type: ignore[misc]
|
||||||
|
|
||||||
|
|
||||||
def test_vector_match_metadata_is_dict():
|
def test_vector_match_metadata_is_dict():
|
||||||
match = VectorMatch(id="a", score=0.1, metadata={"k": "v"})
|
match = VectorMatch(entry_id="a", score=0.1, metadata={"k": "v"})
|
||||||
assert isinstance(match.metadata, dict)
|
assert isinstance(match.metadata, dict)
|
||||||
assert match.metadata["k"] == "v"
|
assert match.metadata["k"] == "v"
|
||||||
|
|
||||||
|
|
@ -67,7 +67,7 @@ def test_upsert_and_query():
|
||||||
store.upsert("chunk-1", [0.1, 0.2], {"doc_id": "book-a", "page": 1})
|
store.upsert("chunk-1", [0.1, 0.2], {"doc_id": "book-a", "page": 1})
|
||||||
results = store.query([0.1, 0.2])
|
results = store.query([0.1, 0.2])
|
||||||
assert len(results) == 1
|
assert len(results) == 1
|
||||||
assert results[0].id == "chunk-1"
|
assert results[0].entry_id == "chunk-1"
|
||||||
assert results[0].metadata["page"] == 1
|
assert results[0].metadata["page"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -77,7 +77,7 @@ def test_query_filter_metadata():
|
||||||
store.upsert("c2", [0.2], {"doc_id": "book-b"})
|
store.upsert("c2", [0.2], {"doc_id": "book-b"})
|
||||||
results = store.query([0.1], filter_metadata={"doc_id": "book-a"})
|
results = store.query([0.1], filter_metadata={"doc_id": "book-a"})
|
||||||
assert len(results) == 1
|
assert len(results) == 1
|
||||||
assert results[0].id == "c1"
|
assert results[0].entry_id == "c1"
|
||||||
|
|
||||||
|
|
||||||
def test_delete():
|
def test_delete():
|
||||||
|
|
|
||||||
77
tests/test_vector/test_sqlite_vec.py
Normal file
77
tests/test_vector/test_sqlite_vec.py
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
# tests/test_vector/test_sqlite_vec.py
|
||||||
|
"""Integration tests for LocalSQLiteVecStore (uses a real file-backed sqlite-vec DB under pytest's tmp_path)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore
|
||||||
|
|
||||||
|
DIMS = 4 # small dimension for tests
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def store(tmp_path) -> LocalSQLiteVecStore:
    """Provide a fresh store whose database file lives under pytest's tmp_path."""
    db_file = tmp_path / "vecs.db"
    return LocalSQLiteVecStore(db_path=db_file, dimensions=DIMS)
|
||||||
|
|
||||||
|
|
||||||
|
def _vec(val: float) -> list[float]:
    """Build a DIMS-long vector whose components all equal *val*."""
    return [val for _ in range(DIMS)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_upsert_and_query_returns_match(store):
    """A freshly upserted entry comes back from a query with its ID and metadata."""
    payload = {"doc_id": "doc-1", "page": 1}
    store.upsert("doc-1::p1", _vec(0.1), payload)

    matches = store.query(_vec(0.1), top_k=5)

    assert len(matches) == 1
    only = matches[0]
    assert only.entry_id == "doc-1::p1"
    assert only.metadata["page"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_upsert_replaces_existing(store):
    """Re-upserting an existing ID overwrites its vector and metadata in place."""
    store.upsert("chunk-1", _vec(0.1), {"page": 1})
    store.upsert("chunk-1", _vec(0.2), {"page": 99})
    results = store.query(_vec(0.2), top_k=5)
    # A second row here would mean upsert inserted instead of updating.
    assert len(results) == 1
    assert results[0].entry_id == "chunk-1"
    assert results[0].metadata["page"] == 99
    # Zero distance to the new vector shows the embedding was replaced too,
    # not just the metadata.
    assert results[0].score == pytest.approx(0.0, abs=1e-6)
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_respects_top_k(store):
    """query returns exactly top_k entries, nearest first."""
    for i in range(5):
        store.upsert(f"chunk-{i}", _vec(float(i) * 0.1), {"i": i})
    results = store.query(_vec(0.0), top_k=2)
    assert len(results) == 2
    # The two vectors closest to the zero vector are chunk-0 and chunk-1;
    # checking IDs guards against returning an arbitrary pair.
    assert [r.entry_id for r in results] == ["chunk-0", "chunk-1"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_metadata(store):
    """filter_metadata keeps only entries whose metadata matches exactly."""
    store.upsert("c1", _vec(0.1), {"doc_id": "book-a"})
    store.upsert("c2", _vec(0.2), {"doc_id": "book-b"})
    results = store.query(_vec(0.1), filter_metadata={"doc_id": "book-a"})
    # Assert on the IDs: all(...) alone passes vacuously if the filter
    # wrongly drops every result.
    assert [r.entry_id for r in results] == ["c1"]
    assert all(r.metadata["doc_id"] == "book-a" for r in results)
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete(store):
    """A deleted entry no longer appears in query results."""
    probe = _vec(0.5)
    store.upsert("x", probe, {})
    store.delete("x")

    remaining = store.query(probe)

    assert remaining == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_where(store):
    """delete_where removes exactly the matching entries and reports the count."""
    store.upsert("c1", _vec(0.1), {"doc_id": "book-a"})
    store.upsert("c2", _vec(0.2), {"doc_id": "book-a"})
    store.upsert("c3", _vec(0.3), {"doc_id": "book-b"})
    count = store.delete_where({"doc_id": "book-a"})
    assert count == 2
    remaining = store.query(_vec(0.1))
    # Check which entry survived, not just how many.
    assert [r.entry_id for r in remaining] == ["c3"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_nonexistent_is_noop(store):
    """Deleting an unknown ID completes without raising."""
    missing_id = "does-not-exist"
    store.delete(missing_id)  # should not raise
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_query_returns_empty(store):
    """Querying a store with no entries yields an empty list."""
    matches = store.query(_vec(0.1))
    assert matches == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_where_raises_on_empty_filter(store):
    """An empty filter is rejected with ValueError rather than wiping the store."""
    store.upsert("c1", _vec(0.1), {"doc_id": "book-a"})
    empty_filter = {}
    with pytest.raises(ValueError, match="empty"):
        store.delete_where(empty_filter)
|
||||||
Loading…
Reference in a new issue