Pin mem0ai[nlp]==2.0.2 and fastembed for the new hybrid-search pipeline. Drop OSS graph memory (removed upstream in 2.0.0, PR #4805): remove Neo4j service, env vars, volumes, and driver deps; mark /graph/relationships deprecated. Rewrite Memory.search/get_all/chat/health call sites to use the v2 filters={} + top_k API (entity IDs at top level now raise ValueError). Tighten MCP remove_memory ownership check to O(1) verify_memory_ownership so it doesn't silently truncate at the new top_k=20 default. Downgrade base image to python:3.12-slim for spaCy. Add scripts/migrate_qdrant_to_v3.py (scroll + upsert with a per-user count parity check) and docs/MIGRATION_RUNBOOK.md, covering snapshot, dump, collection rebuild, cutover, and rollback procedures.
189 lines
6.7 KiB
Python
189 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Migrate a legacy mem0 v0.1.x/v1.x Qdrant collection to a v2-compatible one.
|
|
|
|
Why this script exists
|
|
----------------------
|
|
mem0 v2 stores a `bm25` sparse vector alongside each dense vector to enable
hybrid search. Pre-v2 collections lack that slot — mem0's Qdrant adapter
silently downgrades to semantic-only writes on them. To unlock BM25 you must
recreate the collection with the sparse slot AND copy the existing points
over (preserving id, vector, payload — no re-embed needed).
|
|
|
|
How it works
|
|
------------
|
|
1. Connect to Qdrant.
|
|
2. Scroll all points from the source collection (with vectors + payload).
|
|
3. Upsert them into the target collection in batches.
|
|
4. Verify counts match per `user_id`.
|
|
|
|
The target collection MUST already exist with the BM25 slot. The recommended
way to create it is to boot the v2 backend pointed at `QDRANT_COLLECTION_NAME=<target>`
and trigger one `add()` call — mem0 lazy-creates the collection (and the sister
`<target>_entities` collection) with the right schema.
|
|
|
|
Usage
|
|
-----
|
|
# Dry run (no writes):
|
|
python scripts/migrate_qdrant_to_v3.py \\
|
|
--source mem0 --target mem0_v3 \\
|
|
--qdrant-host localhost --qdrant-port 6333 \\
|
|
--dry-run
|
|
|
|
# Real migration:
|
|
python scripts/migrate_qdrant_to_v3.py \\
|
|
--source mem0 --target mem0_v3 \\
|
|
--qdrant-host localhost --qdrant-port 6333
|
|
|
|
# From inside the backend container (where Qdrant resolves as `qdrant`):
|
|
docker compose exec backend python /app/../scripts/migrate_qdrant_to_v3.py \\
|
|
--source mem0 --target mem0_v3 --qdrant-host qdrant --qdrant-port 6333
|
|
|
|
Prereqs
|
|
-------
|
|
- qdrant-client>=1.12.0 installed
|
|
- A fresh Qdrant snapshot of the source collection (see docs/MIGRATION_RUNBOOK.md)
|
|
- The target collection created via a v2 backend warm-up add()
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from collections import Counter
|
|
from typing import Optional
|
|
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.http import models
|
|
|
|
|
|
def _positive_int(raw: str) -> int:
    """argparse ``type=`` callback: parse *raw* as an integer that must be >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def parse_args() -> argparse.Namespace:
    """Parse the migration script's command-line options from ``sys.argv``.

    The module docstring doubles as the ``--help`` description; the raw
    formatter keeps its hand-wrapped layout instead of re-flowing it.

    Returns:
        Namespace with ``source``, ``target``, ``qdrant_host``,
        ``qdrant_port``, ``batch_size`` and ``dry_run``.

    Raises:
        SystemExit: raised by argparse (status 2) when a required option is
            missing or a value is invalid, including ``--batch-size`` < 1.
    """
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--source", required=True, help="Source (legacy) collection name")
    p.add_argument("--target", required=True, help="Target (v2-created) collection name")
    p.add_argument("--qdrant-host", default="localhost")
    p.add_argument("--qdrant-port", type=int, default=6333)
    # The batch size is used as the scroll page size, so reject zero and
    # negative values here rather than passing them on to Qdrant.
    p.add_argument("--batch-size", type=_positive_int, default=256, help="Scroll/upsert batch size")
    p.add_argument("--dry-run", action="store_true", help="Read-only — show counts, no writes")
    return p.parse_args()
|
|
|
|
|
|
def collection_must_exist(client: QdrantClient, name: str) -> models.CollectionInfo:
    """Return the info for collection *name*, or abort the script if it is absent.

    Args:
        client: Connected Qdrant client used for the existence check and lookup.
        name: Collection name to look up.

    Returns:
        Whatever ``client.get_collection(name)`` returns for an existing collection.

    Raises:
        SystemExit: with status 2, after printing an error to stderr, when
            ``client.collection_exists(name)`` is falsy.
    """
    if client.collection_exists(name):
        return client.get_collection(name)

    # Missing collection is a precondition failure: report and stop.
    print(f"ERROR: collection {name!r} does not exist on Qdrant.", file=sys.stderr)
    sys.exit(2)
|
|
|
|
|
|
def verify_target_has_bm25(target_info: models.CollectionInfo) -> None:
    """Abort the script unless the target collection declares a `bm25` sparse slot.

    Args:
        target_info: Collection info whose ``config.params.sparse_vectors``
            mapping (if the attribute exists at all) is inspected.

    Raises:
        SystemExit: with status 2, after printing an error to stderr, when
            ``sparse_vectors`` is missing, empty, or has no ``"bm25"`` key.
    """
    # getattr with a default: the attribute may not be present on the params object.
    sparse_slots = getattr(target_info.config.params, "sparse_vectors", None)
    if sparse_slots and "bm25" in sparse_slots:
        return

    message = (
        "ERROR: target collection has no `bm25` sparse-vector slot. Did you create "
        "it via a v2 backend warm-up add()? See docs/MIGRATION_RUNBOOK.md."
    )
    print(message, file=sys.stderr)
    sys.exit(2)
|
|
|
|
|
|
def count_per_user(client: QdrantClient, collection: str) -> Counter:
    """Tally the points in *collection* by their ``user_id`` payload field.

    Pages through the collection with ``client.scroll`` (1024 points per
    page, ``user_id`` payload only, no vectors) until the returned offset is
    ``None``.

    Args:
        client: Connected Qdrant client.
        collection: Name of the collection to scan.

    Returns:
        Counter mapping each ``user_id`` to its point count. Points with no
        payload, or with no ``user_id`` key, are counted under ``"<none>"``.
    """
    tally: Counter = Counter()
    cursor: Optional[models.PointId] = None
    more_pages = True
    while more_pages:
        page, cursor = client.scroll(
            collection_name=collection,
            limit=1024,
            with_payload=["user_id"],
            with_vectors=False,
            offset=cursor,
        )
        # `payload or {}` covers points whose payload is None.
        tally.update((point.payload or {}).get("user_id", "<none>") for point in page)
        # A None offset marks the final page.
        more_pages = cursor is not None
    return tally
|
|
|
|
|
|
def migrate(
    client: QdrantClient, source: str, target: str, batch_size: int, dry_run: bool
) -> int:
    """Copy every point from *source* into *target*, one scroll page at a time.

    Each page is read with payload and vectors, rebuilt as ``PointStruct``
    objects carrying the same id, vector and payload, and upserted with
    ``wait=True``. A running total is printed after every non-empty page.

    Args:
        client: Connected Qdrant client.
        source: Collection to read from.
        target: Collection to upsert into.
        batch_size: Page size passed to ``scroll`` (and so the upsert batch size).
        dry_run: When true, pages are read and counted but nothing is written.

    Returns:
        Number of points read from *source* (and written, unless *dry_run*).
    """
    moved = 0
    cursor: Optional[models.PointId] = None
    finished = False
    while not finished:
        batch, cursor = client.scroll(
            collection_name=source,
            limit=batch_size,
            with_payload=True,
            with_vectors=True,
            offset=cursor,
        )
        if batch:
            if not dry_run:
                structs = [
                    models.PointStruct(id=point.id, vector=point.vector, payload=point.payload)
                    for point in batch
                ]
                client.upsert(collection_name=target, points=structs, wait=True)
            moved += len(batch)
            print(f" ... transferred {moved} points")
        # Stop on an empty page or when scroll reports no further offset.
        finished = not batch or cursor is None
    return moved
|
|
|
|
|
|
def main() -> None:
    """Run the migration end to end: validate, report, copy, then verify.

    Steps:
      1. Parse arguments and reject a source that equals the target.
      2. Check that both collections exist and that the target has the
         `bm25` sparse-vector slot.
      3. Print total and per-user point counts for the source.
      4. Unless ``--dry-run`` was given, copy all points and compare the
         per-user counts of source and target.

    Exit status (via ``sys.exit``):
      0 — dry run finished (a successful real run simply returns).
      2 — a precondition failed (same source and target, missing
          collection, or missing `bm25` slot).
      3 — at least one user's point count differs after the copy.
    """
    args = parse_args()

    # Migrating a collection onto itself would rewrite every point in place
    # and then trivially pass the parity check — a misleading "success".
    if args.source == args.target:
        print("ERROR: --source and --target must name different collections.", file=sys.stderr)
        sys.exit(2)

    client = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)

    # Only the existence check matters for the source; its info is not used.
    collection_must_exist(client, args.source)
    tgt_info = collection_must_exist(client, args.target)
    verify_target_has_bm25(tgt_info)

    src_count = client.count(args.source, exact=True).count
    tgt_count_before = client.count(args.target, exact=True).count
    print(f"Source {args.source!r}: {src_count} points")
    print(f"Target {args.target!r}: {tgt_count_before} points (before)")
    # One pre-existing point is tolerated — presumably the point left by the
    # warm-up add() that created the target collection.
    if tgt_count_before > 1:
        print(
            "WARNING: target collection is non-empty (>1 point). Migration will "
            "upsert into it; ids collide → existing points overwritten."
        )

    print("\nPer-user count (source):")
    src_per_user = count_per_user(client, args.source)
    for uid, c in src_per_user.most_common():
        print(f" {uid}: {c}")

    if args.dry_run:
        print("\nDRY RUN — no writes performed.")
        sys.exit(0)

    print("\nMigrating points (preserving id + vector + payload, no re-embed)...")
    transferred = migrate(client, args.source, args.target, args.batch_size, dry_run=False)

    tgt_count_after = client.count(args.target, exact=True).count
    print(f"\nDone. Transferred {transferred} points.")
    print(f"Target {args.target!r}: {tgt_count_after} points (after)")

    print("\nPer-user count (target, after):")
    tgt_per_user = count_per_user(client, args.target)
    mismatches = 0
    # Parity is checked for every user present in the source; users that
    # exist only in the target are not reported.
    for uid, src_c in src_per_user.most_common():
        tgt_c = tgt_per_user.get(uid, 0)
        marker = "OK" if tgt_c == src_c else f"MISMATCH ({tgt_c})"
        print(f" {uid}: src={src_c} tgt={tgt_c} [{marker}]")
        if tgt_c != src_c:
            mismatches += 1

    if mismatches:
        print(f"\nERROR: {mismatches} user(s) have count mismatches. Investigate before swap.", file=sys.stderr)
        sys.exit(3)
    print("\nAll per-user counts match. Safe to proceed with collection swap.")
|
|
|
|
|
|
# Script entry point: run the migration only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|