knowledge-base/scripts/migrate_qdrant_to_v3.py
Pratik Narola 0f0addb36b chore: migrate to mem0ai v2.0.2 (V3 memory pipeline)
Pin mem0ai[nlp]==2.0.2 and fastembed for the new hybrid-search pipeline.
Drop OSS graph memory (removed upstream in 2.0.0, PR #4805): remove Neo4j
service, env vars, volumes, and driver deps; mark /graph/relationships
deprecated. Rewrite Memory.search/get_all/chat/health call sites to use
the v2 filters={} + top_k API (entity IDs at top level now raise
ValueError). Tighten MCP remove_memory ownership check to O(1)
verify_memory_ownership so it doesn't silently truncate at the new
top_k=20 default. Downgrade base image to python:3.12-slim for spaCy.

Adds scripts/migrate_qdrant_to_v3.py (scroll+upsert with per-user count
parity check) and docs/MIGRATION_RUNBOOK.md covering snapshot, dump,
collection rebuild, cutover, and rollback procedures.
2026-05-23 14:49:45 +05:30

189 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""Migrate a legacy mem0 v0.1.x/v1.x Qdrant collection to a v2-compatible one.
Why this script exists
----------------------
mem0 v2 stores a `bm25` sparse vector alongside each dense vector to enable
hybrid search. Pre-v2 collections lack that slot — mem0's Qdrant adapter
silently downgrades to semantic-only writes on them. To unlock BM25 you must
recreate the collection with the sparse slot AND copy the existing points
over (preserving id, vector, payload — no re-embed needed).
How it works
------------
1. Connect to Qdrant.
2. Scroll all points from the source collection (with vectors + payload).
3. Upsert them into the target collection in batches.
4. Verify counts match per `user_id`.
The target collection MUST already exist with the BM25 slot. The recommended
way to create it is to boot the v2 backend pointed at `QDRANT_COLLECTION_NAME=<target>`
and trigger one `add()` call — mem0 lazy-creates the collection (and the sister
`<target>_entities` collection) with the right schema.
Usage
-----
# Dry run (no writes):
python scripts/migrate_qdrant_to_v3.py \\
--source mem0 --target mem0_v3 \\
--qdrant-host localhost --qdrant-port 6333 \\
--dry-run
# Real migration:
python scripts/migrate_qdrant_to_v3.py \\
--source mem0 --target mem0_v3 \\
--qdrant-host localhost --qdrant-port 6333
# From inside the backend container (where Qdrant resolves as `qdrant`):
docker compose exec backend python /app/../scripts/migrate_qdrant_to_v3.py \\
--source mem0 --target mem0_v3 --qdrant-host qdrant --qdrant-port 6333
Prereqs
-------
- qdrant-client>=1.12.0 installed
- A fresh Qdrant snapshot of the source collection (see docs/MIGRATION_RUNBOOK.md)
- The target collection created via a v2 backend warm-up add()
"""
import argparse
import sys
from collections import Counter
from typing import Optional
from qdrant_client import QdrantClient
from qdrant_client.http import models
def _positive_int(raw: str) -> int:
    """Argparse ``type=`` hook: parse *raw* as an integer that must be >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the migration script.

    Returns:
        A namespace with ``source``, ``target``, ``qdrant_host``,
        ``qdrant_port``, ``batch_size`` and ``dry_run``.

    Raises:
        SystemExit: with status 2 (argparse usage error) when a required
            option is missing or a value is invalid, including a
            ``--batch-size`` that is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        # The module docstring doubles as the --help text; the raw formatter
        # keeps its hand-made layout intact.
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--source", required=True, help="Source (legacy) collection name")
    parser.add_argument("--target", required=True, help="Target (v2-created) collection name")
    parser.add_argument("--qdrant-host", default="localhost")
    parser.add_argument("--qdrant-port", type=int, default=6333)
    # The batch size is used as the scroll page limit, so reject 0/negatives
    # here instead of failing later inside the Qdrant client.
    parser.add_argument(
        "--batch-size",
        type=_positive_int,
        default=256,
        help="Scroll/upsert batch size",
    )
    parser.add_argument("--dry-run", action="store_true", help="Read-only — show counts, no writes")
    return parser.parse_args()
def collection_must_exist(client: QdrantClient, name: str) -> models.CollectionInfo:
    """Return the collection info for *name*, aborting the script if it is absent.

    Args:
        client: Connected Qdrant client.
        name: Collection to look up.

    Returns:
        Whatever ``client.get_collection(name)`` returns for an existing collection.

    Raises:
        SystemExit: with status 2, after printing an error to stderr, when
            ``client.collection_exists(name)`` is false.
    """
    if client.collection_exists(name):
        return client.get_collection(name)

    print(f"ERROR: collection {name!r} does not exist on Qdrant.", file=sys.stderr)
    sys.exit(2)
def verify_target_has_bm25(target_info: models.CollectionInfo) -> None:
    """Abort unless the target collection declares a ``bm25`` sparse-vector slot.

    Args:
        target_info: Collection info of the migration target; its
            ``config.params.sparse_vectors`` mapping is inspected.

    Raises:
        SystemExit: with status 2, after printing an error to stderr, when the
            ``sparse_vectors`` attribute is missing, empty/None, or has no
            ``"bm25"`` key.
    """
    sparse_slots = getattr(target_info.config.params, "sparse_vectors", None)
    if sparse_slots and "bm25" in sparse_slots:
        return

    message = (
        "ERROR: target collection has no `bm25` sparse-vector slot. Did you create "
        "it via a v2 backend warm-up add()? See docs/MIGRATION_RUNBOOK.md."
    )
    print(message, file=sys.stderr)
    sys.exit(2)
def count_per_user(client: QdrantClient, collection: str) -> Counter:
    """Tally the points of *collection* by their ``user_id`` payload field.

    The collection is scrolled in pages of 1024 points, fetching only the
    ``user_id`` payload key and no vectors.

    Args:
        client: Connected Qdrant client.
        collection: Name of the collection to scan.

    Returns:
        A Counter mapping each ``user_id`` value to its number of points.
        Points with no payload, or whose payload lacks ``user_id``, are
        counted under the key ``"<none>"``.
    """
    tally: Counter = Counter()
    cursor: Optional[models.PointId] = None
    more_pages = True
    while more_pages:
        page, cursor = client.scroll(
            collection_name=collection,
            limit=1024,
            with_payload=["user_id"],
            with_vectors=False,
            offset=cursor,
        )
        tally.update((pt.payload or {}).get("user_id", "<none>") for pt in page)
        # scroll() hands back None as the next offset once the last page is read.
        more_pages = cursor is not None
    return tally
def migrate(
    client: QdrantClient, source: str, target: str, batch_size: int, dry_run: bool
) -> int:
    """Copy every point of *source* into *target*, one scroll page at a time.

    Each page is read with payload and vectors and, unless *dry_run* is set,
    upserted into *target* with the same id, vector and payload (waiting for
    the write to complete). A running total is printed after every page.

    Args:
        client: Connected Qdrant client.
        source: Collection to read from.
        target: Collection to upsert into.
        batch_size: Page size used for each scroll (and thus each upsert).
        dry_run: When true, pages are read and counted but nothing is written.

    Returns:
        The number of points read from *source*.
    """
    moved = 0
    cursor: Optional[models.PointId] = None
    while True:
        batch, cursor = client.scroll(
            collection_name=source,
            limit=batch_size,
            with_payload=True,
            with_vectors=True,
            offset=cursor,
        )
        if not batch:
            return moved

        if not dry_run:
            structs = []
            for pt in batch:
                structs.append(
                    models.PointStruct(id=pt.id, vector=pt.vector, payload=pt.payload)
                )
            client.upsert(collection_name=target, points=structs, wait=True)

        moved += len(batch)
        print(f" ... transferred {moved} points")

        # A None cursor means the page just processed was the last one.
        if cursor is None:
            return moved
def main() -> None:
    """Run the migration end to end and report per-user count parity.

    Steps: parse options, check that both collections exist and that the
    target has a ``bm25`` sparse slot, print total and per-user counts for the
    source, then (unless ``--dry-run``) copy all points and compare per-user
    counts between source and target.

    Raises:
        SystemExit: status 0 after a dry run; status 2 when source and target
            name the same collection, a collection is missing, or the target
            lacks the ``bm25`` slot; status 3 when any source user's count
            differs in the target after migration.
    """
    args = parse_args()

    # Copying a collection onto itself would make the parity check below
    # compare the collection with itself and vacuously report success.
    if args.source == args.target:
        print(
            f"ERROR: --source and --target are both {args.source!r}; "
            "they must name different collections.",
            file=sys.stderr,
        )
        sys.exit(2)

    client = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)

    # Only the existence check matters for the source; its info is not used.
    collection_must_exist(client, args.source)
    tgt_info = collection_must_exist(client, args.target)
    verify_target_has_bm25(tgt_info)

    src_count = client.count(args.source, exact=True).count
    tgt_count_before = client.count(args.target, exact=True).count
    print(f"Source {args.source!r}: {src_count} points")
    print(f"Target {args.target!r}: {tgt_count_before} points (before)")
    # NOTE(review): the threshold is 1 rather than 0 presumably because the
    # warm-up add() described in the module docstring leaves one point behind.
    if tgt_count_before > 1:
        print(
            "WARNING: target collection is non-empty (>1 point). Migration will "
            "upsert into it; ids collide → existing points overwritten."
        )

    print("\nPer-user count (source):")
    src_per_user = count_per_user(client, args.source)
    for uid, c in src_per_user.most_common():
        print(f" {uid}: {c}")

    if args.dry_run:
        print("\nDRY RUN — no writes performed.")
        sys.exit(0)

    print("\nMigrating points (preserving id + vector + payload, no re-embed)...")
    transferred = migrate(client, args.source, args.target, args.batch_size, dry_run=False)

    tgt_count_after = client.count(args.target, exact=True).count
    print(f"\nDone. Transferred {transferred} points.")
    print(f"Target {args.target!r}: {tgt_count_after} points (after)")

    print("\nPer-user count (target, after):")
    tgt_per_user = count_per_user(client, args.target)
    mismatches = 0
    # Only users present in the source are compared; users that exist solely
    # in the target are not reported here.
    for uid, src_c in src_per_user.most_common():
        tgt_c = tgt_per_user.get(uid, 0)
        marker = "OK" if tgt_c == src_c else f"MISMATCH ({tgt_c})"
        print(f" {uid}: src={src_c} tgt={tgt_c} [{marker}]")
        if tgt_c != src_c:
            mismatches += 1

    if mismatches:
        print(
            f"\nERROR: {mismatches} user(s) have count mismatches. Investigate before swap.",
            file=sys.stderr,
        )
        sys.exit(3)
    print("\nAll per-user counts match. Safe to proceed with collection swap.")


if __name__ == "__main__":
    main()