#!/usr/bin/env python3
"""Migrate a legacy mem0 v0.1.x/v1.x Qdrant collection to a v2-compatible one.

Why this script exists
----------------------
mem0 v2 stores a `bm25` sparse vector alongside each dense vector to enable
hybrid search. Pre-v2 collections lack that slot — mem0's Qdrant adapter
silently downgrades to semantic-only writes on them. To unlock BM25 you must
recreate the collection with the sparse slot AND copy the existing points over
(preserving id, vector, payload — no re-embed needed).

How it works
------------
1. Connect to Qdrant.
2. Scroll all points from the source collection (with vectors + payload).
3. Upsert them into the target collection in batches.
4. Verify counts match per `user_id`.

The target collection MUST already exist with the BM25 slot. The recommended
way to create it is to boot the v2 backend pointed at
`QDRANT_COLLECTION_NAME=<target>` and trigger one `add()` call — mem0
lazy-creates the collection (and the sister `_entities` collection) with the
right schema.

Usage
-----
    # Dry run (no writes):
    python scripts/migrate_qdrant_to_v3.py \\
        --source mem0 --target mem0_v3 \\
        --qdrant-host localhost --qdrant-port 6333 \\
        --dry-run

    # Real migration:
    python scripts/migrate_qdrant_to_v3.py \\
        --source mem0 --target mem0_v3 \\
        --qdrant-host localhost --qdrant-port 6333

    # From inside the backend container (where Qdrant resolves as `qdrant`):
    docker compose exec backend python /app/../scripts/migrate_qdrant_to_v3.py \\
        --source mem0 --target mem0_v3 --qdrant-host qdrant --qdrant-port 6333

Prereqs
-------
- qdrant-client>=1.12.0 installed
- A fresh Qdrant snapshot of the source collection (see docs/MIGRATION_RUNBOOK.md)
- The target collection created via a v2 backend warm-up add()
"""

import argparse
import sys
from collections import Counter
from typing import Optional

from qdrant_client import QdrantClient
from qdrant_client.http import models


def _positive_int(raw: str) -> int:
    """argparse ``type=`` callable that accepts only integers >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def parse_args() -> argparse.Namespace:
    """Parse the command line.

    The module docstring doubles as the ``--help`` description, so it is
    rendered with ``RawDescriptionHelpFormatter`` to keep its layout.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--source", required=True, help="Source (legacy) collection name")
    p.add_argument("--target", required=True, help="Target (v2-created) collection name")
    p.add_argument("--qdrant-host", default="localhost")
    p.add_argument("--qdrant-port", type=int, default=6333)
    # A zero or negative batch size would be handed straight to scroll() as
    # its `limit`; reject it here so the user gets a usage error instead.
    p.add_argument(
        "--batch-size",
        type=_positive_int,
        default=256,
        help="Scroll/upsert batch size",
    )
    p.add_argument("--dry-run", action="store_true", help="Read-only — show counts, no writes")
    return p.parse_args()


def collection_must_exist(client: QdrantClient, name: str) -> models.CollectionInfo:
    """Return the collection info for *name*, or exit with status 2 if absent."""
    if not client.collection_exists(name):
        print(f"ERROR: collection {name!r} does not exist on Qdrant.", file=sys.stderr)
        sys.exit(2)
    return client.get_collection(name)


def verify_target_has_bm25(target_info: models.CollectionInfo) -> None:
    """Exit with status 2 unless the target declares a `bm25` sparse vector."""
    # getattr: tolerate a params object that has no `sparse_vectors` attribute.
    sparse = getattr(target_info.config.params, "sparse_vectors", None)
    if not sparse or "bm25" not in sparse:
        print(
            "ERROR: target collection has no `bm25` sparse-vector slot. Did you create "
            "it via a v2 backend warm-up add()? See docs/MIGRATION_RUNBOOK.md.",
            file=sys.stderr,
        )
        sys.exit(2)


def count_per_user(client: QdrantClient, collection: str) -> Counter:
    """Count the points in *collection* grouped by their `user_id` payload value.

    Points with no payload or no `user_id` key are tallied under ``""``.
    Only the `user_id` payload field is fetched, and no vectors.
    """
    counts: Counter = Counter()
    offset: Optional[models.PointId] = None
    while True:
        points, offset = client.scroll(
            collection_name=collection,
            limit=1024,
            with_payload=["user_id"],
            with_vectors=False,
            offset=offset,
        )
        for p in points:
            uid = (p.payload or {}).get("user_id", "")
            counts[uid] += 1
        # scroll() returns a None offset once the last page has been served.
        if offset is None:
            break
    return counts


def migrate(
    client: QdrantClient, source: str, target: str, batch_size: int, dry_run: bool
) -> int:
    """Copy every point from *source* into *target*, one scroll page at a time.

    Ids, vectors and payloads are passed through unchanged. When *dry_run* is
    true the source is still scrolled and counted but nothing is written.

    Returns:
        The number of points read from *source*.
    """
    transferred = 0
    offset: Optional[models.PointId] = None
    while True:
        points, offset = client.scroll(
            collection_name=source,
            limit=batch_size,
            with_payload=True,
            with_vectors=True,
            offset=offset,
        )
        if not points:
            break
        if not dry_run:
            client.upsert(
                collection_name=target,
                points=[
                    models.PointStruct(id=p.id, vector=p.vector, payload=p.payload)
                    for p in points
                ],
                # Block until the batch is applied so the count check that
                # follows the migration sees every written point.
                wait=True,
            )
        transferred += len(points)
        print(f"  ... transferred {transferred} points")
        if offset is None:
            break
    return transferred


def main() -> None:
    """Run the migration end to end and verify per-user counts.

    Exit codes: 0 on success or after a dry run, 2 when a collection is
    missing or the target lacks the `bm25` slot, 3 when per-user counts
    differ between source and target after the copy.
    """
    args = parse_args()
    client = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)

    # Only existence matters for the source; its info object is not needed.
    collection_must_exist(client, args.source)
    tgt_info = collection_must_exist(client, args.target)
    verify_target_has_bm25(tgt_info)

    src_count = client.count(args.source, exact=True).count
    tgt_count_before = client.count(args.target, exact=True).count
    print(f"Source {args.source!r}: {src_count} points")
    print(f"Target {args.target!r}: {tgt_count_before} points (before)")

    # A single pre-existing point is tolerated without a warning (the module
    # docstring's warm-up add() would leave one behind).
    if tgt_count_before > 1:
        print(
            "WARNING: target collection is non-empty (>1 point). Migration will "
            "upsert into it; ids collide → existing points overwritten."
        )

    print("\nPer-user count (source):")
    src_per_user = count_per_user(client, args.source)
    for uid, c in src_per_user.most_common():
        print(f"  {uid}: {c}")

    if args.dry_run:
        print("\nDRY RUN — no writes performed.")
        sys.exit(0)

    print("\nMigrating points (preserving id + vector + payload, no re-embed)...")
    transferred = migrate(client, args.source, args.target, args.batch_size, dry_run=False)

    tgt_count_after = client.count(args.target, exact=True).count
    print(f"\nDone. Transferred {transferred} points.")
    print(f"Target {args.target!r}: {tgt_count_after} points (after)")

    print("\nPer-user count (target, after):")
    tgt_per_user = count_per_user(client, args.target)
    mismatches = 0
    # Only users present in the source are compared; users that exist solely
    # in the target are not reported.
    for uid, src_c in src_per_user.most_common():
        tgt_c = tgt_per_user.get(uid, 0)
        marker = "OK" if tgt_c == src_c else f"MISMATCH ({tgt_c})"
        print(f"  {uid}: src={src_c} tgt={tgt_c} [{marker}]")
        if tgt_c != src_c:
            mismatches += 1

    if mismatches:
        print(
            f"\nERROR: {mismatches} user(s) have count mismatches. Investigate before swap.",
            file=sys.stderr,
        )
        sys.exit(3)
    print("\nAll per-user counts match. Safe to proceed with collection swap.")


if __name__ == "__main__":
    main()