# knowledge-base/scripts/migrate_qdrant_to_v3.py

#!/usr/bin/env python3
"""Migrate a legacy mem0 v0.1.x/v1.x Qdrant collection to a v2-compatible one.

Why this script exists
----------------------
mem0 v2 stores a `bm25` sparse vector alongside each dense vector to enable
hybrid search. Pre-v2 collections lack that slot — mem0's Qdrant adapter
silently downgrades to semantic-only writes on them. To unlock BM25 you must
recreate the collection with the sparse slot AND copy the existing points
over (preserving id, vector, payload — no re-embed needed).

How it works
------------
1. Connect to Qdrant.
2. Scroll all points from the source collection (with vectors + payload).
3. Upsert them into the target collection in batches.
4. Verify counts match per `user_id`.

The target collection MUST already exist with the BM25 slot. The recommended
way to create it is to boot the v2 backend pointed at `QDRANT_COLLECTION_NAME=<target>`
and trigger one `add()` call — mem0 lazy-creates the collection (and the sister
`<target>_entities` collection) with the right schema.

Usage
-----
    # Dry run (no writes):
    python scripts/migrate_qdrant_to_v3.py \\
        --source mem0 --target mem0_v3 \\
        --qdrant-host localhost --qdrant-port 6333 \\
        --dry-run

    # Real migration:
    python scripts/migrate_qdrant_to_v3.py \\
        --source mem0 --target mem0_v3 \\
        --qdrant-host localhost --qdrant-port 6333

    # From inside the backend container (where Qdrant resolves as `qdrant`):
    docker compose exec backend python /app/../scripts/migrate_qdrant_to_v3.py \\
        --source mem0 --target mem0_v3 --qdrant-host qdrant --qdrant-port 6333

Prereqs
-------
- qdrant-client>=1.12.0 installed
- A fresh Qdrant snapshot of the source collection (see docs/MIGRATION_RUNBOOK.md)
- The target collection created via a v2 backend warm-up add()
"""

import argparse
import sys
from collections import Counter
from typing import Optional

from qdrant_client import QdrantClient
from qdrant_client.http import models


def _positive_int(raw: str) -> int:
    """argparse ``type=`` callback: parse *raw* as an integer that must be >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the migration script.

    Returns:
        Namespace with ``source``, ``target``, ``qdrant_host``, ``qdrant_port``,
        ``batch_size`` and ``dry_run`` attributes.

    Raises:
        SystemExit: raised by argparse (status 2) on missing or invalid options,
            including a ``--batch-size`` below 1.
    """
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--source", required=True, help="Source (legacy) collection name")
    p.add_argument("--target", required=True, help="Target (v2-created) collection name")
    p.add_argument("--qdrant-host", default="localhost")
    p.add_argument("--qdrant-port", type=int, default=6333)
    # The batch size is the scroll/upsert page size; reject 0 or negative values
    # here rather than failing part-way through a run.
    p.add_argument("--batch-size", type=_positive_int, default=256, help="Scroll/upsert batch size")
    p.add_argument("--dry-run", action="store_true", help="Read-only — show counts, no writes")
    return p.parse_args()


def collection_must_exist(client: QdrantClient, name: str) -> models.CollectionInfo:
    """Return the collection info for *name*, or exit with status 2 if it is absent.

    Existence is checked with ``client.collection_exists``; on failure an error
    line is written to stderr before ``sys.exit(2)``.
    """
    if client.collection_exists(name):
        return client.get_collection(name)

    print(f"ERROR: collection {name!r} does not exist on Qdrant.", file=sys.stderr)
    sys.exit(2)


def verify_target_has_bm25(target_info: models.CollectionInfo) -> None:
    """Exit with status 2 unless the target collection declares a ``bm25`` sparse slot.

    Reads ``target_info.config.params.sparse_vectors`` (treated as absent when the
    attribute does not exist) and requires it to be non-empty and contain the
    key ``"bm25"``. Returns ``None`` when the slot is present.
    """
    sparse_slots = getattr(target_info.config.params, "sparse_vectors", None)
    if sparse_slots and "bm25" in sparse_slots:
        return

    message = (
        "ERROR: target collection has no `bm25` sparse-vector slot. Did you create "
        "it via a v2 backend warm-up add()? See docs/MIGRATION_RUNBOOK.md."
    )
    print(message, file=sys.stderr)
    sys.exit(2)


def count_per_user(client: QdrantClient, collection: str) -> Counter:
    """Tally how many points in *collection* carry each ``user_id`` payload value.

    Pages through the collection with ``client.scroll`` (1024 points per page,
    payload restricted to ``user_id``, vectors omitted) until the returned
    offset is ``None``. Points with no payload or no ``user_id`` key are counted
    under the ``"<none>"`` bucket.
    """
    tally: Counter = Counter()
    next_offset: Optional[models.PointId] = None
    exhausted = False
    while not exhausted:
        page, next_offset = client.scroll(
            collection_name=collection,
            limit=1024,
            with_payload=["user_id"],
            with_vectors=False,
            offset=next_offset,
        )
        tally.update(
            (point.payload or {}).get("user_id", "<none>") for point in page
        )
        # A ``None`` offset from scroll marks the last page.
        exhausted = next_offset is None
    return tally


def migrate(
    client: QdrantClient, source: str, target: str, batch_size: int, dry_run: bool
) -> int:
    """Copy every point from *source* into *target*, one scroll page at a time.

    Each page of up to *batch_size* points is read with payload and vectors and,
    unless *dry_run* is set, upserted into *target* with the same id, vector and
    payload (``wait=True``). A running total is printed after every page.

    Returns:
        The number of points read from *source* (counted in dry-run mode too).
    """
    copied = 0
    cursor: Optional[models.PointId] = None
    more_pages = True
    while more_pages:
        batch, cursor = client.scroll(
            collection_name=source,
            limit=batch_size,
            with_payload=True,
            with_vectors=True,
            offset=cursor,
        )
        if not batch:
            break

        if not dry_run:
            structs = [
                models.PointStruct(id=pt.id, vector=pt.vector, payload=pt.payload)
                for pt in batch
            ]
            client.upsert(collection_name=target, points=structs, wait=True)

        copied += len(batch)
        print(f"  ... transferred {copied} points")

        # scroll hands back ``None`` as the next offset once the last page is read.
        more_pages = cursor is not None
    return copied


def main() -> None:
    """Run the migration end to end: pre-flight checks, copy, per-user verification.

    Steps:
        1. Parse CLI options and connect to Qdrant.
        2. Require both collections to exist and the target to have a ``bm25`` slot.
        3. Print total and per-user counts for the source.
        4. Stop here on ``--dry-run``.
        5. Copy all points, then reconcile per-user counts in the target.

    Exit status (via ``sys.exit``):
        0 -- dry run finished without writing.
        2 -- raised by the pre-flight helpers (missing collection / no ``bm25`` slot).
        3 -- at least one user's point count does not reconcile after the copy.
    Returns normally when verification passes.
    """
    args = parse_args()
    client = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)

    # Only existence matters for the source; its collection info is not used.
    collection_must_exist(client, args.source)
    verify_target_has_bm25(collection_must_exist(client, args.target))

    src_count = client.count(args.source, exact=True).count
    tgt_count_before = client.count(args.target, exact=True).count
    print(f"Source {args.source!r}: {src_count} points")
    print(f"Target {args.target!r}: {tgt_count_before} points (before)")
    # A single pre-existing point is tolerated silently (the warm-up add()).
    if tgt_count_before > 1:
        print(
            "WARNING: target collection is non-empty (>1 point). Migration will "
            "upsert into it; ids collide → existing points overwritten."
        )

    print("\nPer-user count (source):")
    src_per_user = count_per_user(client, args.source)
    for uid, c in src_per_user.most_common():
        print(f"  {uid}: {c}")

    if args.dry_run:
        print("\nDRY RUN — no writes performed.")
        sys.exit(0)

    # Snapshot what the target already holds per user (e.g. the warm-up point)
    # so those points are not mistaken for a migration error during verification.
    tgt_per_user_before = (
        count_per_user(client, args.target) if tgt_count_before else Counter()
    )

    print("\nMigrating points (preserving id + vector + payload, no re-embed)...")
    transferred = migrate(client, args.source, args.target, args.batch_size, dry_run=False)

    tgt_count_after = client.count(args.target, exact=True).count
    print(f"\nDone. Transferred {transferred} points.")
    print(f"Target {args.target!r}: {tgt_count_after} points (after)")

    print("\nPer-user count (target, after):")
    tgt_per_user = count_per_user(client, args.target)
    mismatches = 0
    for uid, src_c in src_per_user.most_common():
        tgt_c = tgt_per_user.get(uid, 0)
        pre_c = tgt_per_user_before.get(uid, 0)
        # Every source point must be present. Pre-existing target points may add
        # up to ``pre_c`` extras (fewer if an id collided and was overwritten).
        # With no pre-existing points this is the exact check ``tgt_c == src_c``.
        reconciled = src_c <= tgt_c <= src_c + pre_c
        marker = "OK" if reconciled else f"MISMATCH ({tgt_c})"
        extra = f" pre-existing={pre_c}" if pre_c else ""
        print(f"  {uid}: src={src_c} tgt={tgt_c}{extra} [{marker}]")
        if not reconciled:
            mismatches += 1

    if mismatches:
        print(f"\nERROR: {mismatches} user(s) have count mismatches. Investigate before swap.", file=sys.stderr)
        sys.exit(3)
    print("\nAll per-user counts match. Safe to proceed with collection swap.")

# Script entry point: run the migration only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()