Skip to content

Bug: retrieve_online_documents_v2 in the PostgreSQL online store returns results missing some requested features #6097

@patelchaitany

Description

@patelchaitany

Expected Behavior

When requesting features like ["documents:text", "documents:embedding", "documents:category"], all three features should be returned for each matched document, along with the computed distance (and text_rank for hybrid search).

Image

Current Behavior

The text and category fields are missing or empty, even though they were written correctly via write_to_online_store(). This happens both on retrieval with embeddings only and on hybrid retrieval with a query string plus embeddings.

Image

Steps to reproduce

"""
Reproduce: retrieve_online_documents_v2 returns only embedding, missing other features.

Prerequisites:
  1. Run a PostgreSQL container with pgvector:
     docker run -d --name postgres-pgvector \
       -e POSTGRES_USER=feast \
       -e POSTGRES_PASSWORD=feast \
       -e POSTGRES_DB=feast \
       -p 5432:5432 \
       pgvector/pgvector:pg16

  2. Enable the pgvector extension:
     docker exec -it postgres-pgvector \
       psql -U feast -c "CREATE EXTENSION IF NOT EXISTS vector;"

  3. pip install feast sentence-transformers psycopg[binary] psycopg_pool
"""

import os
import tempfile
from datetime import datetime, timedelta, timezone

import pandas as pd
from sentence_transformers import SentenceTransformer

from feast import Entity, FeatureStore, FeatureView, Field, FileSource, ValueType
from feast.types import Array, Float32, String

# Name of the sentence-transformers model used to embed the documents and the query.
MODEL_NAME = "all-MiniLM-L6-v2"
# Number of results requested from each retrieve_online_documents_v2 call below.
TOP_K = 3

model = SentenceTransformer(MODEL_NAME)
# Ask the model for its embedding width so the vector field in the feature
# view can be declared with a matching vector_length.
EMBEDDING_DIM = model.get_sentence_embedding_dimension()

# Small corpus: each string becomes one row (one document) in the DataFrame.
docs = [
    "Python is a high-level programming language known for its readability.",
    "Machine learning enables systems to learn from data.",
    "Neural networks are inspired by biological neural networks in animal brains.",
    "Feast is an open source feature store for machine learning.",
    "PostgreSQL is a powerful open source relational database.",
]

# One embedding per document, converted to plain Python lists so they can be
# stored in a DataFrame column. Normalization is requested from the encoder.
doc_embeddings = model.encode(docs, normalize_embeddings=True).tolist()

# Throwaway Feast repo in a fresh temp directory; this script does not delete it.
repo_dir = tempfile.mkdtemp(prefix="feast_pg_bug_")
data_dir = os.path.join(repo_dir, "data")
os.makedirs(data_dir, exist_ok=True)

# feature_store.yaml contents: local provider with a postgres online store and
# vector_enabled set. The connection values (user/password/database "feast",
# port 5432) match the docker command in the prerequisites docstring.
yaml_content = """
project: test_pgvector_bug
registry: data/registry.db
provider: local
online_store:
    type: postgres
    host: localhost
    port: 5432
    database: feast
    db_schema: public
    user: feast
    password: feast
    vector_enabled: true
entity_key_serialization_version: 3
"""
with open(os.path.join(repo_dir, "feature_store.yaml"), "w") as f:
    f.write(yaml_content)

# Single timezone-aware (UTC) timestamp shared by every row.
now = datetime.now(timezone.utc)
# One row per document; "doc_id" is the entity join key and the remaining
# columns line up with the feature view schema plus the timestamp field.
df = pd.DataFrame(
    {
        "doc_id": [f"doc-{i}" for i in range(len(docs))],
        "text": docs,
        "embedding": doc_embeddings,
        "category": ["programming", "ml", "ml", "ml", "database"],
        "event_timestamp": [now] * len(docs),
    }
)

# Persist the frame as the parquet file that backs the FileSource below.
parquet_path = os.path.join(data_dir, "documents.parquet")
df.to_parquet(parquet_path)

# Entity keyed on the string column "doc_id".
doc_entity = Entity(name="doc_id", join_keys=["doc_id"], value_type=ValueType.STRING)

doc_source = FileSource(
    name="doc_source", path=parquet_path, timestamp_field="event_timestamp"
)

# Feature view "documents": two string features plus the vector-indexed
# embedding field. The feature references used later ("documents:text", etc.)
# refer to these field names.
doc_fv = FeatureView(
    name="documents",
    entities=[doc_entity],
    ttl=timedelta(days=1),
    schema=[
        Field(name="text", dtype=String),
        Field(name="category", dtype=String),
        Field(
            name="embedding",
            dtype=Array(Float32),
            vector_index=True,
            vector_length=EMBEDDING_DIM,
            vector_search_metric="cosine",
        ),
    ],
    source=doc_source,
    online=True,
)

# Register the entity and feature view, then write the DataFrame rows
# (including text and category) directly to the online store.
store = FeatureStore(repo_path=repo_dir)
store.apply([doc_entity, doc_fv])
store.write_to_online_store(feature_view_name="documents", df=df)
print(f"Wrote {len(df)} documents to online store\n")

# Feature references requested in every case; all three are expected back.
FEATURES = ("documents:text", "documents:embedding", "documents:category")

# Column at which printed values start, so the value lists line up. The longest
# label ("  'text_rank' values:") is exactly this wide, hence no space after it.
_VALUE_COLUMN = 21


def _search(**query_kwargs):
    """Run retrieve_online_documents_v2 with the shared arguments.

    Args:
        **query_kwargs: The per-case search inputs (``query`` and/or
            ``query_string``), forwarded unchanged.

    Returns:
        The response converted with ``.to_dict()``.
    """
    return store.retrieve_online_documents_v2(
        # Fresh list per call, as in the original per-case literals.
        features=list(FEATURES),
        top_k=TOP_K,
        distance_metric="cosine",
        **query_kwargs,
    ).to_dict()


def _print_values(results, key):
    """Print the values returned for ``key`` (empty list if the key is absent)."""
    label = f"  '{key}' values:"
    print(f"{label:<{_VALUE_COLUMN}}{results.get(key, [])}")


def _report(results, score_key, *, verify):
    """Print the returned fields for one case and optionally flag the bug.

    Args:
        results: Dict returned by ``_search``.
        score_key: Name of the score column to show ("distance" or "text_rank").
        verify: When true, print a BUG/OK verdict depending on whether both
            'text' and 'category' came back non-empty.
    """
    for key in ("text", "category", score_key):
        _print_values(results, key)
    # Embeddings are long, so only report whether any were returned.
    has_embedding = bool(results.get("embedding", []))
    print(f"  'embedding' present: {has_embedding}")

    if not verify:
        return
    if not results.get("text") or not results.get("category"):
        print("  BUG: 'text' and/or 'category' are missing from results!")
    else:
        print("  OK: all features returned")


# --- Case 1: Vector search only ---
print("=== Case 1: Vector search only ===")
query_embedding = model.encode(["What is Python?"], normalize_embeddings=True)[0].tolist()
results = _search(query=query_embedding)
_report(results, "distance", verify=True)

# --- Case 2: Hybrid search (vector + query_string) ---
print("\n=== Case 2: Hybrid search (vector + query_string) ===")
results = _search(query=query_embedding, query_string="machine learning")
_report(results, "distance", verify=True)

# --- Case 3: Text search only ---
print("\n=== Case 3: Text search only (query_string only) ===")
results = _search(query_string="machine learning")
# NOTE(review): the BUG/OK verdict was disabled (commented out) for this case in
# the original script, and the issue text only reports the missing fields for
# the vector-only and hybrid searches. Set verify=True to check this case too.
_report(results, "text_rank", verify=False)

print("\nDone.")

Specifications

  • Version: 26.3
  • Platform: macOS
  • Subsystem:

Possible Solution

Fix the retrieval query in the retrieve_online_documents_v2 function in postgres.py.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions