-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Open
Labels
Description
Expected Behavior
When requesting features like ["documents:text", "documents:embedding", "documents:category"], all three features should be returned for each matched document, along with the computed distance (and text_rank for hybrid search).
Current Behavior
The text and category fields are missing or empty, even though they were written correctly via write_to_online_store(). This happens on retrieval with embeddings only, and also on retrieval with both a query string and embeddings.
Steps to reproduce
"""
Reproduce: retrieve_online_documents_v2 returns only embedding, missing other features.
Prerequisites:
1. Run a PostgreSQL container with pgvector:
docker run -d --name postgres-pgvector \
-e POSTGRES_USER=feast \
-e POSTGRES_PASSWORD=feast \
-e POSTGRES_DB=feast \
-p 5432:5432 \
pgvector/pgvector:pg16
2. Enable the pgvector extension:
docker exec -it postgres-pgvector \
psql -U feast -c "CREATE EXTENSION IF NOT EXISTS vector;"
3. pip install feast sentence-transformers psycopg[binary] psycopg_pool
"""
import os
import tempfile
from datetime import datetime, timedelta, timezone
import pandas as pd
from sentence_transformers import SentenceTransformer
from feast import Entity, FeatureStore, FeatureView, Field, FileSource, ValueType
from feast.types import Array, Float32, String
# Sentence-transformers checkpoint used to embed both the documents and the query.
MODEL_NAME = "all-MiniLM-L6-v2"
# Number of documents requested from each retrieval call.
TOP_K = 3

model = SentenceTransformer(MODEL_NAME)
# Reused below as the vector_length of the embedding field.
EMBEDDING_DIM = model.get_sentence_embedding_dimension()

# Small corpus that is embedded and written to the online store.
docs = [
    "Python is a high-level programming language known for its readability.",
    "Machine learning enables systems to learn from data.",
    "Neural networks are inspired by biological neural networks in animal brains.",
    "Feast is an open source feature store for machine learning.",
    "PostgreSQL is a powerful open source relational database.",
]
# One normalized embedding per document, converted to plain Python lists.
doc_embeddings = model.encode(docs, normalize_embeddings=True).tolist()

# Throwaway Feast repository; its data/ sub-directory holds the registry
# and the parquet source file.
repo_dir = tempfile.mkdtemp(prefix="feast_pg_bug_")
data_dir = os.path.join(repo_dir, "data")
os.makedirs(data_dir, exist_ok=True)
# Feast repository configuration pointing the online store at the local
# PostgreSQL instance described in the prerequisites.
# The connection settings must be nested under `online_store`; without the
# indentation YAML parses `online_store` as null and treats `type`, `host`,
# etc. as unrelated top-level keys.
yaml_content = """
project: test_pgvector_bug
registry: data/registry.db
provider: local
online_store:
    type: postgres
    host: localhost
    port: 5432
    database: feast
    db_schema: public
    user: feast
    password: feast
    vector_enabled: true
entity_key_serialization_version: 3
"""
# Write the configuration where FeatureStore(repo_path=repo_dir) looks for it.
with open(os.path.join(repo_dir, "feature_store.yaml"), "w") as f:
    f.write(yaml_content)
# A single timezone-aware timestamp shared by every row.
now = datetime.now(timezone.utc)

# One row per document: entity key, raw text, embedding, category, timestamp.
df = pd.DataFrame(
    {
        "doc_id": [f"doc-{i}" for i, _ in enumerate(docs)],
        "text": docs,
        "embedding": doc_embeddings,
        "category": ["programming", "ml", "ml", "ml", "database"],
        "event_timestamp": [now for _ in docs],
    }
)

# Persist the frame so it can back the FileSource of the feature view.
parquet_path = os.path.join(data_dir, "documents.parquet")
df.to_parquet(parquet_path)
# Entity keyed by the string document id.
doc_entity = Entity(
    name="doc_id",
    join_keys=["doc_id"],
    value_type=ValueType.STRING,
)

# Batch source backed by the parquet file written above.
doc_source = FileSource(
    name="doc_source",
    path=parquet_path,
    timestamp_field="event_timestamp",
)

# Feature view with two plain string features plus the indexed embedding.
# All three are requested in the retrieval calls further down.
doc_fv = FeatureView(
    name="documents",
    entities=[doc_entity],
    ttl=timedelta(days=1),
    schema=[
        Field(name="text", dtype=String),
        Field(name="category", dtype=String),
        Field(
            name="embedding",
            dtype=Array(Float32),
            vector_index=True,
            vector_length=EMBEDDING_DIM,
            vector_search_metric="cosine",
        ),
    ],
    source=doc_source,
    online=True,
)
# Register the entity and feature view, then push every row of the
# DataFrame into the online store.
store = FeatureStore(repo_path=repo_dir)
store.apply([doc_entity, doc_fv])
store.write_to_online_store(
    feature_view_name="documents",
    df=df,
)
print(f"Wrote {len(df)} documents to online store\n")
# --- Case 1: Vector search only ---
print("=== Case 1: Vector search only ===")
# encode() is given a one-element batch; keep its only vector as a plain list.
query_embedding = model.encode(["What is Python?"], normalize_embeddings=True)[0].tolist()
results = store.retrieve_online_documents_v2(
    features=["documents:text", "documents:embedding", "documents:category"],
    query=query_embedding,
    top_k=TOP_K,
    distance_metric="cosine",
).to_dict()
print(f" 'text' values: {results.get('text', [])}")
print(f" 'category' values: {results.get('category', [])}")
print(f" 'distance' values: {results.get('distance', [])}")
has_embedding = bool(results.get("embedding", []))
print(f" 'embedding' present: {has_embedding}")
# The reported bug: the non-vector features come back missing or empty.
if not results.get("text") or not results.get("category"):
    print(" BUG: 'text' and/or 'category' are missing from results!")
else:
    print(" OK: all features returned")
# --- Case 2: Hybrid search (vector + query_string) ---
print("\n=== Case 2: Hybrid search (vector + query_string) ===")
# Same request as the vector-only case, with a query_string added.
results = store.retrieve_online_documents_v2(
    features=["documents:text", "documents:embedding", "documents:category"],
    query=query_embedding,
    query_string="machine learning",
    top_k=TOP_K,
    distance_metric="cosine",
).to_dict()
print(f" 'text' values: {results.get('text', [])}")
print(f" 'category' values: {results.get('category', [])}")
print(f" 'distance' values: {results.get('distance', [])}")
has_embedding = bool(results.get("embedding", []))
print(f" 'embedding' present: {has_embedding}")
# The reported bug: the non-vector features come back missing or empty.
if not results.get("text") or not results.get("category"):
    print(" BUG: 'text' and/or 'category' are missing from results!")
else:
    print(" OK: all features returned")
# --- Case 3: Text search only ---
print("\n=== Case 3: Text search only (query_string only) ===")
# No query vector is passed here; only the query_string drives the search.
results = store.retrieve_online_documents_v2(
    features=["documents:text", "documents:embedding", "documents:category"],
    query_string="machine learning",
    top_k=TOP_K,
    distance_metric="cosine",
).to_dict()
print(f" 'text' values: {results.get('text', [])}")
print(f" 'category' values: {results.get('category', [])}")
print(f" 'text_rank' values:{results.get('text_rank', [])}")
has_embedding = bool(results.get("embedding", []))
print(f" 'embedding' present: {has_embedding}")
# NOTE(review): the missing-feature check is disabled for this case in the
# original report; kept here as comments so it can be re-enabled if needed.
# if not results.get("text") or not results.get("category"):
#     print(" BUG: 'text' and/or 'category' are missing from results!")
# else:
#     print(" OK: all features returned")
print("\nDone.")
Specifications
- Version: 26.3
- Platform: Macos
- Subsystem:
Possible Solution
Fix the retrieval query in the retrieve_online_documents_v2 function in postgres.py.
Reactions are currently unavailable