fix: Resolve duplicate chunk issue for PGVector [COG-895] (#705)

## Description
Resolves duplicate chunks in the PGVector adapter: when a data point whose id already exists in the collection is ingested again, the adapter now updates the existing row's vector and payload instead of inserting a second copy.
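
For reviewers, a minimal sketch of the check-then-update flow this change adopts (the `upsert_data_point` helper and its parameters are illustrative only, not part of the PR; the pattern mirrors the diff below):

```python
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession


async def upsert_data_point(session: AsyncSession, model, id_, vector, payload):
    """Illustrative helper: update the row with this id if it exists, else create it."""
    existing = (
        await session.execute(select(model).filter(model.id == id_))
    ).scalar_one_or_none()

    if existing:
        # Row already present: refresh its vector and payload in place
        # instead of adding a duplicate row to the session.
        existing.vector = vector
        existing.payload = payload
        return existing

    # No row yet: build a fresh instance for session.add_all().
    return model(id=id_, vector=vector, payload=payload)
```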

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Igor Ilic 2025-04-07 18:03:36 +02:00 committed by GitHub
parent cd0d321eda
commit c4a6c94675
GPG key ID: B5690EEEBB952194

@@ -124,16 +124,32 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
                 self.payload = payload
                 self.vector = vector

-        pgvector_data_points = [
-            PGVectorDataPoint(
-                id=data_point.id,
-                vector=data_vectors[data_index],
-                payload=serialize_data(data_point.model_dump()),
-            )
-            for (data_index, data_point) in enumerate(data_points)
-        ]
-
         async with self.get_async_session() as session:
+            pgvector_data_points = []
+            for data_index, data_point in enumerate(data_points):
+                # Check to see if data should be updated or a new data item should be created
+                data_point_db = (
+                    await session.execute(
+                        select(PGVectorDataPoint).filter(PGVectorDataPoint.id == data_point.id)
+                    )
+                ).scalar_one_or_none()
+                # If the data point exists update it, if not create a new one
+                if data_point_db:
+                    data_point_db.id = data_point.id
+                    data_point_db.vector = data_vectors[data_index]
+                    data_point_db.payload = serialize_data(data_point.model_dump())
+                    pgvector_data_points.append(data_point_db)
+                else:
+                    pgvector_data_points.append(
+                        PGVectorDataPoint(
+                            id=data_point.id,
+                            vector=data_vectors[data_index],
+                            payload=serialize_data(data_point.model_dump()),
+                        )
+                    )
             session.add_all(pgvector_data_points)
             await session.commit()
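
A note on the design: the per-row SELECT keeps the upsert logic at the ORM level and works without any extra schema requirements. If batches grow large, PostgreSQL's native upsert could do the same work in one statement; a hedged sketch follows (not part of this PR, and it assumes the data-point table has a unique constraint or primary key on `id`):

```python
from sqlalchemy.dialects.postgresql import insert as pg_insert


async def upsert_rows(session, data_point_table, rows):
    """Sketch: batch upsert via INSERT ... ON CONFLICT (id) DO UPDATE.

    `rows` is a list of dicts with "id", "vector", and "payload" keys;
    `data_point_table` is the table (or mapped class) behind the collection.
    """
    insert_stmt = pg_insert(data_point_table).values(rows)
    upsert_stmt = insert_stmt.on_conflict_do_update(
        index_elements=["id"],  # conflict target: the id column
        set_={
            "vector": insert_stmt.excluded.vector,
            "payload": insert_stmt.excluded.payload,
        },
    )
    await session.execute(upsert_stmt)
    await session.commit()
```

The trade-off is that ON CONFLICT needs a uniqueness guarantee on `id` at the database level, while the SELECT-then-update approach merged here does not.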