Remove content field from DocProcessingStatus and update the MongoDB and PostgreSQL implementations

This commit is contained in:
yangdx 2025-07-29 14:52:45 +08:00
parent 8274ed52d1
commit 24c36d876c
3 changed files with 8 additions and 11 deletions

View file

@ -629,8 +629,6 @@ class DocStatus(str, Enum):
class DocProcessingStatus:
"""Document processing status data structure"""
content: str
"""Original content of the document"""
content_summary: str
"""First 100 chars of document content, used for preview"""
content_length: int

View file

@ -374,14 +374,13 @@ class MongoDocStatusStorage(DocStatusStorage):
result = await cursor.to_list()
return {
doc["_id"]: DocProcessingStatus(
content=doc["content"],
content_summary=doc.get("content_summary"),
content_length=doc["content_length"],
file_path=doc.get("file_path", doc["_id"]),
status=doc["status"],
created_at=doc.get("created_at"),
updated_at=doc.get("updated_at"),
chunks_count=doc.get("chunks_count", -1),
file_path=doc.get("file_path", doc["_id"]),
chunks_list=doc.get("chunks_list", []),
)
for doc in result

View file

@ -1673,7 +1673,7 @@ class PGDocStatusStorage(DocStatusStorage):
updated_at = self._format_datetime_with_timezone(result[0]["updated_at"])
return dict(
content=result[0]["content"],
# content=result[0]["content"],
content_length=result[0]["content_length"],
content_summary=result[0]["content_summary"],
status=result[0]["status"],
@ -1713,7 +1713,7 @@ class PGDocStatusStorage(DocStatusStorage):
processed_results.append(
{
"content": row["content"],
# "content": row["content"],
"content_length": row["content_length"],
"content_summary": row["content_summary"],
"status": row["status"],
@ -1762,7 +1762,7 @@ class PGDocStatusStorage(DocStatusStorage):
updated_at = self._format_datetime_with_timezone(element["updated_at"])
docs_by_status[element["id"]] = DocProcessingStatus(
content=element["content"],
# content=element["content"],
content_summary=element["content_summary"],
content_length=element["content_length"],
status=element["status"],
@ -1845,10 +1845,9 @@ class PGDocStatusStorage(DocStatusStorage):
# Modified SQL to include created_at, updated_at, and chunks_list in both INSERT and UPDATE operations
# All fields are updated from the input data in both INSERT and UPDATE cases
sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status,file_path,chunks_list,created_at,updated_at)
values($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content_summary,content_length,chunks_count,status,file_path,chunks_list,created_at,updated_at)
values($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
on conflict(id,workspace) do update set
content = EXCLUDED.content,
content_summary = EXCLUDED.content_summary,
content_length = EXCLUDED.content_length,
chunks_count = EXCLUDED.chunks_count,
@ -1868,7 +1867,7 @@ class PGDocStatusStorage(DocStatusStorage):
{
"workspace": self.db.workspace,
"id": k,
"content": v["content"],
# "content": v["content"],
"content_summary": v["content_summary"],
"content_length": v["content_length"],
"chunks_count": v["chunks_count"] if "chunks_count" in v else -1,
@ -3364,6 +3363,7 @@ TABLES = {
CONSTRAINT LIGHTRAG_LLM_CACHE_PK PRIMARY KEY (workspace, mode, id)
)"""
},
# The content column in LIGHTRAG_DOC_STATUS is deprecated; use the same column in LIGHTRAG_DOC_FULL instead
"LIGHTRAG_DOC_STATUS": {
"ddl": """CREATE TABLE LIGHTRAG_DOC_STATUS (
workspace varchar(255) NOT NULL,