Initial commit

2025-10-14 14:17:21 +08:00
commit ac715a8b88
35011 changed files with 3834178 additions and 0 deletions


@@ -0,0 +1,118 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import DatasetAutoDisableLog, DocumentSegment
from models.dataset import Document as DatasetDocument
@shared_task(queue="dataset")
def add_document_to_index_task(dataset_document_id: str):
"""
    Async add document to index
:param dataset_document_id:
    Usage: add_document_to_index_task.delay(dataset_document_id)
"""
logging.info(click.style("Start add document to index: {}".format(dataset_document_id), fg="green"))
start_at = time.perf_counter()
dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document_id).first()
if not dataset_document:
raise NotFound("Document not found")
if dataset_document.indexing_status != "completed":
return
indexing_cache_key = "document_{}_indexing".format(dataset_document.id)
try:
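        # fetch segments that are currently disabled but fully indexed, so they can be re-added to the index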
segments = (
db.session.query(DocumentSegment)
.filter(
DocumentSegment.document_id == dataset_document.id,
DocumentSegment.enabled == False,
DocumentSegment.status == "completed",
)
.order_by(DocumentSegment.position.asc())
.all()
)
documents = []
for segment in segments:
document = Document(
page_content=segment.content,
metadata={
"doc_id": segment.index_node_id,
"doc_hash": segment.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
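                # parent-child index: attach the child chunk documents so they are indexed under this parent segment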
child_chunks = segment.child_chunks
if child_chunks:
child_documents = []
for child_chunk in child_chunks:
child_document = ChildDocument(
page_content=child_chunk.content,
metadata={
"doc_id": child_chunk.index_node_id,
"doc_hash": child_chunk.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
child_documents.append(child_document)
document.children = child_documents
documents.append(document)
dataset = dataset_document.dataset
if not dataset:
raise Exception("Document has no dataset")
index_type = dataset.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
index_processor.load(dataset, documents)
# delete auto disable log
db.session.query(DatasetAutoDisableLog).filter(
DatasetAutoDisableLog.document_id == dataset_document.id
).delete()
# update segment to enable
db.session.query(DocumentSegment).filter(DocumentSegment.document_id == dataset_document.id).update(
{
DocumentSegment.enabled: True,
DocumentSegment.disabled_at: None,
DocumentSegment.disabled_by: None,
DocumentSegment.updated_at: datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
}
)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Document added to index: {} latency: {}".format(dataset_document.id, end_at - start_at), fg="green"
)
)
except Exception as e:
logging.exception("add document to index failed")
dataset_document.enabled = False
dataset_document.disabled_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
dataset_document.status = "error"
dataset_document.error = str(e)
db.session.commit()
finally:
redis_client.delete(indexing_cache_key)


@@ -0,0 +1,57 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.models.document import Document
from models.dataset import Dataset
from services.dataset_service import DatasetCollectionBindingService
@shared_task(queue="dataset")
def add_annotation_to_index_task(
annotation_id: str, question: str, tenant_id: str, app_id: str, collection_binding_id: str
):
"""
Add annotation to index.
:param annotation_id: annotation id
:param question: question
:param tenant_id: tenant id
:param app_id: app id
:param collection_binding_id: embedding binding id
    Usage: add_annotation_to_index_task.delay(annotation_id, question, tenant_id, app_id, collection_binding_id)
"""
logging.info(click.style("Start build index for annotation: {}".format(annotation_id), fg="green"))
start_at = time.perf_counter()
try:
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding_by_id_and_type(
collection_binding_id, "annotation"
)
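        # build a transient Dataset keyed by the app id so the vector store resolves to the shared annotation collection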
dataset = Dataset(
id=app_id,
tenant_id=tenant_id,
indexing_technique="high_quality",
embedding_model_provider=dataset_collection_binding.provider_name,
embedding_model=dataset_collection_binding.model_name,
collection_binding_id=dataset_collection_binding.id,
)
document = Document(
page_content=question, metadata={"annotation_id": annotation_id, "app_id": app_id, "doc_id": annotation_id}
)
vector = Vector(dataset, attributes=["doc_id", "annotation_id", "app_id"])
vector.create([document], duplicate_check=True)
end_at = time.perf_counter()
logging.info(
click.style(
"Build index successful for annotation: {} latency: {}".format(annotation_id, end_at - start_at),
fg="green",
)
)
except Exception:
logging.exception("Build index for annotation failed")


@@ -0,0 +1,90 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset
from models.model import App, AppAnnotationSetting, MessageAnnotation
from services.dataset_service import DatasetCollectionBindingService
@shared_task(queue="dataset")
def batch_import_annotations_task(job_id: str, content_list: list[dict], app_id: str, tenant_id: str, user_id: str):
"""
    Batch import annotations and add them to the index.
:param job_id: job_id
:param content_list: content list
:param app_id: app id
:param tenant_id: tenant id
:param user_id: user_id
"""
logging.info(click.style("Start batch import annotation: {}".format(job_id), fg="green"))
start_at = time.perf_counter()
indexing_cache_key = "app_annotation_batch_import_{}".format(str(job_id))
# get app info
app = db.session.query(App).filter(App.id == app_id, App.tenant_id == tenant_id, App.status == "normal").first()
if app:
try:
documents = []
for content in content_list:
annotation = MessageAnnotation(
app_id=app.id, content=content["answer"], question=content["question"], account_id=user_id
)
db.session.add(annotation)
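                # flush so the new annotation gets an id that can be stored in the index document metadata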
db.session.flush()
document = Document(
page_content=content["question"],
metadata={"annotation_id": annotation.id, "app_id": app_id, "doc_id": annotation.id},
)
documents.append(document)
            # if annotation reply is enabled, batch add the annotations' index
app_annotation_setting = (
db.session.query(AppAnnotationSetting).filter(AppAnnotationSetting.app_id == app_id).first()
)
if app_annotation_setting:
dataset_collection_binding = (
DatasetCollectionBindingService.get_dataset_collection_binding_by_id_and_type(
app_annotation_setting.collection_binding_id, "annotation"
)
)
if not dataset_collection_binding:
raise NotFound("App annotation setting not found")
dataset = Dataset(
id=app_id,
tenant_id=tenant_id,
indexing_technique="high_quality",
embedding_model_provider=dataset_collection_binding.provider_name,
embedding_model=dataset_collection_binding.model_name,
collection_binding_id=dataset_collection_binding.id,
)
vector = Vector(dataset, attributes=["doc_id", "annotation_id", "app_id"])
vector.create(documents, duplicate_check=True)
db.session.commit()
redis_client.setex(indexing_cache_key, 600, "completed")
end_at = time.perf_counter()
logging.info(
click.style(
"Build index successful for batch import annotation: {} latency: {}".format(
job_id, end_at - start_at
),
fg="green",
)
)
except Exception as e:
db.session.rollback()
redis_client.setex(indexing_cache_key, 600, "error")
indexing_error_msg_key = "app_annotation_batch_import_error_msg_{}".format(str(job_id))
redis_client.setex(indexing_error_msg_key, 600, str(e))
logging.exception("Build index for batch import annotations failed")


@@ -0,0 +1,41 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.datasource.vdb.vector_factory import Vector
from models.dataset import Dataset
from services.dataset_service import DatasetCollectionBindingService
@shared_task(queue="dataset")
def delete_annotation_index_task(annotation_id: str, app_id: str, tenant_id: str, collection_binding_id: str):
"""
Async delete annotation index task
"""
logging.info(click.style("Start delete app annotation index: {}".format(app_id), fg="green"))
start_at = time.perf_counter()
try:
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding_by_id_and_type(
collection_binding_id, "annotation"
)
dataset = Dataset(
id=app_id,
tenant_id=tenant_id,
indexing_technique="high_quality",
collection_binding_id=dataset_collection_binding.id,
)
try:
vector = Vector(dataset, attributes=["doc_id", "annotation_id", "app_id"])
vector.delete_by_metadata_field("annotation_id", annotation_id)
except Exception:
logging.exception("Delete annotation index failed when annotation deleted.")
end_at = time.perf_counter()
logging.info(
click.style("App annotations index deleted : {} latency: {}".format(app_id, end_at - start_at), fg="green")
)
    except Exception:
        logging.exception("Delete annotation index failed")


@@ -0,0 +1,68 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.datasource.vdb.vector_factory import Vector
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset
from models.model import App, AppAnnotationSetting, MessageAnnotation
@shared_task(queue="dataset")
def disable_annotation_reply_task(job_id: str, app_id: str, tenant_id: str):
"""
    Async disable annotation reply task
"""
logging.info(click.style("Start delete app annotations index: {}".format(app_id), fg="green"))
start_at = time.perf_counter()
# get app info
app = db.session.query(App).filter(App.id == app_id, App.tenant_id == tenant_id, App.status == "normal").first()
annotations_count = db.session.query(MessageAnnotation).filter(MessageAnnotation.app_id == app_id).count()
if not app:
raise NotFound("App not found")
app_annotation_setting = (
db.session.query(AppAnnotationSetting).filter(AppAnnotationSetting.app_id == app_id).first()
)
if not app_annotation_setting:
raise NotFound("App annotation setting not found")
disable_app_annotation_key = "disable_app_annotation_{}".format(str(app_id))
disable_app_annotation_job_key = "disable_app_annotation_job_{}".format(str(job_id))
try:
dataset = Dataset(
id=app_id,
tenant_id=tenant_id,
indexing_technique="high_quality",
collection_binding_id=app_annotation_setting.collection_binding_id,
)
try:
if annotations_count > 0:
vector = Vector(dataset, attributes=["doc_id", "annotation_id", "app_id"])
vector.delete_by_metadata_field("app_id", app_id)
except Exception:
logging.exception("Delete annotation index failed when annotation deleted.")
redis_client.setex(disable_app_annotation_job_key, 600, "completed")
# delete annotation setting
db.session.delete(app_annotation_setting)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style("App annotations index deleted : {} latency: {}".format(app_id, end_at - start_at), fg="green")
)
except Exception as e:
logging.exception("Annotation batch deleted index failed")
redis_client.setex(disable_app_annotation_job_key, 600, "error")
disable_app_annotation_error_key = "disable_app_annotation_error_{}".format(str(job_id))
redis_client.setex(disable_app_annotation_error_key, 600, str(e))
finally:
redis_client.delete(disable_app_annotation_key)


@@ -0,0 +1,102 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset
from models.model import App, AppAnnotationSetting, MessageAnnotation
from services.dataset_service import DatasetCollectionBindingService
@shared_task(queue="dataset")
def enable_annotation_reply_task(
job_id: str,
app_id: str,
user_id: str,
tenant_id: str,
score_threshold: float,
embedding_provider_name: str,
embedding_model_name: str,
):
"""
Async enable annotation reply task
"""
logging.info(click.style("Start add app annotation to index: {}".format(app_id), fg="green"))
start_at = time.perf_counter()
# get app info
app = db.session.query(App).filter(App.id == app_id, App.tenant_id == tenant_id, App.status == "normal").first()
if not app:
raise NotFound("App not found")
annotations = db.session.query(MessageAnnotation).filter(MessageAnnotation.app_id == app_id).all()
enable_app_annotation_key = "enable_app_annotation_{}".format(str(app_id))
enable_app_annotation_job_key = "enable_app_annotation_job_{}".format(str(job_id))
try:
documents = []
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_provider_name, embedding_model_name, "annotation"
)
annotation_setting = (
db.session.query(AppAnnotationSetting).filter(AppAnnotationSetting.app_id == app_id).first()
)
if annotation_setting:
annotation_setting.score_threshold = score_threshold
annotation_setting.collection_binding_id = dataset_collection_binding.id
annotation_setting.updated_user_id = user_id
annotation_setting.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.add(annotation_setting)
else:
new_app_annotation_setting = AppAnnotationSetting(
app_id=app_id,
score_threshold=score_threshold,
collection_binding_id=dataset_collection_binding.id,
created_user_id=user_id,
updated_user_id=user_id,
)
db.session.add(new_app_annotation_setting)
dataset = Dataset(
id=app_id,
tenant_id=tenant_id,
indexing_technique="high_quality",
embedding_model_provider=embedding_provider_name,
embedding_model=embedding_model_name,
collection_binding_id=dataset_collection_binding.id,
)
if annotations:
for annotation in annotations:
document = Document(
page_content=annotation.question,
metadata={"annotation_id": annotation.id, "app_id": app_id, "doc_id": annotation.id},
)
documents.append(document)
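            # drop any stale annotation vectors for this app before re-creating the index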
vector = Vector(dataset, attributes=["doc_id", "annotation_id", "app_id"])
try:
vector.delete_by_metadata_field("app_id", app_id)
except Exception as e:
logging.info(click.style("Delete annotation index error: {}".format(str(e)), fg="red"))
vector.create(documents)
db.session.commit()
redis_client.setex(enable_app_annotation_job_key, 600, "completed")
end_at = time.perf_counter()
logging.info(
click.style("App annotations added to index: {} latency: {}".format(app_id, end_at - start_at), fg="green")
)
except Exception as e:
logging.exception("Annotation batch created index failed")
redis_client.setex(enable_app_annotation_job_key, 600, "error")
enable_app_annotation_error_key = "enable_app_annotation_error_{}".format(str(job_id))
redis_client.setex(enable_app_annotation_error_key, 600, str(e))
db.session.rollback()
finally:
redis_client.delete(enable_app_annotation_key)


@@ -0,0 +1,58 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.models.document import Document
from models.dataset import Dataset
from services.dataset_service import DatasetCollectionBindingService
@shared_task(queue="dataset")
def update_annotation_to_index_task(
annotation_id: str, question: str, tenant_id: str, app_id: str, collection_binding_id: str
):
"""
Update annotation to index.
:param annotation_id: annotation id
:param question: question
:param tenant_id: tenant id
:param app_id: app id
:param collection_binding_id: embedding binding id
    Usage: update_annotation_to_index_task.delay(annotation_id, question, tenant_id, app_id, collection_binding_id)
"""
logging.info(click.style("Start update index for annotation: {}".format(annotation_id), fg="green"))
start_at = time.perf_counter()
try:
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding_by_id_and_type(
collection_binding_id, "annotation"
)
dataset = Dataset(
id=app_id,
tenant_id=tenant_id,
indexing_technique="high_quality",
embedding_model_provider=dataset_collection_binding.provider_name,
embedding_model=dataset_collection_binding.model_name,
collection_binding_id=dataset_collection_binding.id,
)
document = Document(
page_content=question, metadata={"annotation_id": annotation_id, "app_id": app_id, "doc_id": annotation_id}
)
vector = Vector(dataset, attributes=["doc_id", "annotation_id", "app_id"])
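        # an update is performed as a delete of the old annotation vector followed by a re-insert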
vector.delete_by_metadata_field("annotation_id", annotation_id)
vector.add_texts([document])
end_at = time.perf_counter()
logging.info(
click.style(
"Build index successful for annotation: {} latency: {}".format(annotation_id, end_at - start_at),
fg="green",
)
)
except Exception:
logging.exception("Build index for annotation failed")


@@ -0,0 +1,76 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.tools.utils.web_reader_tool import get_image_upload_file_ids
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, DocumentSegment
from models.model import UploadFile
@shared_task(queue="dataset")
def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form: str, file_ids: list[str]):
"""
Clean document when document deleted.
:param document_ids: document ids
:param dataset_id: dataset id
:param doc_form: doc_form
:param file_ids: file ids
    Usage: batch_clean_document_task.delay(document_ids, dataset_id, doc_form, file_ids)
"""
logging.info(click.style("Start batch clean documents when documents deleted", fg="green"))
start_at = time.perf_counter()
try:
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
raise Exception("Document has no dataset")
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id.in_(document_ids)).all()
# check segment is exist
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
image_upload_file_ids = get_image_upload_file_ids(segment.content)
for upload_file_id in image_upload_file_ids:
image_file = db.session.query(UploadFile).filter(UploadFile.id == upload_file_id).first()
try:
if image_file and image_file.key:
storage.delete(image_file.key)
except Exception:
                        logging.exception(
                            "Delete image_files failed when storage deleted, image_upload_file_id: {}".format(
                                upload_file_id
                            )
                        )
                    if image_file:
                        db.session.delete(image_file)
db.session.delete(segment)
db.session.commit()
if file_ids:
files = db.session.query(UploadFile).filter(UploadFile.id.in_(file_ids)).all()
for file in files:
try:
storage.delete(file.key)
except Exception:
logging.exception("Delete file failed when document deleted, file_id: {}".format(file.id))
db.session.delete(file)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Cleaned documents when documents deleted latency: {}".format(end_at - start_at),
fg="green",
)
)
except Exception:
logging.exception("Cleaned documents when documents deleted failed")


@@ -0,0 +1,129 @@
import datetime
import logging
import time
import uuid
import click
from celery import shared_task # type: ignore
from sqlalchemy import func, select
from sqlalchemy.orm import Session
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from libs import helper
from models.dataset import Dataset, Document, DocumentSegment
from services.vector_service import VectorService
@shared_task(queue="dataset")
def batch_create_segment_to_index_task(
job_id: str,
content: list,
dataset_id: str,
document_id: str,
tenant_id: str,
user_id: str,
):
"""
Async batch create segment to index
:param job_id:
:param content:
:param dataset_id:
:param document_id:
:param tenant_id:
:param user_id:
    Usage: batch_create_segment_to_index_task.delay(job_id, content, dataset_id, document_id, tenant_id, user_id)
"""
logging.info(click.style("Start batch create segment jobId: {}".format(job_id), fg="green"))
start_at = time.perf_counter()
indexing_cache_key = "segment_batch_import_{}".format(job_id)
try:
with Session(db.engine) as session:
dataset = session.get(Dataset, dataset_id)
if not dataset:
raise ValueError("Dataset not exist.")
dataset_document = session.get(Document, document_id)
if not dataset_document:
raise ValueError("Document not exist.")
if (
not dataset_document.enabled
or dataset_document.archived
or dataset_document.indexing_status != "completed"
):
raise ValueError("Document is not available.")
document_segments = []
embedding_model = None
if dataset.indexing_technique == "high_quality":
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=dataset.tenant_id,
provider=dataset.embedding_model_provider,
model_type=ModelType.TEXT_EMBEDDING,
model=dataset.embedding_model,
)
            word_count_change = 0
if embedding_model:
tokens_list = embedding_model.get_text_embedding_num_tokens(
texts=[segment["content"] for segment in content]
)
else:
tokens_list = [0] * len(content)
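            # build a DocumentSegment row for each imported chunk, appending it after the current max position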
            for segment, tokens in zip(content, tokens_list):
                segment_content = segment["content"]
                doc_id = str(uuid.uuid4())
                segment_hash = helper.generate_text_hash(segment_content)
max_position = (
db.session.query(func.max(DocumentSegment.position))
.filter(DocumentSegment.document_id == dataset_document.id)
.scalar()
)
segment_document = DocumentSegment(
tenant_id=tenant_id,
dataset_id=dataset_id,
document_id=document_id,
index_node_id=doc_id,
index_node_hash=segment_hash,
position=max_position + 1 if max_position else 1,
                    content=segment_content,
                    word_count=len(segment_content),
tokens=tokens,
created_by=user_id,
indexing_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
status="completed",
completed_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
)
if dataset_document.doc_form == "qa_model":
segment_document.answer = segment["answer"]
segment_document.word_count += len(segment["answer"])
word_count_change += segment_document.word_count
db.session.add(segment_document)
document_segments.append(segment_document)
# update document word count
dataset_document.word_count += word_count_change
db.session.add(dataset_document)
# add index to db
VectorService.create_segments_vector(None, document_segments, dataset, dataset_document.doc_form)
db.session.commit()
redis_client.setex(indexing_cache_key, 600, "completed")
end_at = time.perf_counter()
logging.info(
click.style(
"Segment batch created job: {} latency: {}".format(job_id, end_at - start_at),
fg="green",
)
)
except Exception:
logging.exception("Segments batch created index failed")
redis_client.setex(indexing_cache_key, 600, "error")


@@ -0,0 +1,119 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.tools.utils.rag_web_reader import get_image_upload_file_ids
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import (
AppDatasetJoin,
Dataset,
DatasetProcessRule,
DatasetQuery,
Document,
DocumentSegment,
)
from models.model import UploadFile
@shared_task(queue="dataset")
def clean_dataset_task(
dataset_id: str,
tenant_id: str,
indexing_technique: str,
index_struct: str,
collection_binding_id: str,
doc_form: str,
):
"""
Clean dataset when dataset deleted.
:param dataset_id: dataset id
:param tenant_id: tenant id
:param indexing_technique: indexing technique
:param index_struct: index struct dict
:param collection_binding_id: collection binding id
:param doc_form: dataset form
Usage: clean_dataset_task.delay(dataset_id, tenant_id, indexing_technique, index_struct)
"""
logging.info(click.style("Start clean dataset when dataset deleted: {}".format(dataset_id), fg="green"))
start_at = time.perf_counter()
try:
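        # the dataset row is already deleted, so rebuild a transient Dataset from the passed attributes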
dataset = Dataset(
id=dataset_id,
tenant_id=tenant_id,
indexing_technique=indexing_technique,
index_struct=index_struct,
collection_binding_id=collection_binding_id,
)
documents = db.session.query(Document).filter(Document.dataset_id == dataset_id).all()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.dataset_id == dataset_id).all()
        if not documents:
logging.info(click.style("No documents found for dataset: {}".format(dataset_id), fg="green"))
else:
logging.info(click.style("Cleaning documents for dataset: {}".format(dataset_id), fg="green"))
# Specify the index type before initializing the index processor
if doc_form is None:
raise ValueError("Index type must be specified.")
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
index_processor.clean(dataset, None, with_keywords=True, delete_child_chunks=True)
for document in documents:
db.session.delete(document)
for segment in segments:
image_upload_file_ids = get_image_upload_file_ids(segment.content)
for upload_file_id in image_upload_file_ids:
image_file = db.session.query(UploadFile).filter(UploadFile.id == upload_file_id).first()
if image_file is None:
continue
try:
storage.delete(image_file.key)
except Exception:
                        logging.exception(
                            "Delete image_files failed when storage deleted, image_upload_file_id: {}".format(
                                upload_file_id
                            )
                        )
db.session.delete(image_file)
db.session.delete(segment)
db.session.query(DatasetProcessRule).filter(DatasetProcessRule.dataset_id == dataset_id).delete()
db.session.query(DatasetQuery).filter(DatasetQuery.dataset_id == dataset_id).delete()
db.session.query(AppDatasetJoin).filter(AppDatasetJoin.dataset_id == dataset_id).delete()
# delete files
if documents:
for document in documents:
try:
if document.data_source_type == "upload_file":
if document.data_source_info:
data_source_info = document.data_source_info_dict
if data_source_info and "upload_file_id" in data_source_info:
file_id = data_source_info["upload_file_id"]
file = (
db.session.query(UploadFile)
.filter(UploadFile.tenant_id == document.tenant_id, UploadFile.id == file_id)
.first()
)
if not file:
continue
storage.delete(file.key)
db.session.delete(file)
except Exception:
continue
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Cleaned dataset when dataset deleted: {} latency: {}".format(dataset_id, end_at - start_at), fg="green"
)
)
except Exception:
logging.exception("Cleaned dataset when dataset deleted failed")


@@ -0,0 +1,78 @@
import logging
import time
from typing import Optional
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.tools.utils.rag_web_reader import get_image_upload_file_ids
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, DocumentSegment
from models.model import UploadFile
@shared_task(queue="dataset")
def clean_document_task(document_id: str, dataset_id: str, doc_form: str, file_id: Optional[str]):
"""
Clean document when document deleted.
:param document_id: document id
:param dataset_id: dataset id
:param doc_form: doc_form
:param file_id: file id
Usage: clean_document_task.delay(document_id, dataset_id)
"""
logging.info(click.style("Start clean document when document deleted: {}".format(document_id), fg="green"))
start_at = time.perf_counter()
try:
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
raise Exception("Document has no dataset")
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
# check segment is exist
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
image_upload_file_ids = get_image_upload_file_ids(segment.content)
for upload_file_id in image_upload_file_ids:
image_file = db.session.query(UploadFile).filter(UploadFile.id == upload_file_id).first()
if image_file is None:
continue
try:
storage.delete(image_file.key)
except Exception:
                        logging.exception(
                            "Delete image_files failed when storage deleted, image_upload_file_id: {}".format(
                                upload_file_id
                            )
                        )
db.session.delete(image_file)
db.session.delete(segment)
db.session.commit()
if file_id:
file = db.session.query(UploadFile).filter(UploadFile.id == file_id).first()
if file:
try:
storage.delete(file.key)
except Exception:
logging.exception("Delete file failed when document deleted, file_id: {}".format(file_id))
db.session.delete(file)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Cleaned document when document deleted: {} latency: {}".format(document_id, end_at - start_at),
fg="green",
)
)
except Exception:
logging.exception("Cleaned document when document deleted failed")


@@ -0,0 +1,55 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, Document, DocumentSegment
@shared_task(queue="dataset")
def clean_notion_document_task(document_ids: list[str], dataset_id: str):
"""
Clean document when document deleted.
:param document_ids: document ids
:param dataset_id: dataset id
Usage: clean_notion_document_task.delay(document_ids, dataset_id)
"""
logging.info(
click.style("Start clean document when import form notion document deleted: {}".format(dataset_id), fg="green")
)
start_at = time.perf_counter()
try:
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
raise Exception("Document has no dataset")
index_type = dataset.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
for document_id in document_ids:
            document = db.session.query(Document).filter(Document.id == document_id).first()
            if document:
                db.session.delete(document)
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
index_node_ids = [segment.index_node_id for segment in segments]
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Clean document when import form notion document deleted end :: {} latency: {}".format(
dataset_id, end_at - start_at
),
fg="green",
)
)
except Exception:
logging.exception("Cleaned document when import form notion document deleted failed")


@@ -0,0 +1,95 @@
import datetime
import logging
import time
from typing import Optional
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import DocumentSegment
@shared_task(queue="dataset")
def create_segment_to_index_task(segment_id: str, keywords: Optional[list[str]] = None):
"""
Async create segment to index
:param segment_id:
:param keywords:
Usage: create_segment_to_index_task.delay(segment_id)
"""
logging.info(click.style("Start create segment to index: {}".format(segment_id), fg="green"))
start_at = time.perf_counter()
segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()
if not segment:
raise NotFound("Segment not found")
if segment.status != "waiting":
return
indexing_cache_key = "segment_{}_indexing".format(segment.id)
try:
# update segment status to indexing
update_params = {
DocumentSegment.status: "indexing",
DocumentSegment.indexing_at: datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
}
DocumentSegment.query.filter_by(id=segment.id).update(update_params)
db.session.commit()
document = Document(
page_content=segment.content,
metadata={
"doc_id": segment.index_node_id,
"doc_hash": segment.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
dataset = segment.dataset
if not dataset:
logging.info(click.style("Segment {} has no dataset, pass.".format(segment.id), fg="cyan"))
return
dataset_document = segment.document
if not dataset_document:
logging.info(click.style("Segment {} has no document, pass.".format(segment.id), fg="cyan"))
return
if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
logging.info(click.style("Segment {} document status is invalid, pass.".format(segment.id), fg="cyan"))
return
index_type = dataset.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
index_processor.load(dataset, [document])
# update segment to completed
update_params = {
DocumentSegment.status: "completed",
DocumentSegment.completed_at: datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
}
DocumentSegment.query.filter_by(id=segment.id).update(update_params)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style("Segment created to index: {} latency: {}".format(segment.id, end_at - start_at), fg="green")
)
except Exception as e:
logging.exception("create segment to index failed")
segment.enabled = False
segment.disabled_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
segment.status = "error"
segment.error = str(e)
db.session.commit()
finally:
redis_client.delete(indexing_cache_key)


@@ -0,0 +1,169 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
@shared_task(queue="dataset")
def deal_dataset_vector_index_task(dataset_id: str, action: str):
"""
    Async handle dataset vector index (add / update / remove)
:param dataset_id: dataset_id
:param action: action
Usage: deal_dataset_vector_index_task.delay(dataset_id, action)
"""
logging.info(click.style("Start deal dataset vector index: {}".format(dataset_id), fg="green"))
start_at = time.perf_counter()
try:
dataset = Dataset.query.filter_by(id=dataset_id).first()
if not dataset:
raise Exception("Dataset not found")
index_type = dataset.doc_form or IndexType.PARAGRAPH_INDEX
index_processor = IndexProcessorFactory(index_type).init_index_processor()
if action == "remove":
index_processor.clean(dataset, None, with_keywords=False)
elif action == "add":
dataset_documents = (
db.session.query(DatasetDocument)
.filter(
DatasetDocument.dataset_id == dataset_id,
DatasetDocument.indexing_status == "completed",
DatasetDocument.enabled == True,
DatasetDocument.archived == False,
)
.all()
)
if dataset_documents:
dataset_documents_ids = [doc.id for doc in dataset_documents]
db.session.query(DatasetDocument).filter(DatasetDocument.id.in_(dataset_documents_ids)).update(
{"indexing_status": "indexing"}, synchronize_session=False
)
db.session.commit()
for dataset_document in dataset_documents:
try:
# add from vector index
segments = (
db.session.query(DocumentSegment)
.filter(DocumentSegment.document_id == dataset_document.id, DocumentSegment.enabled == True)
.order_by(DocumentSegment.position.asc())
.all()
)
if segments:
documents = []
for segment in segments:
document = Document(
page_content=segment.content,
metadata={
"doc_id": segment.index_node_id,
"doc_hash": segment.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
documents.append(document)
# save vector index
index_processor.load(dataset, documents, with_keywords=False)
db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
{"indexing_status": "completed"}, synchronize_session=False
)
db.session.commit()
except Exception as e:
db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
{"indexing_status": "error", "error": str(e)}, synchronize_session=False
)
db.session.commit()
elif action == "update":
dataset_documents = (
db.session.query(DatasetDocument)
.filter(
DatasetDocument.dataset_id == dataset_id,
DatasetDocument.indexing_status == "completed",
DatasetDocument.enabled == True,
DatasetDocument.archived == False,
)
.all()
)
# add new index
if dataset_documents:
# update document status
dataset_documents_ids = [doc.id for doc in dataset_documents]
db.session.query(DatasetDocument).filter(DatasetDocument.id.in_(dataset_documents_ids)).update(
{"indexing_status": "indexing"}, synchronize_session=False
)
db.session.commit()
# clean index
index_processor.clean(dataset, None, with_keywords=False, delete_child_chunks=False)
for dataset_document in dataset_documents:
# update from vector index
try:
segments = (
db.session.query(DocumentSegment)
.filter(DocumentSegment.document_id == dataset_document.id, DocumentSegment.enabled == True)
.order_by(DocumentSegment.position.asc())
.all()
)
if segments:
documents = []
for segment in segments:
document = Document(
page_content=segment.content,
metadata={
"doc_id": segment.index_node_id,
"doc_hash": segment.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
child_chunks = segment.child_chunks
if child_chunks:
child_documents = []
for child_chunk in child_chunks:
child_document = ChildDocument(
page_content=child_chunk.content,
metadata={
"doc_id": child_chunk.index_node_id,
"doc_hash": child_chunk.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
child_documents.append(child_document)
document.children = child_documents
documents.append(document)
# save vector index
index_processor.load(dataset, documents, with_keywords=False)
db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
{"indexing_status": "completed"}, synchronize_session=False
)
db.session.commit()
except Exception as e:
db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
{"indexing_status": "error", "error": str(e)}, synchronize_session=False
)
db.session.commit()
else:
# clean collection
index_processor.clean(dataset, None, with_keywords=False, delete_child_chunks=False)
end_at = time.perf_counter()
logging.info(
click.style("Deal dataset vector index: {} latency: {}".format(dataset_id, end_at - start_at), fg="green")
)
except Exception:
logging.exception("Deal dataset vector index failed")


@@ -0,0 +1,26 @@
import logging
from celery import shared_task # type: ignore
from extensions.ext_database import db
from models.account import Account
from services.billing_service import BillingService
from tasks.mail_account_deletion_task import send_deletion_success_task
logger = logging.getLogger(__name__)
@shared_task(queue="dataset")
def delete_account_task(account_id):
account = db.session.query(Account).filter(Account.id == account_id).first()
try:
BillingService.delete_account(account_id)
    except Exception:
logger.exception(f"Failed to delete account {account_id} from billing service.")
raise
if not account:
logger.error(f"Account {account_id} not found.")
return
# send success email
send_deletion_success_task.delay(account.email)


@@ -0,0 +1,43 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, Document
@shared_task(queue="dataset")
def delete_segment_from_index_task(index_node_ids: list, dataset_id: str, document_id: str):
"""
    Async remove segments from index
:param index_node_ids:
:param dataset_id:
:param document_id:
    Usage: delete_segment_from_index_task.delay(index_node_ids, dataset_id, document_id)
"""
logging.info(click.style("Start delete segment from index", fg="green"))
start_at = time.perf_counter()
try:
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
return
dataset_document = db.session.query(Document).filter(Document.id == document_id).first()
if not dataset_document:
return
if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
return
index_type = dataset_document.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
end_at = time.perf_counter()
logging.info(click.style("Segment deleted from index latency: {}".format(end_at - start_at), fg="green"))
except Exception:
logging.exception("delete segment from index failed")


@@ -0,0 +1,64 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import DocumentSegment
@shared_task(queue="dataset")
def disable_segment_from_index_task(segment_id: str):
"""
Async disable segment from index
:param segment_id:
Usage: disable_segment_from_index_task.delay(segment_id)
"""
logging.info(click.style("Start disable segment from index: {}".format(segment_id), fg="green"))
start_at = time.perf_counter()
segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()
if not segment:
raise NotFound("Segment not found")
if segment.status != "completed":
raise NotFound("Segment is not completed , disable action is not allowed.")
indexing_cache_key = "segment_{}_indexing".format(segment.id)
try:
dataset = segment.dataset
if not dataset:
logging.info(click.style("Segment {} has no dataset, pass.".format(segment.id), fg="cyan"))
return
dataset_document = segment.document
if not dataset_document:
logging.info(click.style("Segment {} has no document, pass.".format(segment.id), fg="cyan"))
return
if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
logging.info(click.style("Segment {} document status is invalid, pass.".format(segment.id), fg="cyan"))
return
index_type = dataset_document.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
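        # remove only this segment's node from the vector index; the segment row itself is not deleted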
index_processor.clean(dataset, [segment.index_node_id])
end_at = time.perf_counter()
logging.info(
click.style("Segment removed from index: {} latency: {}".format(segment.id, end_at - start_at), fg="green")
)
except Exception:
logging.exception("remove segment from index failed")
segment.enabled = True
db.session.commit()
finally:
redis_client.delete(indexing_cache_key)


@@ -0,0 +1,76 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
@shared_task(queue="dataset")
def disable_segments_from_index_task(segment_ids: list, dataset_id: str, document_id: str):
"""
Async disable segments from index
    :param segment_ids:
    :param dataset_id:
    :param document_id:
    Usage: disable_segments_from_index_task.delay(segment_ids, dataset_id, document_id)
"""
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
logging.info(click.style("Dataset {} not found, pass.".format(dataset_id), fg="cyan"))
return
dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first()
if not dataset_document:
logging.info(click.style("Document {} not found, pass.".format(document_id), fg="cyan"))
return
if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
logging.info(click.style("Document {} status is invalid, pass.".format(document_id), fg="cyan"))
return
# sync index processor
index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()
segments = (
db.session.query(DocumentSegment)
.filter(
DocumentSegment.id.in_(segment_ids),
DocumentSegment.dataset_id == dataset_id,
DocumentSegment.document_id == document_id,
)
.all()
)
if not segments:
return
try:
index_node_ids = [segment.index_node_id for segment in segments]
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=False)
end_at = time.perf_counter()
logging.info(click.style("Segments removed from index latency: {}".format(end_at - start_at), fg="green"))
except Exception:
        # revert the segments back to enabled when index cleanup fails
db.session.query(DocumentSegment).filter(
DocumentSegment.id.in_(segment_ids),
DocumentSegment.dataset_id == dataset_id,
DocumentSegment.document_id == document_id,
).update(
{
"disabled_at": None,
"disabled_by": None,
"enabled": True,
}
)
db.session.commit()
finally:
for segment in segments:
indexing_cache_key = "segment_{}_indexing".format(segment.id)
redis_client.delete(indexing_cache_key)


@@ -0,0 +1,112 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.indexing_runner import DocumentIsPausedError, IndexingRunner
from core.rag.extractor.notion_extractor import NotionExtractor
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, Document, DocumentSegment
from models.source import DataSourceOauthBinding
@shared_task(queue="dataset")
def document_indexing_sync_task(dataset_id: str, document_id: str):
"""
    Async sync a Notion-imported document and re-run indexing
:param dataset_id:
:param document_id:
Usage: document_indexing_sync_task.delay(dataset_id, document_id)
"""
logging.info(click.style("Start sync document: {}".format(document_id), fg="green"))
start_at = time.perf_counter()
document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
if not document:
raise NotFound("Document not found")
data_source_info = document.data_source_info_dict
if document.data_source_type == "notion_import":
if (
not data_source_info
or "notion_page_id" not in data_source_info
or "notion_workspace_id" not in data_source_info
):
raise ValueError("no notion page found")
workspace_id = data_source_info["notion_workspace_id"]
page_id = data_source_info["notion_page_id"]
page_type = data_source_info["type"]
page_edited_time = data_source_info["last_edited_time"]
data_source_binding = DataSourceOauthBinding.query.filter(
db.and_(
DataSourceOauthBinding.tenant_id == document.tenant_id,
DataSourceOauthBinding.provider == "notion",
DataSourceOauthBinding.disabled == False,
DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
)
).first()
if not data_source_binding:
raise ValueError("Data source binding not found.")
loader = NotionExtractor(
notion_workspace_id=workspace_id,
notion_obj_id=page_id,
notion_page_type=page_type,
notion_access_token=data_source_binding.access_token,
tenant_id=document.tenant_id,
)
last_edited_time = loader.get_notion_last_edited_time()
# check the page is updated
if last_edited_time != page_edited_time:
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.commit()
# delete all document segment and index
try:
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
raise Exception("Dataset not found")
index_type = document.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
end_at = time.perf_counter()
logging.info(
click.style(
"Cleaned document when document update data source or process rule: {} latency: {}".format(
document_id, end_at - start_at
),
fg="green",
)
)
except Exception:
logging.exception("Cleaned document when document update data source or process rule failed")
try:
indexing_runner = IndexingRunner()
indexing_runner.run([document])
end_at = time.perf_counter()
logging.info(
click.style("update document: {} latency: {}".format(document.id, end_at - start_at), fg="green")
)
except DocumentIsPausedError as ex:
logging.info(click.style(str(ex), fg="yellow"))
            except Exception:
                logging.exception("Sync document to index failed")


@@ -0,0 +1,80 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from configs import dify_config
from core.indexing_runner import DocumentIsPausedError, IndexingRunner
from extensions.ext_database import db
from models.dataset import Dataset, Document
from services.feature_service import FeatureService
@shared_task(queue="dataset")
def document_indexing_task(dataset_id: str, document_ids: list):
"""
Async process document
:param dataset_id:
:param document_ids:
Usage: document_indexing_task.delay(dataset_id, document_id)
"""
documents = []
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
logging.info(click.style("Dataset is not found: {}".format(dataset_id), fg="yellow"))
return
# check document limit
features = FeatureService.get_features(dataset.tenant_id)
try:
if features.billing.enabled:
vector_space = features.vector_space
count = len(document_ids)
batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
if count > batch_upload_limit:
raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
if 0 < vector_space.limit <= vector_space.size:
raise ValueError(
"Your total number of documents plus the number of uploads have over the limit of "
"your subscription."
)
except Exception as e:
for document_id in document_ids:
document = (
db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
)
if document:
document.indexing_status = "error"
document.error = str(e)
document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
return
for document_id in document_ids:
logging.info(click.style("Start process document: {}".format(document_id), fg="green"))
document = (
db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
)
if document:
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
documents.append(document)
db.session.add(document)
db.session.commit()
try:
indexing_runner = IndexingRunner()
indexing_runner.run(documents)
end_at = time.perf_counter()
logging.info(click.style("Processed dataset: {} latency: {}".format(dataset_id, end_at - start_at), fg="green"))
except DocumentIsPausedError as ex:
logging.info(click.style(str(ex), fg="yellow"))
    except Exception:
        logging.exception("Document indexing failed")


@@ -0,0 +1,75 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.indexing_runner import DocumentIsPausedError, IndexingRunner
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, Document, DocumentSegment
@shared_task(queue="dataset")
def document_indexing_update_task(dataset_id: str, document_id: str):
"""
Async update document
:param dataset_id:
:param document_id:
Usage: document_indexing_update_task.delay(dataset_id, document_id)
"""
logging.info(click.style("Start update document: {}".format(document_id), fg="green"))
start_at = time.perf_counter()
document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
if not document:
raise NotFound("Document not found")
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.commit()
# delete all document segment and index
try:
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
raise Exception("Dataset not found")
index_type = document.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Cleaned document when document update data source or process rule: {} latency: {}".format(
document_id, end_at - start_at
),
fg="green",
)
)
except Exception:
logging.exception("Cleaned document when document update data source or process rule failed")
try:
indexing_runner = IndexingRunner()
indexing_runner.run([document])
end_at = time.perf_counter()
logging.info(click.style("update document: {} latency: {}".format(document.id, end_at - start_at), fg="green"))
except DocumentIsPausedError as ex:
logging.info(click.style(str(ex), fg="yellow"))
    except Exception:
        logging.exception("Document indexing update failed")


@@ -0,0 +1,96 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from configs import dify_config
from core.indexing_runner import DocumentIsPausedError, IndexingRunner
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, Document, DocumentSegment
from services.feature_service import FeatureService
@shared_task(queue="dataset")
def duplicate_document_indexing_task(dataset_id: str, document_ids: list):
"""
Async process document
:param dataset_id:
:param document_ids:
Usage: duplicate_document_indexing_task.delay(dataset_id, document_id)
"""
documents = []
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if dataset is None:
raise ValueError("Dataset not found")
# check document limit
features = FeatureService.get_features(dataset.tenant_id)
try:
if features.billing.enabled:
vector_space = features.vector_space
count = len(document_ids)
batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
if count > batch_upload_limit:
raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
if 0 < vector_space.limit <= vector_space.size:
raise ValueError(
"Your total number of documents plus the number of uploads have over the limit of "
"your subscription."
)
except Exception as e:
for document_id in document_ids:
document = (
db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
)
if document:
document.indexing_status = "error"
document.error = str(e)
document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
return
for document_id in document_ids:
logging.info(click.style("Start process document: {}".format(document_id), fg="green"))
document = (
db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
)
if document:
# clean old data
index_type = document.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
documents.append(document)
db.session.add(document)
db.session.commit()
try:
indexing_runner = IndexingRunner()
indexing_runner.run(documents)
end_at = time.perf_counter()
logging.info(click.style("Processed dataset: {} latency: {}".format(dataset_id, end_at - start_at), fg="green"))
except DocumentIsPausedError as ex:
logging.info(click.style(str(ex), fg="yellow"))
except Exception:
logging.exception("duplicate document indexing failed")
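# A minimal dispatch sketch (ids below are placeholders, assuming a Celery worker is
# consuming the "dataset" queue):
#
# duplicate_document_indexing_task.delay("dataset-uuid", ["document-uuid-1", "document-uuid-2"])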

View File

@@ -0,0 +1,96 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import DocumentSegment
@shared_task(queue="dataset")
def enable_segment_to_index_task(segment_id: str):
"""
Async enable segment to index
:param segment_id:
Usage: enable_segment_to_index_task.delay(segment_id)
"""
logging.info(click.style("Start enable segment to index: {}".format(segment_id), fg="green"))
start_at = time.perf_counter()
segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()
if not segment:
raise NotFound("Segment not found")
if segment.status != "completed":
raise NotFound("Segment is not completed, enable action is not allowed.")
indexing_cache_key = "segment_{}_indexing".format(segment.id)
try:
document = Document(
page_content=segment.content,
metadata={
"doc_id": segment.index_node_id,
"doc_hash": segment.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
dataset = segment.dataset
if not dataset:
logging.info(click.style("Segment {} has no dataset, pass.".format(segment.id), fg="cyan"))
return
dataset_document = segment.document
if not dataset_document:
logging.info(click.style("Segment {} has no document, pass.".format(segment.id), fg="cyan"))
return
if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
logging.info(click.style("Segment {} document status is invalid, pass.".format(segment.id), fg="cyan"))
return
index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()
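# for parent-child indexes each child chunk carries its own doc_id / doc_hash so the
# index processor can index the children individually under the parent segment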
if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
child_chunks = segment.child_chunks
if child_chunks:
child_documents = []
for child_chunk in child_chunks:
child_document = ChildDocument(
page_content=child_chunk.content,
metadata={
"doc_id": child_chunk.index_node_id,
"doc_hash": child_chunk.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
},
)
child_documents.append(child_document)
document.children = child_documents
# save vector index
index_processor.load(dataset, [document])
end_at = time.perf_counter()
logging.info(
click.style("Segment enabled to index: {} latency: {}".format(segment.id, end_at - start_at), fg="green")
)
except Exception as e:
logging.exception("enable segment to index failed")
segment.enabled = False
segment.disabled_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
segment.status = "error"
segment.error = str(e)
db.session.commit()
finally:
redis_client.delete(indexing_cache_key)

View File

@@ -0,0 +1,108 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
@shared_task(queue="dataset")
def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_id: str):
"""
Async enable segments to index
:param segment_ids:
:param dataset_id:
:param document_id:
Usage: enable_segments_to_index_task.delay(segment_ids, dataset_id, document_id)
"""
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
logging.info(click.style("Dataset {} not found, pass.".format(dataset_id), fg="cyan"))
return
dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first()
if not dataset_document:
logging.info(click.style("Document {} not found, pass.".format(document_id), fg="cyan"))
return
if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
logging.info(click.style("Document {} status is invalid, pass.".format(document_id), fg="cyan"))
return
# sync index processor
index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()
segments = (
db.session.query(DocumentSegment)
.filter(
DocumentSegment.id.in_(segment_ids),
DocumentSegment.dataset_id == dataset_id,
DocumentSegment.document_id == document_id,
)
.all()
)
if not segments:
return
try:
documents = []
for segment in segments:
document = Document(
page_content=segment.content,
metadata={
"doc_id": segment.index_node_id,
"doc_hash": segment.index_node_hash,
"document_id": document_id,
"dataset_id": dataset_id,
},
)
if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
child_chunks = segment.child_chunks
if child_chunks:
child_documents = []
for child_chunk in child_chunks:
child_document = ChildDocument(
page_content=child_chunk.content,
metadata={
"doc_id": child_chunk.index_node_id,
"doc_hash": child_chunk.index_node_hash,
"document_id": document_id,
"dataset_id": dataset_id,
},
)
child_documents.append(child_document)
document.children = child_documents
documents.append(document)
# save vector index
index_processor.load(dataset, documents)
end_at = time.perf_counter()
logging.info(click.style("Segments enabled to index latency: {}".format(end_at - start_at), fg="green"))
except Exception as e:
logging.exception("enable segments to index failed")
# update segment error msg
db.session.query(DocumentSegment).filter(
DocumentSegment.id.in_(segment_ids),
DocumentSegment.dataset_id == dataset_id,
DocumentSegment.document_id == document_id,
).update(
{
"error": str(e),
"status": "error",
"disabled_at": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
"enabled": False,
}
)
db.session.commit()
finally:
for segment in segments:
indexing_cache_key = "segment_{}_indexing".format(segment.id)
redis_client.delete(indexing_cache_key)

View File

@@ -0,0 +1,91 @@
import json
import logging
import time
import click
from celery import shared_task # type: ignore
from core.indexing_runner import DocumentIsPausedError
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, ExternalKnowledgeApis
from models.model import UploadFile
from services.external_knowledge_service import ExternalDatasetService
@shared_task(queue="dataset")
def external_document_indexing_task(
dataset_id: str, external_knowledge_api_id: str, data_source: dict, process_parameter: dict
):
"""
Async process document
:param dataset_id:
:param external_knowledge_api_id:
:param data_source:
:param process_parameter:
Usage: external_document_indexing_task.delay(dataset_id, external_knowledge_api_id, data_source, process_parameter)
"""
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
logging.info(
click.style("Processed external dataset: {} failed, dataset not exit.".format(dataset_id), fg="red")
)
return
# get external api template
external_knowledge_api = (
db.session.query(ExternalKnowledgeApis)
.filter(
ExternalKnowledgeApis.id == external_knowledge_api_id, ExternalKnowledgeApis.tenant_id == dataset.tenant_id
)
.first()
)
if not external_knowledge_api:
logging.info(
click.style(
"Processed external dataset: {} failed, api template: {} not exit.".format(
dataset_id, external_knowledge_api_id
),
fg="red",
)
)
return
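# collect any locally uploaded files so their raw content can be forwarded to the
# external knowledge API along with the processing settings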
files = {}
if data_source["type"] == "upload_file":
upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
for file_id in upload_file_list:
file = (
db.session.query(UploadFile)
.filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
.first()
)
if file:
files[file.id] = (file.name, storage.load_once(file.key), file.mime_type)
try:
settings = ExternalDatasetService.get_external_knowledge_api_settings(
json.loads(external_knowledge_api.settings)
)
# do http request
response = ExternalDatasetService.process_external_api(settings, files)
job_id = response.json().get("job_id")
if job_id:
# save job_id to dataset
dataset.job_id = job_id
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Processed external dataset: {} successful, latency: {}".format(dataset.id, end_at - start_at),
fg="green",
)
)
except DocumentIsPausedError as ex:
logging.info(click.style(str(ex), fg="yellow"))
except Exception:
logging.exception("external document indexing failed")

View File

@@ -0,0 +1,70 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from flask import render_template
from extensions.ext_mail import mail
@shared_task(queue="mail")
def send_deletion_success_task(to):
"""Send email to user regarding account deletion.
Args:
to (str): Recipient email address
"""
if not mail.is_inited():
return
logging.info(click.style(f"Start send account deletion success email to {to}", fg="green"))
start_at = time.perf_counter()
try:
html_content = render_template(
"delete_account_success_template_en-US.html",
to=to,
email=to,
)
mail.send(to=to, subject="Your Dify.AI Account Has Been Successfully Deleted", html=html_content)
end_at = time.perf_counter()
logging.info(
click.style(
"Send account deletion success email to {}: latency: {}".format(to, end_at - start_at), fg="green"
)
)
except Exception:
logging.exception("Send account deletion success email to {} failed".format(to))
@shared_task(queue="mail")
def send_account_deletion_verification_code(to, code):
"""Send email to user regarding account deletion verification code.
Args:
to (str): Recipient email address
code (str): Verification code
"""
if not mail.is_inited():
return
logging.info(click.style(f"Start send account deletion verification code email to {to}", fg="green"))
start_at = time.perf_counter()
try:
html_content = render_template("delete_account_code_email_template_en-US.html", to=to, code=code)
mail.send(to=to, subject="Dify.AI Account Deletion and Verification", html=html_content)
end_at = time.perf_counter()
logging.info(
click.style(
"Send account deletion verification code email to {} succeeded: latency: {}".format(
to, end_at - start_at
),
fg="green",
)
)
except Exception:
logging.exception("Send account deletion verification code email to {} failed".format(to))

View File

@@ -0,0 +1,41 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from flask import render_template
from extensions.ext_mail import mail
@shared_task(queue="mail")
def send_email_code_login_mail_task(language: str, to: str, code: str):
"""
Async Send email code login mail
:param language: Language in which the email should be sent (e.g., 'en', 'zh')
:param to: Recipient email address
:param code: Email code to be included in the email
"""
if not mail.is_inited():
return
logging.info(click.style("Start email code login mail to {}".format(to), fg="green"))
start_at = time.perf_counter()
# send email code login mail using different languages
try:
if language == "zh-Hans":
html_content = render_template("email_code_login_mail_template_zh-CN.html", to=to, code=code)
mail.send(to=to, subject="邮箱验证码", html=html_content)
else:
html_content = render_template("email_code_login_mail_template_en-US.html", to=to, code=code)
mail.send(to=to, subject="Email Code", html=html_content)
end_at = time.perf_counter()
logging.info(
click.style(
"Send email code login mail to {} succeeded: latency: {}".format(to, end_at - start_at), fg="green"
)
)
except Exception:
logging.exception("Send email code login mail to {} failed".format(to))

View File

@@ -0,0 +1,61 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from flask import render_template
from configs import dify_config
from extensions.ext_mail import mail
@shared_task(queue="mail")
def send_invite_member_mail_task(language: str, to: str, token: str, inviter_name: str, workspace_name: str):
"""
Async Send invite member mail
:param language: Language in which the email should be sent (e.g., 'en', 'zh')
:param to: Recipient email address
:param token: Invitation token used to build the activation link
:param inviter_name: Name of the member who sent the invitation
:param workspace_name: Name of the workspace the recipient is invited to
Usage: send_invite_member_mail_task.delay(language, to, token, inviter_name, workspace_name)
"""
if not mail.is_inited():
return
logging.info(
click.style("Start send invite member mail to {} in workspace {}".format(to, workspace_name), fg="green")
)
start_at = time.perf_counter()
# send invite member mail using different languages
try:
url = f"{dify_config.CONSOLE_WEB_URL}/activate?token={token}"
if language == "zh-Hans":
html_content = render_template(
"invite_member_mail_template_zh-CN.html",
to=to,
inviter_name=inviter_name,
workspace_name=workspace_name,
url=url,
)
mail.send(to=to, subject="立即加入 Dify 工作空间", html=html_content)
else:
html_content = render_template(
"invite_member_mail_template_en-US.html",
to=to,
inviter_name=inviter_name,
workspace_name=workspace_name,
url=url,
)
mail.send(to=to, subject="Join Dify Workspace Now", html=html_content)
end_at = time.perf_counter()
logging.info(
click.style(
"Send invite member mail to {} succeeded: latency: {}".format(to, end_at - start_at), fg="green"
)
)
except Exception:
logging.exception("Send invite member mail to {} failed".format(to))

View File

@@ -0,0 +1,41 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from flask import render_template
from extensions.ext_mail import mail
@shared_task(queue="mail")
def send_reset_password_mail_task(language: str, to: str, code: str):
"""
Async Send reset password mail
:param language: Language in which the email should be sent (e.g., 'en', 'zh')
:param to: Recipient email address
:param code: Reset password code
"""
if not mail.is_inited():
return
logging.info(click.style("Start password reset mail to {}".format(to), fg="green"))
start_at = time.perf_counter()
# send reset password mail using different languages
try:
if language == "zh-Hans":
html_content = render_template("reset_password_mail_template_zh-CN.html", to=to, code=code)
mail.send(to=to, subject="设置您的 Dify 密码", html=html_content)
else:
html_content = render_template("reset_password_mail_template_en-US.html", to=to, code=code)
mail.send(to=to, subject="Set Your Dify Password", html=html_content)
end_at = time.perf_counter()
logging.info(
click.style(
"Send password reset mail to {} succeeded: latency: {}".format(to, end_at - start_at), fg="green"
)
)
except Exception:
logging.exception("Send password reset mail to {} failed".format(to))

View File

@@ -0,0 +1,54 @@
import json
import logging
from celery import shared_task # type: ignore
from flask import current_app
from core.ops.entities.config_entity import OPS_FILE_PATH, OPS_TRACE_FAILED_KEY
from core.ops.entities.trace_entity import trace_info_info_map
from core.rag.models.document import Document
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from models.model import Message
from models.workflow import WorkflowRun
@shared_task(queue="ops_trace")
def process_trace_tasks(file_info):
"""
Async process trace tasks
:param file_info: Dictionary with the app_id and file_id of the stored trace payload
Usage: process_trace_tasks.delay(file_info)
"""
from core.ops.ops_trace_manager import OpsTraceManager
app_id = file_info.get("app_id")
file_id = file_info.get("file_id")
file_path = f"{OPS_FILE_PATH}{app_id}/{file_id}.json"
file_data = json.loads(storage.load(file_path))
trace_info = file_data.get("trace_info")
trace_info_type = file_data.get("trace_info_type")
trace_instance = OpsTraceManager.get_ops_trace_instance(app_id)
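# rebuild the ORM / document objects that were serialized to plain dicts before the
# trace payload was written to storage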
if trace_info.get("message_data"):
trace_info["message_data"] = Message.from_dict(data=trace_info["message_data"])
if trace_info.get("workflow_data"):
trace_info["workflow_data"] = WorkflowRun.from_dict(data=trace_info["workflow_data"])
if trace_info.get("documents"):
trace_info["documents"] = [Document(**doc) for doc in trace_info["documents"]]
try:
if trace_instance:
with current_app.app_context():
trace_type = trace_info_info_map.get(trace_info_type)
if trace_type:
trace_info = trace_type(**trace_info)
trace_instance.trace(trace_info)
logging.info(f"Processing trace tasks success, app_id: {app_id}")
except Exception:
failed_key = f"{OPS_TRACE_FAILED_KEY}_{app_id}"
redis_client.incr(failed_key)
logging.info(f"Processing trace tasks failed, app_id: {app_id}")
finally:
storage.delete(file_path)

View File

@@ -0,0 +1,45 @@
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.indexing_runner import DocumentIsPausedError, IndexingRunner
from extensions.ext_database import db
from models.dataset import Document
@shared_task(queue="dataset")
def recover_document_indexing_task(dataset_id: str, document_id: str):
"""
Async recover document
:param dataset_id:
:param document_id:
Usage: recover_document_indexing_task.delay(dataset_id, document_id)
"""
logging.info(click.style("Recover document: {}".format(document_id), fg="green"))
start_at = time.perf_counter()
document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
if not document:
raise NotFound("Document not found")
try:
indexing_runner = IndexingRunner()
if document.indexing_status in {"waiting", "parsing", "cleaning"}:
indexing_runner.run([document])
elif document.indexing_status == "splitting":
indexing_runner.run_in_splitting_status(document)
elif document.indexing_status == "indexing":
indexing_runner.run_in_indexing_status(document)
end_at = time.perf_counter()
logging.info(
click.style("Processed document: {} latency: {}".format(document.id, end_at - start_at), fg="green")
)
except DocumentIsPausedError as ex:
logging.info(click.style(str(ex), fg="yellow"))
except Exception:
logging.exception("recover document indexing failed")

View File

@@ -0,0 +1,329 @@
import logging
import time
from collections.abc import Callable
import click
from celery import shared_task # type: ignore
from sqlalchemy import delete
from sqlalchemy.exc import SQLAlchemyError
from extensions.ext_database import db
from models.dataset import AppDatasetJoin
from models.model import (
ApiToken,
AppAnnotationHitHistory,
AppAnnotationSetting,
AppModelConfig,
Conversation,
EndUser,
InstalledApp,
Message,
MessageAgentThought,
MessageAnnotation,
MessageChain,
MessageFeedback,
MessageFile,
RecommendedApp,
Site,
TagBinding,
TraceAppConfig,
)
from models.tools import WorkflowToolProvider
from models.web import PinnedConversation, SavedMessage
from models.workflow import ConversationVariable, Workflow, WorkflowAppLog, WorkflowNodeExecution, WorkflowRun
@shared_task(queue="app_deletion", bind=True, max_retries=3)
def remove_app_and_related_data_task(self, tenant_id: str, app_id: str):
logging.info(click.style(f"Start deleting app and related data: {tenant_id}:{app_id}", fg="green"))
start_at = time.perf_counter()
try:
# Delete related data
_delete_app_model_configs(tenant_id, app_id)
_delete_app_site(tenant_id, app_id)
_delete_app_api_tokens(tenant_id, app_id)
_delete_installed_apps(tenant_id, app_id)
_delete_recommended_apps(tenant_id, app_id)
_delete_app_annotation_data(tenant_id, app_id)
_delete_app_dataset_joins(tenant_id, app_id)
_delete_app_workflows(tenant_id, app_id)
_delete_app_workflow_runs(tenant_id, app_id)
_delete_app_workflow_node_executions(tenant_id, app_id)
_delete_app_workflow_app_logs(tenant_id, app_id)
_delete_app_conversations(tenant_id, app_id)
_delete_app_messages(tenant_id, app_id)
_delete_workflow_tool_providers(tenant_id, app_id)
_delete_app_tag_bindings(tenant_id, app_id)
_delete_end_users(tenant_id, app_id)
_delete_trace_app_configs(tenant_id, app_id)
_delete_conversation_variables(app_id=app_id)
end_at = time.perf_counter()
logging.info(click.style(f"App and related data deleted: {app_id} latency: {end_at - start_at}", fg="green"))
except SQLAlchemyError as e:
logging.exception(
click.style(f"Database error occurred while deleting app {app_id} and related data", fg="red")
)
raise self.retry(exc=e, countdown=60) # Retry after 60 seconds
except Exception as e:
logging.exception(click.style(f"Error occurred while deleting app {app_id} and related data", fg="red"))
raise self.retry(exc=e, countdown=60) # Retry after 60 seconds
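# each helper below deletes its records through _delete_records (defined at the bottom
# of this module), which streams ids in batches of up to 1000 so large apps can be
# cleaned up without loading every row at once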
def _delete_app_model_configs(tenant_id: str, app_id: str):
def del_model_config(model_config_id: str):
db.session.query(AppModelConfig).filter(AppModelConfig.id == model_config_id).delete(synchronize_session=False)
_delete_records(
"""select id from app_model_configs where app_id=:app_id limit 1000""",
{"app_id": app_id},
del_model_config,
"app model config",
)
def _delete_app_site(tenant_id: str, app_id: str):
def del_site(site_id: str):
db.session.query(Site).filter(Site.id == site_id).delete(synchronize_session=False)
_delete_records("""select id from sites where app_id=:app_id limit 1000""", {"app_id": app_id}, del_site, "site")
def _delete_app_api_tokens(tenant_id: str, app_id: str):
def del_api_token(api_token_id: str):
db.session.query(ApiToken).filter(ApiToken.id == api_token_id).delete(synchronize_session=False)
_delete_records(
"""select id from api_tokens where app_id=:app_id limit 1000""", {"app_id": app_id}, del_api_token, "api token"
)
def _delete_installed_apps(tenant_id: str, app_id: str):
def del_installed_app(installed_app_id: str):
db.session.query(InstalledApp).filter(InstalledApp.id == installed_app_id).delete(synchronize_session=False)
_delete_records(
"""select id from installed_apps where tenant_id=:tenant_id and app_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_installed_app,
"installed app",
)
def _delete_recommended_apps(tenant_id: str, app_id: str):
def del_recommended_app(recommended_app_id: str):
db.session.query(RecommendedApp).filter(RecommendedApp.id == recommended_app_id).delete(
synchronize_session=False
)
_delete_records(
"""select id from recommended_apps where app_id=:app_id limit 1000""",
{"app_id": app_id},
del_recommended_app,
"recommended app",
)
def _delete_app_annotation_data(tenant_id: str, app_id: str):
def del_annotation_hit_history(annotation_hit_history_id: str):
db.session.query(AppAnnotationHitHistory).filter(
AppAnnotationHitHistory.id == annotation_hit_history_id
).delete(synchronize_session=False)
_delete_records(
"""select id from app_annotation_hit_histories where app_id=:app_id limit 1000""",
{"app_id": app_id},
del_annotation_hit_history,
"annotation hit history",
)
def del_annotation_setting(annotation_setting_id: str):
db.session.query(AppAnnotationSetting).filter(AppAnnotationSetting.id == annotation_setting_id).delete(
synchronize_session=False
)
_delete_records(
"""select id from app_annotation_settings where app_id=:app_id limit 1000""",
{"app_id": app_id},
del_annotation_setting,
"annotation setting",
)
def _delete_app_dataset_joins(tenant_id: str, app_id: str):
def del_dataset_join(dataset_join_id: str):
db.session.query(AppDatasetJoin).filter(AppDatasetJoin.id == dataset_join_id).delete(synchronize_session=False)
_delete_records(
"""select id from app_dataset_joins where app_id=:app_id limit 1000""",
{"app_id": app_id},
del_dataset_join,
"dataset join",
)
def _delete_app_workflows(tenant_id: str, app_id: str):
def del_workflow(workflow_id: str):
db.session.query(Workflow).filter(Workflow.id == workflow_id).delete(synchronize_session=False)
_delete_records(
"""select id from workflows where tenant_id=:tenant_id and app_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_workflow,
"workflow",
)
def _delete_app_workflow_runs(tenant_id: str, app_id: str):
def del_workflow_run(workflow_run_id: str):
db.session.query(WorkflowRun).filter(WorkflowRun.id == workflow_run_id).delete(synchronize_session=False)
_delete_records(
"""select id from workflow_runs where tenant_id=:tenant_id and app_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_workflow_run,
"workflow run",
)
def _delete_app_workflow_node_executions(tenant_id: str, app_id: str):
def del_workflow_node_execution(workflow_node_execution_id: str):
db.session.query(WorkflowNodeExecution).filter(WorkflowNodeExecution.id == workflow_node_execution_id).delete(
synchronize_session=False
)
_delete_records(
"""select id from workflow_node_executions where tenant_id=:tenant_id and app_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_workflow_node_execution,
"workflow node execution",
)
def _delete_app_workflow_app_logs(tenant_id: str, app_id: str):
def del_workflow_app_log(workflow_app_log_id: str):
db.session.query(WorkflowAppLog).filter(WorkflowAppLog.id == workflow_app_log_id).delete(
synchronize_session=False
)
_delete_records(
"""select id from workflow_app_logs where tenant_id=:tenant_id and app_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_workflow_app_log,
"workflow app log",
)
def _delete_app_conversations(tenant_id: str, app_id: str):
def del_conversation(conversation_id: str):
db.session.query(PinnedConversation).filter(PinnedConversation.conversation_id == conversation_id).delete(
synchronize_session=False
)
db.session.query(Conversation).filter(Conversation.id == conversation_id).delete(synchronize_session=False)
_delete_records(
"""select id from conversations where app_id=:app_id limit 1000""",
{"app_id": app_id},
del_conversation,
"conversation",
)
def _delete_conversation_variables(*, app_id: str):
stmt = delete(ConversationVariable).where(ConversationVariable.app_id == app_id)
with db.engine.connect() as conn:
conn.execute(stmt)
conn.commit()
logging.info(click.style(f"Deleted conversation variables for app {app_id}", fg="green"))
def _delete_app_messages(tenant_id: str, app_id: str):
def del_message(message_id: str):
db.session.query(MessageFeedback).filter(MessageFeedback.message_id == message_id).delete(
synchronize_session=False
)
db.session.query(MessageAnnotation).filter(MessageAnnotation.message_id == message_id).delete(
synchronize_session=False
)
db.session.query(MessageChain).filter(MessageChain.message_id == message_id).delete(synchronize_session=False)
db.session.query(MessageAgentThought).filter(MessageAgentThought.message_id == message_id).delete(
synchronize_session=False
)
db.session.query(MessageFile).filter(MessageFile.message_id == message_id).delete(synchronize_session=False)
db.session.query(SavedMessage).filter(SavedMessage.message_id == message_id).delete(synchronize_session=False)
db.session.query(Message).filter(Message.id == message_id).delete()
_delete_records(
"""select id from messages where app_id=:app_id limit 1000""", {"app_id": app_id}, del_message, "message"
)
def _delete_workflow_tool_providers(tenant_id: str, app_id: str):
def del_tool_provider(tool_provider_id: str):
db.session.query(WorkflowToolProvider).filter(WorkflowToolProvider.id == tool_provider_id).delete(
synchronize_session=False
)
_delete_records(
"""select id from tool_workflow_providers where tenant_id=:tenant_id and app_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_tool_provider,
"tool workflow provider",
)
def _delete_app_tag_bindings(tenant_id: str, app_id: str):
def del_tag_binding(tag_binding_id: str):
db.session.query(TagBinding).filter(TagBinding.id == tag_binding_id).delete(synchronize_session=False)
_delete_records(
"""select id from tag_bindings where tenant_id=:tenant_id and target_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_tag_binding,
"tag binding",
)
def _delete_end_users(tenant_id: str, app_id: str):
def del_end_user(end_user_id: str):
db.session.query(EndUser).filter(EndUser.id == end_user_id).delete(synchronize_session=False)
_delete_records(
"""select id from end_users where tenant_id=:tenant_id and app_id=:app_id limit 1000""",
{"tenant_id": tenant_id, "app_id": app_id},
del_end_user,
"end user",
)
def _delete_trace_app_configs(tenant_id: str, app_id: str):
def del_trace_app_config(trace_app_config_id: str):
db.session.query(TraceAppConfig).filter(TraceAppConfig.id == trace_app_config_id).delete(
synchronize_session=False
)
_delete_records(
"""select id from trace_app_config where app_id=:app_id limit 1000""",
{"app_id": app_id},
del_trace_app_config,
"trace app config",
)
def _delete_records(query_sql: str, params: dict, delete_func: Callable, name: str) -> None:
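# fetch matching ids in batches of up to 1000 and delete them one at a time, committing
# per record so a single failing row does not roll back the whole batch; stop once the
# query returns no rows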
while True:
with db.engine.begin() as conn:
rs = conn.execute(db.text(query_sql), params)
if rs.rowcount == 0:
break
for i in rs:
record_id = str(i.id)
try:
delete_func(record_id)
db.session.commit()
logging.info(click.style(f"Deleted {name} {record_id}", fg="green"))
except Exception:
logging.exception(f"Error occurred while deleting {name} {record_id}")
continue
rs.close()

View File

@@ -0,0 +1,73 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from werkzeug.exceptions import NotFound
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Document, DocumentSegment
@shared_task(queue="dataset")
def remove_document_from_index_task(document_id: str):
"""
Async Remove document from index
:param document_id: document id
Usage: remove_document_from_index_task.delay(document_id)
"""
logging.info(click.style("Start remove document segments from index: {}".format(document_id), fg="green"))
start_at = time.perf_counter()
document = db.session.query(Document).filter(Document.id == document_id).first()
if not document:
raise NotFound("Document not found")
if document.indexing_status != "completed":
return
indexing_cache_key = "document_{}_indexing".format(document.id)
try:
dataset = document.dataset
if not dataset:
raise Exception("Document has no dataset")
index_processor = IndexProcessorFactory(document.doc_form).init_index_processor()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document.id).all()
index_node_ids = [segment.index_node_id for segment in segments]
if index_node_ids:
try:
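# remove the segment vectors / keywords from the index but leave the child chunks in
# place (delete_child_chunks=False): this task only disables the document, so it can
# be re-enabled later without re-chunking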
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=False)
except Exception:
logging.exception(f"clean dataset {dataset.id} from index failed")
# update segment to disable
db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document.id).update(
{
DocumentSegment.enabled: False,
DocumentSegment.disabled_at: datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
DocumentSegment.disabled_by: document.disabled_by,
DocumentSegment.updated_at: datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
}
)
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Document removed from index: {} latency: {}".format(document.id, end_at - start_at), fg="green"
)
)
except Exception:
logging.exception("remove document from index failed")
if not document.archived:
document.enabled = True
db.session.commit()
finally:
redis_client.delete(indexing_cache_key)

View File

@@ -0,0 +1,96 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from core.indexing_runner import IndexingRunner
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, Document, DocumentSegment
from services.feature_service import FeatureService
@shared_task(queue="dataset")
def retry_document_indexing_task(dataset_id: str, document_ids: list[str]):
"""
Async process document
:param dataset_id:
:param document_ids:
Usage: retry_document_indexing_task.delay(dataset_id, document_ids)
"""
documents: list[Document] = []
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
raise ValueError("Dataset not found")
for document_id in document_ids:
retry_indexing_cache_key = "document_{}_is_retried".format(document_id)
# check document limit
features = FeatureService.get_features(dataset.tenant_id)
try:
if features.billing.enabled:
vector_space = features.vector_space
if 0 < vector_space.limit <= vector_space.size:
raise ValueError(
"Your total number of documents plus the number of uploads have over the limit of "
"your subscription."
)
except Exception as e:
document = (
db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
)
if document:
document.indexing_status = "error"
document.error = str(e)
document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
redis_client.delete(retry_indexing_cache_key)
return
logging.info(click.style("Start retry document: {}".format(document_id), fg="green"))
document = (
db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
)
if not document:
logging.info(click.style("Document not found: {}".format(document_id), fg="yellow"))
return
try:
# clean old data
index_processor = IndexProcessorFactory(document.doc_form).init_index_processor()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
indexing_runner = IndexingRunner()
indexing_runner.run([document])
redis_client.delete(retry_indexing_cache_key)
except Exception as ex:
document.indexing_status = "error"
document.error = str(ex)
document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
logging.info(click.style(str(ex), fg="yellow"))
redis_client.delete(retry_indexing_cache_key)
end_at = time.perf_counter()
logging.info(click.style("Retry dataset: {} latency: {}".format(dataset_id, end_at - start_at), fg="green"))

View File

@@ -0,0 +1,92 @@
import datetime
import logging
import time
import click
from celery import shared_task # type: ignore
from core.indexing_runner import IndexingRunner
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, Document, DocumentSegment
from services.feature_service import FeatureService
@shared_task(queue="dataset")
def sync_website_document_indexing_task(dataset_id: str, document_id: str):
"""
Async process document
:param dataset_id:
:param document_id:
Usage: sync_website_document_indexing_task.delay(dataset_id, document_id)
"""
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if dataset is None:
raise ValueError("Dataset not found")
sync_indexing_cache_key = "document_{}_is_sync".format(document_id)
# check document limit
features = FeatureService.get_features(dataset.tenant_id)
try:
if features.billing.enabled:
vector_space = features.vector_space
if 0 < vector_space.limit <= vector_space.size:
raise ValueError(
"Your total number of documents plus the number of uploads have over the limit of "
"your subscription."
)
except Exception as e:
document = (
db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
)
if document:
document.indexing_status = "error"
document.error = str(e)
document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
redis_client.delete(sync_indexing_cache_key)
return
logging.info(click.style("Start sync website document: {}".format(document_id), fg="green"))
document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
if not document:
logging.info(click.style("Document not found: {}".format(document_id), fg="yellow"))
return
try:
# clean old data
index_processor = IndexProcessorFactory(document.doc_form).init_index_processor()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
indexing_runner = IndexingRunner()
indexing_runner.run([document])
redis_client.delete(sync_indexing_cache_key)
except Exception as ex:
document.indexing_status = "error"
document.error = str(ex)
document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
logging.info(click.style(str(ex), fg="yellow"))
redis_client.delete(sync_indexing_cache_key)
end_at = time.perf_counter()
logging.info(click.style("Sync document: {} latency: {}".format(document_id, end_at - start_at), fg="green"))