cecilia-uu committed
Commit · 12defec
1 Parent(s): 67bae62

API: completed delete_doc api (#1290)
### What problem does this PR solve?

Adds the functionality of deleting documents.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)

Files changed:

- api/apps/documents_api.py +58 -2
- sdk/python/ragflow/ragflow.py +6 -3
- sdk/python/test/test_document.py +87 -3
api/apps/documents_api.py
CHANGED

@@ -24,6 +24,7 @@ from flask_login import login_required, current_user
 from api.db import FileType, ParserType
 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
+from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.settings import RetCode
@@ -31,6 +32,8 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result
 from api.utils.file_utils import filename_type, thumbnail
 from rag.utils.minio_conn import MINIO
+from api.db.db_models import Task, File
+from api.db import FileType, TaskStatus, ParserType, FileSource
 
 
 MAXIMUM_OF_UPLOADING_FILES = 256
@@ -89,6 +92,7 @@ def upload(dataset_id):
     # grab all the errs
     err = []
     MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+    uploaded_docs_json = []
     for file in file_objs:
         try:
             # TODO: get this value from the database as some tenants have this limit while others don't
@@ -132,6 +136,7 @@ def upload(dataset_id):
             DocumentService.insert(doc)
 
             FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
+            uploaded_docs_json.append(doc)
         except Exception as e:
             err.append(file.filename + ": " + str(e))
 
@@ -139,14 +144,65 @@ def upload(dataset_id):
         # return all the errors
        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
     # success
+    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
+
+# ----------------------------delete a file-----------------------------------------------------
+@manager.route('/<dataset_id>/<document_id>', methods=['DELETE'])
+@login_required
+def delete(document_id, dataset_id):  # string
+    # get the root folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # parent file's id
+    parent_file_id = root_folder["id"]
+    # consider the new user
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # store all the errors that may occur
+    errors = ""
+    try:
+        # whether this document exists
+        exist, doc = DocumentService.get_by_id(document_id)
+        if not exist:
+            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
+        # whether this doc is authorized for this tenant
+        tenant_id = DocumentService.get_tenant_id(document_id)
+        if not tenant_id:
+            return construct_json_result(message=f"You cannot delete this document {document_id} due to the "
+                                                 f"authorization reason!", code=RetCode.AUTHENTICATION_ERROR)
+
+        # get the doc's dataset id and location
+        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
+
+        if real_dataset_id != dataset_id:
+            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
+                                                 f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
+
+        # there is an issue when removing
+        if not DocumentService.remove_document(doc, tenant_id):
+            return construct_json_result(
+                message="There was an error during the document removal process. Please check the status of the "
+                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
+
+        # fetch the File2Document record associated with the provided document ID
+        file_to_doc = File2DocumentService.get_by_document_id(document_id)
+        # delete the associated File record
+        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
+        # delete the File2Document record itself using the document ID; this removes the
+        # association between the document and the file after the File record has been deleted
+        File2DocumentService.delete_by_document_id(document_id)
+
+        # delete it from MinIO
+        MINIO.rm(dataset_id, location)
+    except Exception as e:
+        errors += str(e)
+    if errors:
+        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
+
     return construct_json_result(data=True, code=RetCode.SUCCESS)
 
 # ----------------------------upload online files------------------------------------------------
 
 # ----------------------------download a file-----------------------------------------------------
 
-# ----------------------------delete a file-----------------------------------------------------
-
 # ----------------------------enable rename-----------------------------------------------------
 
 # ----------------------------list files-----------------------------------------------------
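For reference, the new route can also be exercised directly over HTTP. The sketch below is not part of the PR: the base URL under which this blueprint is mounted and the exact authorization header format are assumptions (they are not shown in this diff), so adjust them to your deployment.

```python
import requests

# Hypothetical values -- the documents API prefix and the header format are
# assumptions; only the trailing /<dataset_id>/<document_id> route is from this PR.
BASE_URL = "http://127.0.0.1:9380/api/v1/documents"
API_KEY = "<your-ragflow-api-key>"

dataset_id = "<dataset_id>"
document_id = "<document_id>"

# The route added above: DELETE /<dataset_id>/<document_id>
res = requests.delete(
    f"{BASE_URL}/{dataset_id}/{document_id}",
    headers={"Authorization": f"Bearer {API_KEY}"},
)

# On success the handler returns construct_json_result(data=True, code=RetCode.SUCCESS);
# failures use DATA_ERROR, AUTHENTICATION_ERROR, ARGUMENT_ERROR or OPERATING_ERROR codes.
print(res.json())
```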
sdk/python/ragflow/ragflow.py
CHANGED

@@ -101,10 +101,13 @@ class RAGFlow:
         result_dict = json.loads(res.text)
         return result_dict
 
-    # ----------------------------upload remote files-----------------------------------------------------
-    # ----------------------------download a file-----------------------------------------------------
-
     # ----------------------------delete a file-----------------------------------------------------
+    def delete_files(self, document_id, dataset_id):
+        endpoint = f"{self.document_url}/{dataset_id}/{document_id}"
+        res = requests.delete(endpoint, headers=self.authorization_header)
+        return res.json()
+
+    # ----------------------------download a file-----------------------------------------------------
 
     # ----------------------------enable rename-----------------------------------------------------
 
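A short usage sketch of the new SDK method, modelled on the tests added below; the import path, API key, and host address are placeholders rather than values taken from this diff.

```python
from ragflow.ragflow import RAGFlow  # import path assumed from this repo's sdk/python layout

API_KEY = "<your-ragflow-api-key>"   # placeholder
HOST_ADDRESS = "http://127.0.0.1"    # placeholder

ragflow = RAGFlow(API_KEY, HOST_ADDRESS)

# Create a dataset and upload a document, as the tests do.
created_res = ragflow.create_dataset("demo_dataset")
dataset_id = created_res['data']['dataset_id']
uploaded_res = ragflow.upload_local_file(dataset_id, ["test_data/test.txt"])
doc_id = uploaded_res['data'][0]['id']

# delete_files() issues DELETE {document_url}/{dataset_id}/{document_id}
# and returns the parsed JSON body.
deleted_res = ragflow.delete_files(doc_id, dataset_id)
assert deleted_res['data'] is True
```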
sdk/python/test/test_document.py
CHANGED

@@ -149,11 +149,95 @@ class TestFile(TestSdk):
         res = ragflow.upload_local_file(dataset_id, file_paths)
         assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.'
 
-# ----------------------------
-
-
+    # ----------------------------delete a file-----------------------------------------------------
+    def test_delete_one_file(self):
+        """
+        Test deleting one file with success.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_delete_one_file")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        # get the doc_id
+        data = res['data'][0]
+        doc_id = data['id']
+        # delete the files
+        deleted_res = ragflow.delete_files(doc_id, dataset_id)
+        # assert value
+        assert deleted_res['code'] == RetCode.SUCCESS and deleted_res['data'] is True
 
+    def test_delete_document_with_not_existing_document(self):
+        """
+        Test deleting a document that does not exist, expecting failure.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_delete_document_with_not_existing_document")
+        dataset_id = created_res['data']['dataset_id']
+        res = ragflow.delete_files("111", dataset_id)
+        assert res['code'] == RetCode.DATA_ERROR and res['message'] == 'Document 111 not found!'
 
+    def test_delete_document_with_creating_100_documents_and_deleting_100_documents(self):
+        """
+        Test uploading 100 documents and then deleting all 100 of them.
+        """
+        # upload 100 docs
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_delete_one_file")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"] * 100
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+
+        # get the doc_id
+        data = res['data']
+        for d in data:
+            doc_id = d['id']
+            # delete the files
+            deleted_res = ragflow.delete_files(doc_id, dataset_id)
+            # assert value
+            assert deleted_res['code'] == RetCode.SUCCESS and deleted_res['data'] is True
+
+    def test_delete_document_from_nonexistent_dataset(self):
+        """
+        Test deleting a document from a non-existent dataset.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_delete_one_file")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        # get the doc_id
+        data = res['data'][0]
+        doc_id = data['id']
+        # delete the files
+        deleted_res = ragflow.delete_files(doc_id, "000")
+        # assert value
+        assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
+                f'The document {doc_id} is not in the dataset: 000, but in the dataset: {dataset_id}.')
+
+    def test_delete_document_which_is_located_in_other_dataset(self):
+        """
+        Test deleting a document which is located in another dataset.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload a document
+        created_res = ragflow.create_dataset("test_delete_document_which_is_located_in_other_dataset")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        res = ragflow.upload_local_file(created_res_id, file_paths)
+        # other dataset
+        other_res = ragflow.create_dataset("other_dataset")
+        other_dataset_id = other_res['data']['dataset_id']
+        # get the doc_id
+        data = res['data'][0]
+        doc_id = data['id']
+        # delete the files from the other dataset
+        deleted_res = ragflow.delete_files(doc_id, other_dataset_id)
+        # assert value
+        assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
+                f'The document {doc_id} is not in the dataset: {other_dataset_id}, but in the dataset: {created_res_id}.')
+
+    # ----------------------------download a file-----------------------------------------------------
 
     # ----------------------------enable rename-----------------------------------------------------
 