Kevin Hu
committed on
Commit
·
a92e785
1
Parent(s):
447446d
refactor auto keywords and auto question (#2990)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Refactoring
- rag/svr/task_executor.py +20 -17
rag/svr/task_executor.py
CHANGED
|
@@ -199,23 +199,6 @@ def build(row):
|
|
| 199 |
d["_id"] = md5.hexdigest()
|
| 200 |
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
| 201 |
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
| 202 |
-
|
| 203 |
-
if row["parser_config"].get("auto_keywords", 0):
|
| 204 |
-
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 205 |
-
d["important_kwd"] = keyword_extraction(chat_mdl, ck["content_with_weight"],
|
| 206 |
-
row["parser_config"]["auto_keywords"]).split(",")
|
| 207 |
-
d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
|
| 208 |
-
|
| 209 |
-
if row["parser_config"].get("auto_questions", 0):
|
| 210 |
-
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 211 |
-
qst = question_proposal(chat_mdl, ck["content_with_weight"], row["parser_config"]["auto_keywords"])
|
| 212 |
-
ck["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + ck["content_with_weight"]
|
| 213 |
-
qst = rag_tokenizer.tokenize(qst)
|
| 214 |
-
if "content_ltks" in ck:
|
| 215 |
-
ck["content_ltks"] += " " + qst
|
| 216 |
-
if "content_sm_ltks" in ck:
|
| 217 |
-
ck["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
|
| 218 |
-
|
| 219 |
if not d.get("image"):
|
| 220 |
docs.append(d)
|
| 221 |
continue
|
|
@@ -239,6 +222,26 @@ def build(row):
|
|
| 239 |
docs.append(d)
|
| 240 |
cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
return docs
|
| 243 |
|
| 244 |
|
|
|
|
| 199 |
d["_id"] = md5.hexdigest()
|
| 200 |
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
| 201 |
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
if not d.get("image"):
|
| 203 |
docs.append(d)
|
| 204 |
continue
|
|
|
|
| 222 |
docs.append(d)
|
| 223 |
cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
|
| 224 |
|
| 225 |
+
if row["parser_config"].get("auto_keywords", 0):
|
| 226 |
+
callback(msg="Start to generate keywords for every chunk ...")
|
| 227 |
+
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 228 |
+
for d in docs:
|
| 229 |
+
d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
|
| 230 |
+
row["parser_config"]["auto_keywords"]).split(",")
|
| 231 |
+
d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
|
| 232 |
+
|
| 233 |
+
if row["parser_config"].get("auto_questions", 0):
|
| 234 |
+
callback(msg="Start to generate questions for every chunk ...")
|
| 235 |
+
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 236 |
+
for d in docs:
|
| 237 |
+
qst = question_proposal(chat_mdl, d["content_with_weight"], row["parser_config"]["auto_questions"])
|
| 238 |
+
d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
|
| 239 |
+
qst = rag_tokenizer.tokenize(qst)
|
| 240 |
+
if "content_ltks" in d:
|
| 241 |
+
d["content_ltks"] += " " + qst
|
| 242 |
+
if "content_sm_ltks" in d:
|
| 243 |
+
d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
|
| 244 |
+
|
| 245 |
return docs
|
| 246 |
|
| 247 |
|