Kevin Hu committed on
Commit · 0f25ebd
1 Parent(s): 916b3cc
make language judgement robuster (#3287)
### What problem does this PR solve?

Make the language judgement in `EsQueryer` more robust: `rmWWW` (removal of English question words from the query) now runs after the Chinese/English check instead of before it, so the judgement is made on the original query text.
### Type of change
- [x] Performance Improvement
Files changed: rag/nlp/query.py (+2, -1)

```diff
@@ -63,9 +63,9 @@ class EsQueryer:
             rag_tokenizer.tradi2simp(
                 rag_tokenizer.strQ2B(
                     txt.lower()))).strip()
-        txt = EsQueryer.rmWWW(txt)
 
         if not self.isChinese(txt):
+            txt = EsQueryer.rmWWW(txt)
             tks = rag_tokenizer.tokenize(txt).split(" ")
             tks_w = self.tw.weights(tks)
             tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
@@ -89,6 +89,7 @@ class EsQueryer:
                 return False
             return True
 
+        txt = EsQueryer.rmWWW(txt)
         qs, keywords = [], []
         for tt in self.tw.split(txt)[:256]:  # .split(" "):
             if not tt:
```
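To make the effect of the reordering concrete, here is a minimal, self-contained sketch. It uses simplified stand-ins: `is_chinese`, `rm_www`, and `preprocess` below are hypothetical and do not reproduce the actual `EsQueryer.isChinese` / `EsQueryer.rmWWW` implementations; they only mirror the post-patch ordering, in which the Chinese/English judgement is made on the raw query and question-word removal happens afterwards in each branch.

```python
import re


def is_chinese(text: str) -> bool:
    """Hypothetical stand-in for EsQueryer.isChinese: treat the query as
    Chinese when CJK characters make up more than half of its non-space
    characters."""
    chars = [c for c in text if not c.isspace()]
    han = sum(1 for c in chars if "\u4e00" <= c <= "\u9fff")
    return bool(chars) and han / len(chars) > 0.5


def rm_www(text: str) -> str:
    """Hypothetical stand-in for EsQueryer.rmWWW: drop common English
    question/stop words that add little retrieval signal."""
    pattern = r"\b(what|who|whom|where|which|when|why|how|is|are|the|a|an)\b"
    return re.sub(pattern, " ", text, flags=re.IGNORECASE).strip()


def preprocess(query: str) -> str:
    """Mirrors the post-patch ordering: judge the language on the raw
    query first, strip question words afterwards in each branch."""
    if not is_chinese(query):            # judgement sees the raw query
        query = rm_www(query)            # English branch: strip question words now
        return " ".join(query.split())
    query = rm_www(query)                # Chinese branch: strip them before keyword handling
    return query


# "who is 马斯克": judged on the raw text, the query counts as English
# (3 of 8 non-space characters are CJK).  Stripping "who is" *before*
# the check, as the old ordering did, would have left only CJK
# characters and sent the query down the Chinese branch instead.
print(preprocess("who is 马斯克"))   # -> "马斯克"
```

Under the old ordering, stripping the English question words first could leave a short mixed query dominated by CJK characters and tip the judgement toward the Chinese branch; judging first keeps the branch decision independent of how many such words the query happens to contain.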