Sophia Koehler
committed on
Commit
·
a8a9cd5
1
Parent(s):
2fa43bc
fix3
Browse files
app.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
|
| 3 |
-
from dataclasses import dataclass
|
| 4 |
import pickle
|
| 5 |
import os
|
| 6 |
-
from
|
|
|
|
| 7 |
from nlp4web_codebase.ir.data_loaders.dm import Document
|
| 8 |
from collections import Counter
|
| 9 |
import tqdm
|
|
@@ -11,6 +12,10 @@ import re
|
|
| 11 |
import nltk
|
| 12 |
nltk.download("stopwords", quiet=True)
|
| 13 |
from nltk.corpus import stopwords as nltk_stopwords
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
LANGUAGE = "english"
|
| 16 |
word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
|
|
@@ -133,21 +138,8 @@ def run_counting(
|
|
| 133 |
doc_texts=doc_texts,
|
| 134 |
)
|
| 135 |
|
| 136 |
-
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
|
| 137 |
-
sciq = load_sciq()
|
| 138 |
-
counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
|
| 139 |
-
|
| 140 |
"""### BM25 Index"""
|
| 141 |
|
| 142 |
-
from __future__ import annotations
|
| 143 |
-
from dataclasses import asdict, dataclass
|
| 144 |
-
import math
|
| 145 |
-
import os
|
| 146 |
-
from typing import Iterable, List, Optional, Type
|
| 147 |
-
import tqdm
|
| 148 |
-
from nlp4web_codebase.ir.data_loaders.dm import Document
|
| 149 |
-
|
| 150 |
-
|
| 151 |
@dataclass
|
| 152 |
class BM25Index(InvertedIndex):
|
| 153 |
|
|
@@ -237,11 +229,6 @@ class BM25Index(InvertedIndex):
|
|
| 237 |
|
| 238 |
"""### BM25 Retriever"""
|
| 239 |
|
| 240 |
-
from nlp4web_codebase.ir.models import BaseRetriever
|
| 241 |
-
from typing import Type
|
| 242 |
-
from abc import abstractmethod
|
| 243 |
-
|
| 244 |
-
|
| 245 |
class BaseInvertedIndexRetriever(BaseRetriever):
|
| 246 |
|
| 247 |
@property
|
|
@@ -301,9 +288,6 @@ class BM25Retriever(BaseInvertedIndexRetriever):
|
|
| 301 |
return BM25Index
|
| 302 |
|
| 303 |
|
| 304 |
-
import gradio as gr
|
| 305 |
-
from typing import TypedDict
|
| 306 |
-
|
| 307 |
class Hit(TypedDict):
|
| 308 |
cid: str
|
| 309 |
score: float
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
|
| 3 |
+
from dataclasses import asdict, dataclass
|
| 4 |
import pickle
|
| 5 |
import os
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar, TypedDict
|
| 8 |
from nlp4web_codebase.ir.data_loaders.dm import Document
|
| 9 |
from collections import Counter
|
| 10 |
import tqdm
|
|
|
|
| 12 |
import nltk
|
| 13 |
nltk.download("stopwords", quiet=True)
|
| 14 |
from nltk.corpus import stopwords as nltk_stopwords
|
| 15 |
+
import math
|
| 16 |
+
from nlp4web_codebase.ir.models import BaseRetriever
|
| 17 |
+
from abc import abstractmethod
|
| 18 |
+
import gradio as gr
|
| 19 |
|
| 20 |
LANGUAGE = "english"
|
| 21 |
word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
|
|
|
|
| 138 |
doc_texts=doc_texts,
|
| 139 |
)
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
"""### BM25 Index"""
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
@dataclass
|
| 144 |
class BM25Index(InvertedIndex):
|
| 145 |
|
|
|
|
| 229 |
|
| 230 |
"""### BM25 Retriever"""
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
class BaseInvertedIndexRetriever(BaseRetriever):
|
| 233 |
|
| 234 |
@property
|
|
|
|
| 288 |
return BM25Index
|
| 289 |
|
| 290 |
|
|
|
|
|
|
|
|
|
|
| 291 |
class Hit(TypedDict):
|
| 292 |
cid: str
|
| 293 |
score: float
|