Spaces:
Running
Running
Commit
·
38198b1
1
Parent(s):
f957de1
ZeroGPU Spaces duration can now be defined from an environment variable
Browse files
- funcs/embeddings.py +6 -2
- funcs/helper_functions.py +3 -0
- funcs/topic_core_funcs.py +3 -2
funcs/embeddings.py
CHANGED
|
@@ -1,19 +1,23 @@
|
|
|
|
|
| 1 |
import time
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
-
|
| 5 |
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from sklearn.pipeline import make_pipeline
|
| 8 |
from sklearn.decomposition import TruncatedSVD
|
| 9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
# If you want to disable cuda for testing purposes
|
| 13 |
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
|
| 14 |
|
| 15 |
|
| 16 |
-
@spaces.GPU(duration=
|
| 17 |
def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1", random_seed:int=42) -> np.ndarray:
|
| 18 |
"""
|
| 19 |
Create or load embeddings for the given documents.
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
import time
|
| 3 |
import numpy as np
|
| 4 |
import os
|
| 5 |
+
|
| 6 |
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from sklearn.pipeline import make_pipeline
|
| 9 |
from sklearn.decomposition import TruncatedSVD
|
| 10 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
+
from funcs.helper_functions import GPU_SPACE_DURATION
|
| 12 |
+
|
| 13 |
+
|
| 14 |
|
| 15 |
|
| 16 |
# If you want to disable cuda for testing purposes
|
| 17 |
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
|
| 18 |
|
| 19 |
|
| 20 |
+
@spaces.GPU(duration=GPU_SPACE_DURATION)
|
| 21 |
def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1", random_seed:int=42) -> np.ndarray:
|
| 22 |
"""
|
| 23 |
Create or load embeddings for the given documents.
|
funcs/helper_functions.py
CHANGED
|
@@ -33,6 +33,9 @@ default_value = 'output/'
|
|
| 33 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
| 34 |
print(f'The value of {env_var_name} is {output_folder}')
|
| 35 |
|
|
|
|
|
|
|
|
|
|
| 36 |
def ensure_output_folder_exists():
|
| 37 |
"""Checks if the 'output/' folder exists, creates it if not."""
|
| 38 |
|
|
|
|
| 33 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
| 34 |
print(f'The value of {env_var_name} is {output_folder}')
|
| 35 |
|
| 36 |
+
GPU_SPACE_DURATION = int(get_or_create_env_var('GPU_SPACE_DURATION', '60'))
|
| 37 |
+
print(f'The value of GPU_SPACE_DURATION is {GPU_SPACE_DURATION}')
|
| 38 |
+
|
| 39 |
def ensure_output_folder_exists():
|
| 40 |
"""Checks if the 'output/' folder exists, creates it if not."""
|
| 41 |
|
funcs/topic_core_funcs.py
CHANGED
|
@@ -13,7 +13,7 @@ PandasDataFrame = Type[pd.DataFrame]
|
|
| 13 |
|
| 14 |
from funcs.clean_funcs import initial_clean, regex_clean
|
| 15 |
from funcs.anonymiser import expand_sentences_spacy
|
| 16 |
-
from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var, custom_regex_load
|
| 17 |
from funcs.embeddings import make_or_load_embeddings
|
| 18 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
| 19 |
|
|
@@ -28,6 +28,7 @@ umap_min_dist = 0.0
|
|
| 28 |
umap_metric = 'cosine'
|
| 29 |
random_seed = 42
|
| 30 |
|
|
|
|
| 31 |
today = datetime.now().strftime("%d%m%Y")
|
| 32 |
today_rev = datetime.now().strftime("%Y%m%d")
|
| 33 |
|
|
@@ -546,7 +547,7 @@ def reduce_outliers(topic_model: BERTopic, docs: List[str], embeddings_out: np.n
|
|
| 546 |
|
| 547 |
return output_text, output_list, topic_model
|
| 548 |
|
| 549 |
-
@spaces.GPU(duration=
|
| 550 |
def represent_topics(topic_model: BERTopic, docs: List[str], data_file_name_no_ext: str, high_quality_mode: str, save_topic_model: str, representation_type: str, vectoriser_model: CountVectorizer, split_sentence_drop: str, data: PandasDataFrame, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
|
| 551 |
"""
|
| 552 |
Represents topics using the specified representation model and updates the topic labels accordingly.
|
|
|
|
| 13 |
|
| 14 |
from funcs.clean_funcs import initial_clean, regex_clean
|
| 15 |
from funcs.anonymiser import expand_sentences_spacy
|
| 16 |
+
from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var, custom_regex_load, GPU_SPACE_DURATION
|
| 17 |
from funcs.embeddings import make_or_load_embeddings
|
| 18 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
| 19 |
|
|
|
|
| 28 |
umap_metric = 'cosine'
|
| 29 |
random_seed = 42
|
| 30 |
|
| 31 |
+
|
| 32 |
today = datetime.now().strftime("%d%m%Y")
|
| 33 |
today_rev = datetime.now().strftime("%Y%m%d")
|
| 34 |
|
|
|
|
| 547 |
|
| 548 |
return output_text, output_list, topic_model
|
| 549 |
|
| 550 |
+
@spaces.GPU(duration=GPU_SPACE_DURATION)
|
| 551 |
def represent_topics(topic_model: BERTopic, docs: List[str], data_file_name_no_ext: str, high_quality_mode: str, save_topic_model: str, representation_type: str, vectoriser_model: CountVectorizer, split_sentence_drop: str, data: PandasDataFrame, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
|
| 552 |
"""
|
| 553 |
Represents topics using the specified representation model and updates the topic labels accordingly.
|