Spaces:

seanpedrickcase
/

topic_modelling

Running

seanpedrickcase committed on Oct 27

Commit

38198b1

1 Parent(s): f957de1

ZeroGPU Spaces duration can now be defined from an environment variable

Files changed (3) hide show

funcs/embeddings.py CHANGED Viewed

@@ -1,19 +1,23 @@
 import time
 import numpy as np
 import os
-import spaces
 from sentence_transformers import SentenceTransformer
 from sklearn.pipeline import make_pipeline
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 # If you want to disable cuda for testing purposes
 #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-@spaces.GPU(duration=120)
 def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1", random_seed:int=42) -> np.ndarray:
     """
     Create or load embeddings for the given documents.

+import spaces
 import time
 import numpy as np
 import os
 from sentence_transformers import SentenceTransformer
 from sklearn.pipeline import make_pipeline
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
+from funcs.helper_functions import GPU_SPACE_DURATION
 # If you want to disable cuda for testing purposes
 #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+@spaces.GPU(duration=GPU_SPACE_DURATION)
 def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1", random_seed:int=42) -> np.ndarray:
     """
     Create or load embeddings for the given documents.

funcs/helper_functions.py CHANGED Viewed

@@ -33,6 +33,9 @@ default_value = 'output/'
 output_folder = get_or_create_env_var(env_var_name, default_value)
 print(f'The value of {env_var_name} is {output_folder}')
 def ensure_output_folder_exists():
     """Checks if the 'output/' folder exists, creates it if not."""

 output_folder = get_or_create_env_var(env_var_name, default_value)
 print(f'The value of {env_var_name} is {output_folder}')
+GPU_SPACE_DURATION = int(get_or_create_env_var('GPU_SPACE_DURATION', '60'))
+print(f'The value of GPU_SPACE_DURATION is {GPU_SPACE_DURATION}')
 def ensure_output_folder_exists():
     """Checks if the 'output/' folder exists, creates it if not."""

funcs/topic_core_funcs.py CHANGED Viewed

@@ -13,7 +13,7 @@ PandasDataFrame = Type[pd.DataFrame]
 from funcs.clean_funcs import initial_clean, regex_clean
 from funcs.anonymiser import expand_sentences_spacy
-from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var, custom_regex_load
 from funcs.embeddings import make_or_load_embeddings
 from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
@@ -28,6 +28,7 @@ umap_min_dist = 0.0
 umap_metric = 'cosine'
 random_seed = 42
 today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
@@ -546,7 +547,7 @@ def reduce_outliers(topic_model: BERTopic, docs: List[str], embeddings_out: np.n
     return output_text, output_list, topic_model
-@spaces.GPU(duration=120)
 def represent_topics(topic_model: BERTopic, docs: List[str], data_file_name_no_ext: str, high_quality_mode: str, save_topic_model: str, representation_type: str, vectoriser_model: CountVectorizer, split_sentence_drop: str, data: PandasDataFrame, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
     """
     Represents topics using the specified representation model and updates the topic labels accordingly.

 from funcs.clean_funcs import initial_clean, regex_clean
 from funcs.anonymiser import expand_sentences_spacy
+from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var, custom_regex_load, GPU_SPACE_DURATION
 from funcs.embeddings import make_or_load_embeddings
 from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
 umap_metric = 'cosine'
 random_seed = 42
 today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
     return output_text, output_list, topic_model
+@spaces.GPU(duration=GPU_SPACE_DURATION)
 def represent_topics(topic_model: BERTopic, docs: List[str], data_file_name_no_ext: str, high_quality_mode: str, save_topic_model: str, representation_type: str, vectoriser_model: CountVectorizer, split_sentence_drop: str, data: PandasDataFrame, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
     """
     Represents topics using the specified representation model and updates the topic labels accordingly.