import wx
import os
import sys
import re
import json
import threading
from pathlib import Path
from rapidfuzz import fuzz, process
from typing import List, Tuple, Optional
import logging

# --- Configure logging ---
# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # Logger passes everything; the handler filters

# Create handler and set level
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)

# Create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# Add handler to logger
logger.addHandler(handler)


def resource_path(relative_path):
    """Get absolute path to resource, works for dev and PyInstaller"""
    try:
        # PyInstaller creates a temp folder and stores path in _MEIPASS
        base_path = sys._MEIPASS
    except AttributeError:
        base_path = os.path.abspath(".")
    return os.path.join(base_path, relative_path)


# --- Engine ---
class TextProcessor:
    def __init__(self, file_path: str):
        self.file_path = Path(file_path)
        self.text_bytes: Optional[bytes] = None  # Explicit type annotation
        self.decoded_text: Optional[str] = None
        self.char_to_byte: Optional[List[int]] = None
        self.load_and_process_file()

    def load_and_process_file(self):
        try:
            logger.debug(f"Loading file: {self.file_path}")
            self.text_bytes = self.file_path.read_bytes()
            self.decoded_text = self.text_bytes.decode("utf-8", errors="surrogateescape")
            self._build_char_to_byte_mapping()
        except Exception as e:
            raise RuntimeError(f"Failed to read file {self.file_path}: {str(e)}")

    def _build_char_to_byte_mapping(self):
        if self.decoded_text is None:
            return  # Safety check
        logger.debug("Building character-to-byte mapping")
        self.char_to_byte = [0]
        for ch in self.decoded_text:
            self.char_to_byte.append(
                self.char_to_byte[-1] + len(ch.encode("utf-8", errors="surrogateescape"))
            )
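
# A worked example of the mapping above (illustration only; not used by the
# app). char_to_byte[i] is the byte offset of character i, so multi-byte
# UTF-8 characters advance the byte index faster than the char index: for
# "aä" the "ä" encodes to 2 bytes, giving char_to_byte == [0, 1, 3], and the
# one-character snippet text[1:2] maps to the byte range 1..3.
def _demo_char_to_byte():  # hypothetical helper, never called by the app
    text = "aä"
    mapping = [0]
    for ch in text:
        mapping.append(mapping[-1] + len(ch.encode("utf-8")))
    assert mapping == [0, 1, 3]
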

class Match:
    """Container for a single pattern match and its character/byte spans."""

    def __init__(self, pattern: str, text: str, start_char: int, end_char: int):
        self.pattern = pattern
        self.text = text
        self.start_char = start_char
        self.end_char = end_char
        self.byte_start: Optional[int] = None
        self.byte_end: Optional[int] = None

    def set_byte_positions(self, char_to_byte_map: List[int]) -> None:
        """Convert character positions to byte positions using the mapping."""
        if char_to_byte_map and len(char_to_byte_map) > self.start_char:
            self.byte_start = char_to_byte_map[self.start_char]
        if char_to_byte_map and len(char_to_byte_map) > self.end_char:
            self.byte_end = char_to_byte_map[self.end_char]


class SnippetExtractor:
    # Cache of already-converted wildcard patterns (pattern -> regex string)
    _regex_cache: dict[str, str] = {}

    @staticmethod
    def wildcards_to_regex(pattern: str) -> str:
        """
        Convert wildcard pattern to regex with caching.
        - '?' → matches exactly one character of any type
        - '*' → matches zero or more non-whitespace chars
        """
        try:
            # Use cache for better performance
            if pattern in SnippetExtractor._regex_cache:
                return SnippetExtractor._regex_cache[pattern]
            logger.debug(f"Converting wildcard pattern to regex: {pattern}")
            regex_parts = []
            i = 0
            while i < len(pattern):
                ch = pattern[i]
                if ch == '?':
                    regex_parts.append('.')
                    i += 1
                elif ch == '*':
                    regex_parts.append(r'(?:\S*)')
                    i += 1
                else:
                    regex_parts.append(re.escape(ch))
                    i += 1
            result = "".join(regex_parts)
            SnippetExtractor._regex_cache[pattern] = result
            logger.debug(f"Converted pattern '{pattern}' to regex: {result}")
            return result
        except Exception as e:
            raise RuntimeError(f"Failed to convert wildcard pattern '{pattern}' to regex: {str(e)}")
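
    # Worked examples for the rules above (illustration only):
    #   wildcards_to_regex("b?t")   -> "b.t"          (matches "bat", "bit", ...)
    #   wildcards_to_regex("auto*") -> "auto(?:\\S*)" (matches "auto", "autos", ...)
    # Because '*' expands to \S*, it stops at whitespace, so "auto*" can never
    # span two words.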

    @staticmethod
    def expand_to_word_boundaries(text: str, start_char: int, end_char: int, pattern: str):
        """
        Expand match boundaries depending on '*' position.
        """
        try:
            # Exact match for '?'-only patterns
            if '?' in pattern and '*' not in pattern:
                return text[start_char:end_char], start_char, end_char
            expanded_start = start_char
            expanded_end = end_char
            logger.debug(f"Expanding boundaries for pattern '{pattern}'")
            if '*' in pattern:
                if pattern.startswith('*') and not pattern.endswith('*'):
                    # expand LEFT until whitespace - optimized backward search
                    while expanded_start > 0 and not text[expanded_start - 1].isspace():
                        expanded_start -= 1
                elif pattern.endswith('*') and not pattern.startswith('*'):
                    # expand RIGHT until whitespace - optimized forward search
                    while expanded_end < len(text) and not text[expanded_end].isspace():
                        expanded_end += 1
                else:
                    # '*' is inside (or on both ends) → expand one character on each side
                    if expanded_start > 0:
                        expanded_start -= 1
                    if expanded_end < len(text):
                        expanded_end += 1
            return text[expanded_start:expanded_end], expanded_start, expanded_end
        except Exception as e:
            raise RuntimeError(f"Failed to expand word boundaries for pattern '{pattern}': {str(e)}")

    @staticmethod
    def find_matches(patterns, decoded_text: str, char_to_byte_map):
        """
        Find all matches. These are also passed on to fuzzy match.
        """
        try:
            logger.debug("Finding wildcard matches")
            matches = []
            # Compile all patterns once (wildcard conversions are cached)
            compiled_patterns = {}
            for pattern in patterns:
                if not pattern:
                    continue
                if '*' in pattern or '?' in pattern:
                    regex_pattern = SnippetExtractor.wildcards_to_regex(pattern)
                    compiled_patterns[pattern] = re.compile(regex_pattern, re.IGNORECASE | re.DOTALL)
                else:
                    escaped_pattern = re.escape(pattern)
                    regex_pattern = r'\b' + escaped_pattern + r'\b'
                    compiled_patterns[pattern] = re.compile(regex_pattern, re.IGNORECASE)
            for pattern, compiled_pattern in compiled_patterns.items():
                try:
                    for match in compiled_pattern.finditer(decoded_text):
                        start_pos, end_pos = match.start(), match.end()
                        match_text = decoded_text[start_pos:end_pos]
                        if '*' in pattern or '?' in pattern:
                            expanded_match_text, expanded_start, expanded_end = SnippetExtractor.expand_to_word_boundaries(
                                decoded_text, start_pos, end_pos, pattern
                            )
                            match_text = expanded_match_text
                            start_pos = expanded_start
                            end_pos = expanded_end
                        match_obj = Match(pattern, match_text, start_pos, end_pos)
                        match_obj.set_byte_positions(char_to_byte_map)
                        matches.append(match_obj)
                except re.error as e:
                    raise RuntimeError(f"Regex error for pattern '{pattern}': {str(e)}")
            logger.debug(f"Found {len(matches)} wildcard matches")
            return matches
        except Exception as e:
            raise RuntimeError(f"Failed to find matches: {str(e)}")

    @staticmethod
    def filter_by_distance(matches, distance: int, buzzwords):
        """
        Filter matches by the distance limit given by user input.
        """
        try:
            logger.debug("Filtering matches by distance")
            if not matches:
                return []
            # Use sets for faster membership checks and avoid redundant lookups
            pattern_positions: dict[str, set[tuple[int, int]]] = {word: set() for word in buzzwords}
            for m in matches:
                if m.pattern in pattern_positions:
                    pattern_positions[m.pattern].add((m.start_char, m.end_char))
            if any(not pos_set for pos_set in pattern_positions.values()):
                return []
            combined_spans = []
            first_word = list(buzzwords)[0]
            for start1, end1 in pattern_positions[first_word]:
                span_candidates = [(start1, end1)]
                for other_word in buzzwords:
                    if other_word == first_word:
                        continue
                    best_match = None
                    min_distance = float('inf')
                    # Direct set iteration - much faster than list lookup
                    for start2, end2 in pattern_positions[other_word]:
                        dist = abs(start1 - start2)
                        if dist <= distance and dist < min_distance:
                            min_distance = dist
                            best_match = (start2, end2)
                    if best_match:
                        span_candidates.append(best_match)
                if len(span_candidates) == len(buzzwords):
                    min_pos = min(s for s, _ in span_candidates)
                    max_pos = max(e for _, e in span_candidates)
                    combined_spans.append((min_pos, max_pos))
            logger.debug(f"After distance filtering: {len(combined_spans)} matches")
            return combined_spans
        except Exception as e:
            raise RuntimeError(f"Failed to filter by distance: {str(e)}")
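
    # Worked example (illustration only): with buzzwords {"alpha", "beta"},
    # distance=100, and matches at alpha[10..15], beta[60..64], beta[500..504],
    # the pairing keeps beta[60..64] (|10 - 60| <= 100), ignores beta[500..504],
    # and emits one combined span (10, 64) covering both words.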

    @staticmethod
    def extract_snippets(matches, snippet_size, pre_ratio, post_ratio, decoded_text):
        try:
            logger.debug("Extracting snippets for wildcard matches")
            snippets = []
            for start, end in matches:
                pre_chars = int(snippet_size * pre_ratio)
                post_chars = int(snippet_size * post_ratio)
                snippet_start = max(0, start - pre_chars)
                snippet_end = min(len(decoded_text), end + post_chars)
                snippets.append((snippet_start, snippet_end))
            return snippets
        except Exception as e:
            raise RuntimeError(f"Failed to extract snippets: {str(e)}")

    @staticmethod
    def merge_snippets(snippets):
        try:
            logger.debug("Merging wildcard snippets")
            if not snippets:
                return [], 0
            total_snippets = len(snippets)
            # Sort once instead of repeatedly during merging
            sorted_snippets = sorted(snippets, key=lambda x: x[0])
            merged = [sorted_snippets[0]]
            for current in sorted_snippets[1:]:
                last_end = merged[-1][1]
                if current[0] <= last_end:
                    # Fast merge - no need to check all previous ones
                    merged[-1] = (merged[-1][0], max(last_end, current[1]))
                else:
                    merged.append(current)
            logger.debug(f"Merged snippets: {len(merged)} from {total_snippets}")
            return merged, total_snippets
        except Exception as e:
            raise RuntimeError(f"Failed to merge snippets: {str(e)}")
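
    # Worked example (illustration only): snippet_size=2000, pre_ratio=0.3,
    # post_ratio=0.7 pads each match with 600 chars before and 1400 after,
    # so a match at (5000, 5010) becomes the snippet (4400, 6410). Overlapping
    # snippets such as (4400, 6410) and (6000, 8000) then merge to (4400, 8000).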

    # ----------
    # Fuzzy part
    # ----------
    @staticmethod
    def find_fuzzy_matches(decoded_text: str, wildcard_matches: List[Match], threshold: float, stop_event=None):
        """
        Search the entire text using matches from the wildcard search as
        fuzzily searched words.
        Returns a list of tuples (match_start, match_end, score, original_word)
        where score >= threshold.
        """
        try:
            logger.debug("Starting fuzzy matching")
            fuzzy_results: List[Tuple[int, int, float, str]] = []
            # Get all unique texts from wildcard matches to use as buzzwords
            buzzwords = [match.text for match in wildcard_matches if match.text.strip()]
            if not buzzwords:
                logger.debug("No buzzwords found for fuzzy matching")
                return fuzzy_results
            logger.debug(f"Using {len(buzzwords)} buzzwords for fuzzy matching")
            # Use rapidfuzz.process.extract for efficient fuzzy matching
            words = decoded_text.split()
            processed_words = []
            # Create a list of (word, start_pos, end_pos) tuples to track positions
            current_pos = 0
            for word in words:
                if stop_event and stop_event.is_set():
                    raise RuntimeError("Fuzzy search was aborted")
                # Find exact position of this word in original text
                try:
                    pos = decoded_text.index(word, current_pos)
                    processed_words.append((word, pos, pos + len(word)))
                    current_pos = pos + len(word)
                except ValueError:
                    # Word not found - skip it
                    continue
            # For each word in the document, check fuzzy matches against our buzzwords
            for word, start_pos, end_pos in processed_words:
                if stop_event and stop_event.is_set():
                    raise RuntimeError("Fuzzy search was aborted")
                # Find best match among buzzwords using rapidfuzz
                try:
                    # Get top match with score >= threshold
                    matches = process.extract(
                        word, buzzwords, limit=1, scorer=fuzz.ratio, score_cutoff=threshold
                    )
                    if matches and len(matches) > 0:
                        best_match_text, score, _ = matches[0]
                        # Record the position of this match in the original text
                        # together with the actual word that was matched
                        fuzzy_results.append((start_pos, end_pos, score, word))
                except Exception as e:
                    # Continue with other words if one fails
                    logger.warning(f"Fuzzy matching failed for word '{word}': {str(e)}")
                    continue
            logger.debug(f"Found {len(fuzzy_results)} fuzzy matches")
            return fuzzy_results
        except Exception as e:
            raise RuntimeError(f"Failed to find fuzzy matches: {str(e)}")
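
    # Worked example (illustration only): rapidfuzz's fuzz.ratio returns a
    # 0-100 similarity, e.g. fuzz.ratio("color", "colour") ≈ 90.9. With the
    # default threshold of 94 that pair is rejected; lowering the threshold
    # to 90 would accept it. score_cutoff lets process.extract discard
    # candidates below the threshold early instead of scoring them fully.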
""" try: logger.debug("Extracting snippets for fuzzy matches") snippets = [] for start, end, score, original_word in matches: # Apply ratio-based padding to include more context pre_chars = int(snippet_size * pre_ratio) post_chars = int(snippet_size * post_ratio) snippet_start = max(0, start - pre_chars) snippet_end = min(len(decoded_text), end + post_chars) snippets.append((snippet_start, snippet_end, score, original_word)) return snippets except Exception as e: raise RuntimeError(f"Failed to extract fuzzy snippets: {str(e)}") @staticmethod def merge_snippets_fuzzy(snippets): """ Merge overlapping or adjacent fuzzy snippets. """ try: logger.debug("Merging fuzzy snippets") if not snippets: return [], 0 total_snippets = len(snippets) # Sort by start position sorted_snippets = sorted(snippets, key=lambda x: x[0]) merged = [sorted_snippets[0]] for current in sorted_snippets[1:]: last_end = merged[-1][1] if current[0] <= last_end: # Merge overlapping or adjacent snippets new_start = merged[-1][0] new_end = max(last_end, current[1]) # Update the score to be average of both scores (or keep highest) avg_score = (merged[-1][2] + current[2]) / 2.0 merged[-1] = (new_start, new_end, avg_score, merged[-1][3]) # Keep original word from first else: merged.append(current) logger.debug(f"Merged fuzzy snippets: {len(merged)} from {total_snippets}") return merged, total_snippets except Exception as e: raise RuntimeError(f"Failed to merge fuzzy snippets: {str(e)}") # --- Main search function --- def run_search_for_file(file_path: str, config: dict, stop_event: threading.Event): """ Run search for a single file. Writes output_snippets.txt and output_fuzzy_snippets.txt. Returns (wildcard_text, fuzzy_text) strings for UI display. Optimized version with faster operations. 
""" try: processor = TextProcessor(file_path) if processor.text_bytes is None or processor.char_to_byte is None: raise RuntimeError("Failed to load file properly") buzzwords = [bw for bw in config.get("buzzwords", []) if bw.strip()] # Use set for filter_by_distance membership but keep list for order preservation buzzwords_set = list(dict.fromkeys(buzzwords)) # unique preserving order # wildcard-part - optimized all_matches = SnippetExtractor.find_matches( buzzwords_set, processor.decoded_text, processor.char_to_byte ) if config.get("search_type", "AND") == "AND": final_matches = SnippetExtractor.filter_by_distance( all_matches, config.get("distance_match", 100), buzzwords_set ) else: final_matches = [(m.start_char, m.end_char) for m in all_matches] snippets = SnippetExtractor.extract_snippets( final_matches, config.get("snippet_size", 2000), config.get("pre_ratio", 0.3), config.get("post_ratio", 0.7), processor.decoded_text ) merged_snippets, total_snippets = SnippetExtractor.merge_snippets(snippets) # Count wildcard metrics all_wildcard_matches = len(all_matches) merged_wildcard_snippets = len(merged_snippets) # Calculate characters in merged snippets for wildcard total_wildcard_chars = 0 for start, end in merged_snippets: if stop_event.is_set(): raise RuntimeError("Search was aborted") s_b = processor.char_to_byte[start] e_b = processor.char_to_byte[end] snippet_bytes = processor.text_bytes[s_b:e_b] snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape") cleaned = re.sub(r'\s+', ' ', snippet_text) # without \n and \r total_wildcard_chars += len(cleaned) # Calculate token for wildcard (characters / 4.2) wildcard_token = round(total_wildcard_chars / 4.2, 1) if total_wildcard_chars > 0 else 0 # Build wildcard textual output - optimized with pre-calculated values wildcard_blocks = [] for idx, (start, end) in enumerate(merged_snippets): if stop_event.is_set(): raise RuntimeError("Search was aborted") s_b = processor.char_to_byte[start] e_b = processor.char_to_byte[end] snippet_bytes = processor.text_bytes[s_b:e_b] snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape") cleaned = re.sub(r'\s+', ' ', snippet_text) # without \n and \r # Find first match match_text = None byte_start = None for m in all_matches: if start <= m.start_char and end >= m.end_char: match_text = m.text byte_start = m.byte_start break block = [ {"Excerpt": idx + 1}, {"Match Buzzword": match_text}, {"Position, match_text": byte_start}, {"Content": cleaned}, ] wildcard_blocks.append(json.dumps(block, ensure_ascii=False, indent=1)) wildcard_text = "\n\n".join(wildcard_blocks) # fuzzy part, similar approach like wildcard ft = config.get("fuzzy_threshold", 94) if not isinstance(ft, (int, float)) or not (0 <= ft <= 100): ft = 94.0 # default threshold # Use all wildcard matches as input for fuzzy search fuzzy_matches = SnippetExtractor.find_fuzzy_matches( processor.decoded_text, all_matches, ft ) if config.get("search_type", "AND") == "AND": if len(buzzwords) > 1: filtered_fuzzy_matches = SnippetExtractor.filter_by_distance_fuzzy( fuzzy_matches, config.get("distance_match", 100) ) else: # fallback to OR behavior when only one buzzword filtered_fuzzy_matches = fuzzy_matches else: filtered_fuzzy_matches = fuzzy_matches # Extract snippets for fuzzy matches fuzzy_snippets = SnippetExtractor.extract_snippets_fuzzy( filtered_fuzzy_matches, config.get("snippet_size", 2000), config.get("pre_ratio", 0.3), config.get("post_ratio", 0.7), processor.decoded_text ) # Merge fuzzy snippets merged_fuzzy_snippets, 

# --- Main search function ---
def run_search_for_file(file_path: str, config: dict, stop_event: threading.Event):
    """
    Run the search for a single file.
    Returns (wildcard_text, fuzzy_text, *metrics) for UI display; the GUI
    writes the aggregated results to output_snippets.txt and
    output_fuzzy_snippets.txt.
    """
    try:
        processor = TextProcessor(file_path)
        if processor.text_bytes is None or processor.char_to_byte is None:
            raise RuntimeError("Failed to load file properly")
        buzzwords = [bw for bw in config.get("buzzwords", []) if bw.strip()]
        # De-duplicate buzzwords while preserving their order
        buzzwords_set = list(dict.fromkeys(buzzwords))

        # wildcard part - optimized
        all_matches = SnippetExtractor.find_matches(
            buzzwords_set, processor.decoded_text, processor.char_to_byte
        )
        if config.get("search_type", "AND") == "AND":
            final_matches = SnippetExtractor.filter_by_distance(
                all_matches, config.get("distance_match", 100), buzzwords_set
            )
        else:
            final_matches = [(m.start_char, m.end_char) for m in all_matches]
        snippets = SnippetExtractor.extract_snippets(
            final_matches,
            config.get("snippet_size", 2000),
            config.get("pre_ratio", 0.3),
            config.get("post_ratio", 0.7),
            processor.decoded_text
        )
        merged_snippets, total_snippets = SnippetExtractor.merge_snippets(snippets)

        # Count wildcard metrics
        all_wildcard_matches = len(all_matches)
        merged_wildcard_snippets = len(merged_snippets)

        # Calculate characters in merged snippets for wildcard
        total_wildcard_chars = 0
        for start, end in merged_snippets:
            if stop_event.is_set():
                raise RuntimeError("Search was aborted")
            s_b = processor.char_to_byte[start]
            e_b = processor.char_to_byte[end]
            snippet_bytes = processor.text_bytes[s_b:e_b]
            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
            cleaned = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
            total_wildcard_chars += len(cleaned)

        # Calculate token estimate for wildcard (characters / 4.2)
        wildcard_token = round(total_wildcard_chars / 4.2, 1) if total_wildcard_chars > 0 else 0

        # Build wildcard textual output - optimized with pre-calculated values
        wildcard_blocks = []
        for idx, (start, end) in enumerate(merged_snippets):
            if stop_event.is_set():
                raise RuntimeError("Search was aborted")
            s_b = processor.char_to_byte[start]
            e_b = processor.char_to_byte[end]
            snippet_bytes = processor.text_bytes[s_b:e_b]
            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
            cleaned = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
            # Find first match contained in this snippet
            match_text = None
            byte_start = None
            for m in all_matches:
                if start <= m.start_char and end >= m.end_char:
                    match_text = m.text
                    byte_start = m.byte_start
                    break
            block = [
                {"Excerpt": idx + 1},
                {"Match Buzzword": match_text},
                {"Position, match_text": byte_start},
                {"Content": cleaned},
            ]
            wildcard_blocks.append(json.dumps(block, ensure_ascii=False, indent=1))
        wildcard_text = "\n\n".join(wildcard_blocks)

        # fuzzy part, similar approach to the wildcard part
        ft = config.get("fuzzy_threshold", 94)
        if not isinstance(ft, (int, float)) or not (0 <= ft <= 100):
            ft = 94.0  # default threshold
        # Use all wildcard matches as input for fuzzy search; pass stop_event
        # so the fuzzy phase can also be aborted
        fuzzy_matches = SnippetExtractor.find_fuzzy_matches(
            processor.decoded_text, all_matches, ft, stop_event
        )
        if config.get("search_type", "AND") == "AND":
            if len(buzzwords) > 1:
                filtered_fuzzy_matches = SnippetExtractor.filter_by_distance_fuzzy(
                    fuzzy_matches, config.get("distance_match", 100)
                )
            else:
                # fallback to OR behavior when only one buzzword
                filtered_fuzzy_matches = fuzzy_matches
        else:
            filtered_fuzzy_matches = fuzzy_matches

        # Extract snippets for fuzzy matches
        fuzzy_snippets = SnippetExtractor.extract_snippets_fuzzy(
            filtered_fuzzy_matches,
            config.get("snippet_size", 2000),
            config.get("pre_ratio", 0.3),
            config.get("post_ratio", 0.7),
            processor.decoded_text
        )

        # Merge fuzzy snippets
        merged_fuzzy_snippets, total_fuzzy_snippets = SnippetExtractor.merge_snippets_fuzzy(fuzzy_snippets)

        # Count fuzzy metrics
        all_fuzzy_matches = len(fuzzy_matches)
        merged_fuzzy_snippets_count = len(merged_fuzzy_snippets)

        # Calculate characters in merged snippets for fuzzy
        total_fuzzy_chars = 0
        for start, end, score, original_word in merged_fuzzy_snippets:
            if stop_event.is_set():
                raise RuntimeError("Search was aborted")
            s_b = processor.char_to_byte[start]
            e_b = processor.char_to_byte[end]
            snippet_bytes = processor.text_bytes[s_b:e_b]
            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
            cleaned_snippet = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
            total_fuzzy_chars += len(cleaned_snippet)

        # Calculate token estimate for fuzzy (characters / 4.2)
        fuzzy_token = round(total_fuzzy_chars / 4.2, 1) if total_fuzzy_chars > 0 else 0

        # Build fuzzy textual output - now with actual matched text and byte positions
        fuzzy_blocks = []
        for idx, (start, end, score, original_word) in enumerate(merged_fuzzy_snippets):
            if stop_event.is_set():
                raise RuntimeError("Search was aborted")
            s_b = processor.char_to_byte[start]
            e_b = processor.char_to_byte[end]
            snippet_bytes = processor.text_bytes[s_b:e_b]
            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
            cleaned_snippet = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
            # Get the actual byte start position of the matched word in the original file
            match_byte_start = None
            for fm in fuzzy_matches:  # Use original fuzzy_matches, not filtered_fuzzy_matches
                if fm[3] == original_word and fm[0] >= start and fm[1] <= end:
                    # Found the exact fuzzy match that corresponds to this merged snippet
                    match_byte_start = processor.char_to_byte[fm[0]]
                    break
            block = [
                {"Excerpt": idx + 1},
                {"Match Buzzword": original_word},  # Show the actual word that was matched
                {"Score": score},
                {"Position": match_byte_start},  # Add byte position to JSON output
                {"Content": cleaned_snippet},
            ]
            fuzzy_blocks.append(json.dumps(block, ensure_ascii=False, indent=1))
        fuzzy_text = "\n\n".join(fuzzy_blocks)

        logger.debug("Search completed successfully")
        return (wildcard_text, fuzzy_text, all_wildcard_matches, merged_wildcard_snippets,
                total_wildcard_chars, wildcard_token, all_fuzzy_matches,
                merged_fuzzy_snippets_count, total_fuzzy_chars, fuzzy_token)
    except Exception as e:
        logger.error(f"Search failed for file {file_path}: {str(e)}")
        raise RuntimeError(f"Search failed for file {file_path}: {str(e)}")

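# A minimal sketch of the token estimate used above (illustration only,
# never called by the app): the heuristic assumes ~4.2 characters per token.
def _demo_token_estimate(cleaned_chars: int = 8400) -> float:
    # 8400 characters / 4.2 == 2000.0 estimated tokens
    return round(cleaned_chars / 4.2, 1) if cleaned_chars > 0 else 0
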

# ---
# GUI
# ---
class SearchThread(threading.Thread):
    def __init__(self, paths, config, stop_event, on_complete):
        super().__init__()
        self.paths = paths
        self.config = config
        self.stop_event = stop_event
        self.on_complete = on_complete  # callback(wildcard_text, fuzzy_text, finished_ok)

    def run(self):
        try:
            agg_wild = []
            agg_fuzzy = []
            total_all_wildcard_matches = 0
            total_merged_wildcard_snippets = 0
            total_wildcard_chars = 0
            total_wildcard_tokens = 0
            total_all_fuzzy_matches = 0
            total_merged_fuzzy_snippets = 0
            total_fuzzy_chars = 0
            total_fuzzy_tokens = 0
            for p in self.paths:
                if self.stop_event.is_set():
                    self.on_complete("", "", False)
                    return
                try:
                    (w, f, all_wildcard_matches, merged_wildcard_snippets, wildcard_chars,
                     wildcard_token, all_fuzzy_matches, merged_fuzzy_snippets, fuzzy_chars,
                     fuzzy_token) = run_search_for_file(p, self.config, self.stop_event)
                    # Accumulate totals
                    total_all_wildcard_matches += all_wildcard_matches
                    total_merged_wildcard_snippets += merged_wildcard_snippets
                    total_wildcard_chars += wildcard_chars
                    total_wildcard_tokens += wildcard_token
                    total_all_fuzzy_matches += all_fuzzy_matches
                    total_merged_fuzzy_snippets += merged_fuzzy_snippets
                    total_fuzzy_chars += fuzzy_chars
                    total_fuzzy_tokens += fuzzy_token
                    # Include file name in the aggregated results
                    agg_wild.append(f"--- {os.path.basename(p)} ---\n{w}")
                    agg_fuzzy.append(f"--- {os.path.basename(p)} ---\n{f}")
                except Exception as e:
                    # If one file fails, report the error and stop
                    if not self.stop_event.is_set():  # Only show error if not aborted
                        self.on_complete(f"ERROR processing {p}: {str(e)}",
                                         f"ERROR processing {p}: {str(e)}", False)
                    return
            wildcard_text = "\n\n".join(agg_wild)
            fuzzy_text = "\n\n".join(agg_fuzzy)
            # Create summary strings
            wildcard_summary = (
                f"Wildcard Results (output_snippets.txt): All matches: {total_all_wildcard_matches}, "
                f"Merged snippets: {total_merged_wildcard_snippets}, "
                f"Characters in merged: {total_wildcard_chars}, Token: {total_wildcard_tokens}"
            )
            fuzzy_summary = (
                f"Fuzzy Results (output_fuzzy_snippets.txt): All matches: {total_all_fuzzy_matches}, "
                f"Merged snippets: {total_merged_fuzzy_snippets}, "
                f"Characters in merged: {total_fuzzy_chars}, Token: {total_fuzzy_tokens}"
            )
            # Prepend summaries to results
            wildcard_text = wildcard_summary + "\n\n" + wildcard_text if wildcard_text else wildcard_summary
            fuzzy_text = fuzzy_summary + "\n\n" + fuzzy_text if fuzzy_text else fuzzy_summary
            self.on_complete(wildcard_text, fuzzy_text, True)
        except Exception as e:
            # Handle exceptions in the thread itself
            self.on_complete(f"THREAD ERROR: {str(e)}", f"THREAD ERROR: {str(e)}", False)

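# Threading note (illustration only): SearchThread invokes on_complete from
# the worker thread. wxPython widgets must only be touched on the GUI thread,
# which is why MainFrame.on_search_complete wraps its widget updates like so:
#     wx.CallAfter(_update)  # queues _update onto the main event loop
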
"300"), ("fuzzy_threshold", "94")] for label, val in defaults: row = wx.BoxSizer(wx.HORIZONTAL) lbl = wx.StaticText(panel, label=label + ":") # Update this line to include the new labels if label == "snippet_size": lbl.SetLabel("Snippet Size (chars):") elif label == "pre_ratio": lbl.SetLabel("Pre Ratio (%):") elif label == "post_ratio": lbl.SetLabel("Post Ratio (%):") elif label == "distance_match": lbl.SetLabel("Distance Match (chars):") elif label == "fuzzy_threshold": lbl.SetLabel("Fuzzy Threshold (%):") fld = wx.TextCtrl(panel, value=val, size=(50, -1)) # Bind focus event for validation fld.Bind(wx.EVT_KILL_FOCUS, self.on_field_focus_lost) row.Add(lbl, 0, wx.ALL | wx.ALIGN_LEFT, 2) row.Add(fld, 0, wx.ALL | wx.ALIGN_LEFT, 2) ctrl_sizer.Add(row, 0, wx.ALL | wx.ALIGN_LEFT, 2) self.cfg_fields[label] = fld middle_sizer.Add(ctrl_sizer, 0, wx.ALL | wx.ALIGN_LEFT, 6) # --- Right image panel 700x261 pixel bmp = wx.Bitmap(resource_path("example.bmp"), wx.BITMAP_TYPE_BMP) #bmp = wx.Bitmap("example.bmp", wx.BITMAP_TYPE_BMP) # compute exact size img_w = bmp.GetWidth() img_h = bmp.GetHeight() border = 1 border_panel = wx.Panel(panel, size=(img_w + 2*border, img_h + 2*border)) border_panel.SetBackgroundColour(wx.WHITE) self.img_ctrl = wx.StaticBitmap(border_panel, bitmap=bmp) s = wx.BoxSizer(wx.VERTICAL) s.Add(self.img_ctrl, 0, wx.ALL | wx.ALIGN_CENTER, border) border_panel.SetSizer(s) middle_sizer.Add(border_panel, 0, wx.ALL | wx.ALIGN_LEFT, 6) # Bottom: results (wildcard and fuzzy) across full width result_sizer = wx.BoxSizer(wx.VERTICAL) # Add the summary labels BEFORE the text controls self.wildcard_summary_label = wx.StaticText(panel, label="Wildcard Results:") result_sizer.Add(self.wildcard_summary_label, 0, wx.ALL | wx.ALIGN_LEFT, 2) self.wildcard_box = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY, size=(-1, 220)) result_sizer.Add(self.wildcard_box, 1, wx.EXPAND | wx.ALL, 4) self.fuzzy_summary_label = wx.StaticText(panel, label="Fuzzy Results:") result_sizer.Add(self.fuzzy_summary_label, 0, wx.ALL | wx.ALIGN_LEFT, 2) self.fuzzy_box = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY, size=(-1, 220)) result_sizer.Add(self.fuzzy_box, 1, wx.EXPAND | wx.ALL, 4) # Main vertical layout using only horizontal alignment flags where appropriate main_sizer = wx.BoxSizer(wx.VERTICAL) main_sizer.Add(top_sizer, 0, wx.ALL | wx.ALIGN_LEFT, 6) main_sizer.Add(middle_sizer, 0, wx.ALL | wx.ALIGN_LEFT, 6) main_sizer.Add(result_sizer, 1, wx.EXPAND | wx.ALL, 6) panel.SetSizer(main_sizer) # Events self.file_picker.Bind(wx.EVT_FILEPICKER_CHANGED, self.on_path_change) self.dir_picker.Bind(wx.EVT_DIRPICKER_CHANGED, self.on_path_change) # Thread controls self.worker = None self.stop_event = threading.Event() def on_field_focus_lost(self, evt): """Validate all fields when any field loses focus""" self.validate_all_fields() evt.Skip() # Allow normal processing to continue def validate_all_fields(self): """Validate all configuration fields and enforce dependencies""" try: # Get current values snippet_size_val = self.cfg_fields["snippet_size"].GetValue().strip() pre_ratio_val = self.cfg_fields["pre_ratio"].GetValue().strip() post_ratio_val = self.cfg_fields["post_ratio"].GetValue().strip() distance_match_val = self.cfg_fields["distance_match"].GetValue().strip() fuzzy_threshold_val = self.cfg_fields["fuzzy_threshold"].GetValue().strip() # Default values if empty snippet_size_val = snippet_size_val if snippet_size_val else "2000" pre_ratio_val = pre_ratio_val if pre_ratio_val else "0.3" post_ratio_val = 

    def on_path_change(self, evt):
        path = evt.GetPath()
        self.path_label.SetLabel(path)

    def on_toggle(self, evt):
        btn = evt.GetEventObject()
        label = btn.GetLabel()
        if label == "AND":
            btn.SetLabel("OR")
        else:
            btn.SetLabel("AND")

    def on_abort(self, evt):
        """Abort button stops all running operations"""
        if self.worker and self.worker.is_alive():
            # Set the stop event to signal all running operations to abort
            self.stop_event.set()
            # Disable buttons immediately
            self.abort_button.Disable()
            self.start_button.Enable()
            # Clear any text that might have been set during processing
            wx.CallAfter(self.wildcard_box.SetValue, "Aborting...")
            wx.CallAfter(self.fuzzy_box.SetValue, "Aborting...")

    def on_start(self, evt):
        # get path
        path = self.path_label.GetLabel()
        if not path or path == "No file/folder selected":
            wx.MessageBox("Please select a file or folder first.", "Error")
            return
        try:
            if os.path.isdir(path):
                txts = [str(Path(path) / f) for f in sorted(os.listdir(path))
                        if f.lower().endswith(".txt") and os.path.isfile(os.path.join(path, f))]
                if not txts:
                    wx.MessageBox("Selected folder contains no .txt files.", "Error")
                    return
                paths = txts
            else:
                if not os.path.isfile(path):
                    wx.MessageBox("Selected path is not a file.", "Error")
                    return
                # Only allow .txt files
                if not path.lower().endswith(".txt"):
                    wx.MessageBox("Please select a .txt file.", "Error")
                    return
                paths = [path]
        except Exception as e:
            wx.MessageBox(f"Failed to access path: {str(e)}", "Error")
            return

        # prepare config
        try:
            cfg = {
                "snippet_size": int(self.cfg_fields["snippet_size"].GetValue().strip()),
                "pre_ratio": float(self.cfg_fields["pre_ratio"].GetValue().strip()),
                "post_ratio": float(self.cfg_fields["post_ratio"].GetValue().strip()),
                "distance_match": int(self.cfg_fields["distance_match"].GetValue().strip()),
                "fuzzy_threshold": float(self.cfg_fields["fuzzy_threshold"].GetValue().strip()),
            }
        except Exception:
            wx.MessageBox("Please check numeric configuration values.", "Error")
            return
        buzzwords_list = [t.GetValue().strip() for t in self.buzz_inputs]
        search_type_value = "AND" if self.toggle_buttons[0].GetLabel() == "AND" else "OR"
        cfg["buzzwords"] = buzzwords_list
        cfg["search_type"] = search_type_value

        # UI state
        self.start_button.Disable()
        self.abort_button.Enable()
        self.wildcard_box.SetValue("Running...")
        self.fuzzy_box.SetValue("Running...")

        # reset stop_event and start thread
        self.stop_event.clear()
        # Overwrite output files at the beginning of each new search
        Path("output_snippets.txt").write_text("", encoding="utf-8", errors="surrogateescape")
        Path("output_fuzzy_snippets.txt").write_text("", encoding="utf-8", errors="surrogateescape")
        self.worker = SearchThread(paths, cfg, self.stop_event, self.on_search_complete)
        self.worker.start()
"Error") return # prepare config try: cfg = { "snippet_size": int(self.cfg_fields["snippet_size"].GetValue().strip()), "pre_ratio": float(self.cfg_fields["pre_ratio"].GetValue().strip()), "post_ratio": float(self.cfg_fields["post_ratio"].GetValue().strip()), "distance_match": int(self.cfg_fields["distance_match"].GetValue().strip()), "fuzzy_threshold": float(self.cfg_fields["fuzzy_threshold"].GetValue().strip()), } except Exception: wx.MessageBox("Please check numeric configuration values.", "Error") return buzzwords_list = [t.GetValue().strip() for t in self.buzz_inputs] search_type_value = "AND" if self.toggle_buttons[0].GetLabel() == "AND" else "OR" cfg["buzzwords"] = buzzwords_list cfg["search_type"] = search_type_value # UI state self.start_button.Disable() self.abort_button.Enable() self.wildcard_box.SetValue("Running...") self.fuzzy_box.SetValue("Running...") # reset stop_event and start thread self.stop_event.clear() # Overwrite output files at the beginning of each new search Path("output_snippets.txt").write_text("", encoding="utf-8", errors="surrogateescape") Path("output_fuzzy_snippets.txt").write_text("", encoding="utf-8", errors="surrogateescape") self.worker = SearchThread(paths, cfg, self.stop_event, self.on_search_complete) self.worker.start() def on_search_complete(self, wildcard_text, fuzzy_text, finished_ok): # This callback runs in worker thread; must marshal to main GUI thread def _update(): if finished_ok: self.wildcard_box.SetValue(wildcard_text) self.fuzzy_box.SetValue(fuzzy_text) # Append results to output files for each processed file Path("output_snippets.txt").write_text(wildcard_text, encoding="utf-8", errors="surrogateescape") Path("output_fuzzy_snippets.txt").write_text(fuzzy_text, encoding="utf-8", errors="surrogateescape") else: # signals either error or aborted self.wildcard_box.SetValue(wildcard_text or "Aborted / Error") self.fuzzy_box.SetValue(fuzzy_text or "Aborted / Error") self.stop_event.clear() self.start_button.Enable() self.abort_button.Disable() wx.CallAfter(_update) def on_browse_file(self, evt): with wx.FileDialog(self, "Open Text File", wildcard="Text files (*.txt)|*.txt", style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST) as fileDialog: if fileDialog.ShowModal() == wx.ID_CANCEL: return path = fileDialog.GetPath() self.path_label.SetLabel(path) def on_browse_dir(self, evt): with wx.DirDialog(self, "Choose a directory", style=wx.DD_DEFAULT_STYLE | wx.DD_DIR_MUST_EXIST) as dirDialog: if dirDialog.ShowModal() == wx.ID_CANCEL: return path = dirDialog.GetPath() self.path_label.SetLabel(path) if __name__ == "__main__": app = wx.App(False) frame = MainFrame() frame.Show() app.MainLoop()