kalle07 committed on
Commit 2f65985 · verified · 1 Parent(s): 302a9bf

Upload 4 files

.gitattributes CHANGED
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 snippet_extracor_v03alpha.exe filter=lfs diff=lfs merge=lfs -text
 snippet_extractor_v03alpha.exe filter=lfs diff=lfs merge=lfs -text
 snippet_extractor_v04alpha.exe filter=lfs diff=lfs merge=lfs -text
+example.bmp filter=lfs diff=lfs merge=lfs -text
+snippet_extractor_v08alpha.exe filter=lfs diff=lfs merge=lfs -text
build_v08alpha.py ADDED
@@ -0,0 +1,46 @@
+import sys
+import subprocess
+import os
+from pathlib import Path
+
+# Define the entry point (your main script)
+entry_point = "snippet_extractor_v08alpha.py"
+
+# Path to your image
+image_file = "example.bmp"
+
+# Build command with PyInstaller arguments
+cmd = [
+    sys.executable,
+    "-m", "PyInstaller",
+    "--onefile",
+    "--noconfirm",
+    "--clean",
+    "--noconsole",
+    "--hidden-import", "wx",
+    "--hidden-import", "rapidfuzz",
+    "--hidden-import", "typing_extensions",
+    # Include the image
+    "--add-data", f"{image_file}{os.pathsep}.",
+]
+
+# Add the entry point
+cmd.append(entry_point)
+
+# Execute the build command
+try:
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    print("Compilation completed successfully.")
+
+    # Show any warnings or info from PyInstaller
+    if result.stderr:
+        for line in result.stderr.split('\n'):
+            if line.strip() and not line.startswith('['):
+                print(f"PyInstaller: {line}")
+
+except subprocess.CalledProcessError as e:
+    print(f"Error during compilation: {e}")
+    print("Stderr:", e.stderr)
+
+except FileNotFoundError:
+    print("PyInstaller not found. Please install it with 'pip install pyinstaller'")
example.bmp ADDED

Git LFS Details

  • SHA256: 561bbf6f16df01962720679bb49ed737dc00a04116edfce7b09ae3b216ddc1a1
  • Pointer size: 131 Bytes
  • Size of remote file: 480 kB
snippet_extractor_v08alpha.exe ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3dfc388d7ef8ac814f6ed802e0701da33fe69544746780fb34e758e3bcc5eb8
+size 29889659
snippet_extractor_v08alpha.py ADDED
@@ -0,0 +1,1143 @@
+import wx
+from wx import Image, Bitmap, StaticBitmap
+import os
+import sys
+import re
+import json
+import threading
+from pathlib import Path
+from rapidfuzz import fuzz, process
+from typing import List, Tuple, Optional, Match, Dict, Set
+import logging
+
+
+# --- Configure logging ---
+# Create a custom logger
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)  # Logger records DEBUG; the handler below filters output to INFO
+
+# Create handler and set level
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+
+# Create formatter
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+
+# Add handler to logger
+logger.addHandler(handler)
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and PyInstaller"""
+    try:
+        # PyInstaller creates a temp folder and stores path in _MEIPASS
+        base_path = sys._MEIPASS
+    except AttributeError:
+        base_path = os.path.abspath(".")
+
+    return os.path.join(base_path, relative_path)
+
+
+# --- Engine ---
+
+class TextProcessor:
+    def __init__(self, file_path: str):
+        self.file_path = Path(file_path)
+        self.text_bytes: Optional[bytes] = None  # Explicit type annotation
+        self.decoded_text: Optional[str] = None
+        self.char_to_byte: Optional[List[int]] = None
+        self.load_and_process_file()
+
+    def load_and_process_file(self):
+        try:
+            logger.debug(f"Loading file: {self.file_path}")
+            self.text_bytes = self.file_path.read_bytes()
+            self.decoded_text = self.text_bytes.decode("utf-8", errors="surrogateescape")
+            self._build_char_to_byte_mapping()
+        except Exception as e:
+            raise RuntimeError(f"Failed to read file {self.file_path}: {str(e)}")
+
+    def _build_char_to_byte_mapping(self):
+        if self.decoded_text is None:
+            return  # Safety check
+
+        logger.debug("Building character-to-byte mapping")
+        self.char_to_byte = [0]
+        for ch in self.decoded_text:
+            self.char_to_byte.append(self.char_to_byte[-1] + len(ch.encode("utf-8", errors="surrogateescape")))
+
+
+class Match:
+    def __init__(self, pattern: str, text: str, start_char: int, end_char: int):
+        self.pattern = pattern
+        self.text = text
+        self.start_char = start_char
+        self.end_char = end_char
+        self.byte_start: Optional[int] = None
+        self.byte_end: Optional[int] = None
+
+    def set_byte_positions(self, char_to_byte_map: List[int]) -> None:
+        """Convert character positions to byte positions using the mapping."""
+        if char_to_byte_map and len(char_to_byte_map) > self.start_char:
+            self.byte_start = char_to_byte_map[self.start_char]
+        if char_to_byte_map and len(char_to_byte_map) > self.end_char:
+            self.byte_end = char_to_byte_map[self.end_char]
+
+
+
+class SnippetExtractor:
+    # Pre-compiled regex patterns for performance
+    _regex_cache: dict[str, str] = {}
+
+    @staticmethod
+    def wildcards_to_regex(pattern: str) -> str:
+        """
+        Convert wildcard pattern to regex with caching.
+        - '?' → matches exactly one character of any type
+        - '*' → matches zero or more non-whitespace chars
+        """
+        try:
+            # Use cache for better performance
+            if pattern in SnippetExtractor._regex_cache:
+                return SnippetExtractor._regex_cache[pattern]
+
+            logger.debug(f"Converting wildcard pattern to regex: {pattern}")
+            regex_parts = []
+            i = 0
+            while i < len(pattern):
+                ch = pattern[i]
+                if ch == '?':
+                    regex_parts.append('.')
+                    i += 1
+                elif ch == '*':
+                    regex_parts.append(r'(?:\S*)')
+                    i += 1
+                else:
+                    regex_parts.append(re.escape(ch))
+                    i += 1
+
+            result = "".join(regex_parts)
+            SnippetExtractor._regex_cache[pattern] = result
+            logger.debug(f"Converted pattern '{pattern}' to regex: {result}")
+            return result
+        except Exception as e:
+            raise RuntimeError(f"Failed to convert wildcard pattern '{pattern}' to regex: {str(e)}")
+
+
+    @staticmethod
+    def expand_to_word_boundaries(text: str, start_char: int, end_char: int, pattern: str):
+        """
+        Expand match boundaries depending on '*' position.
+        """
+        try:
+            # Exact match for '?' only patterns
+            if '?' in pattern and '*' not in pattern:
+                return text[start_char:end_char], start_char, end_char
+
+            expanded_start = start_char
+            expanded_end = end_char
+
+            logger.debug(f"Expanding boundaries for pattern '{pattern}'")
+
+            if '*' in pattern:
+                if pattern.startswith('*') and not pattern.endswith('*'):
+                    # expand LEFT until whitespace - optimized with backward search
+                    while expanded_start > 0 and not text[expanded_start - 1].isspace():
+                        expanded_start -= 1
+
+                elif pattern.endswith('*') and not pattern.startswith('*'):
+                    # expand RIGHT until whitespace - optimized forward search
+                    while expanded_end < len(text) and not text[expanded_end].isspace():
+                        expanded_end += 1
+
+                else:
+                    # '*' is inside → expand both sides until visible character
+                    if expanded_start > 0:
+                        expanded_start -= 1
+                    if expanded_end < len(text):
+                        expanded_end += 1
+
+            return text[expanded_start:expanded_end], expanded_start, expanded_end
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to expand word boundaries for pattern '{pattern}': {str(e)}")
+
+
+    @staticmethod
+    def find_matches(patterns, decoded_text: str, char_to_byte_map):
+        """
+        Find all matches. These are also passed on to fuzzy match.
+        """
+        try:
+            logger.debug("Finding wildcard matches")
+            matches = []
+
+            # Pre-compile all patterns once - cached version
+            compiled_patterns = {}
+            for pattern in patterns:
+                if not pattern:
+                    continue
+                if '*' in pattern or '?' in pattern:
+                    regex_pattern = SnippetExtractor.wildcards_to_regex(pattern)
+                    compiled_patterns[pattern] = re.compile(regex_pattern, re.IGNORECASE | re.DOTALL)
+                else:
+                    escaped_pattern = re.escape(pattern)
+                    regex_pattern = r'\b' + escaped_pattern + r'\b'
+                    compiled_patterns[pattern] = re.compile(regex_pattern, re.IGNORECASE)
+
+            for pattern, compiled_pattern in compiled_patterns.items():
+                try:
+                    # Check stop event before each iteration
+                    for match in compiled_pattern.finditer(decoded_text):
+                        start_pos, end_pos = match.start(), match.end()
+                        match_text = decoded_text[start_pos:end_pos]
+
+                        if '*' in pattern or '?' in pattern:
+                            expanded_match_text, expanded_start, expanded_end = SnippetExtractor.expand_to_word_boundaries(
+                                decoded_text, start_pos, end_pos, pattern
+                            )
+                            match_text = expanded_match_text
+                            start_pos = expanded_start
+                            end_pos = expanded_end
+
+                        match_obj = Match(pattern, match_text, start_pos, end_pos)
+                        match_obj.set_byte_positions(char_to_byte_map)
+                        matches.append(match_obj)
+                except re.error as e:
+                    raise RuntimeError(f"Regex compilation error for pattern '{pattern}': {str(e)}")
+
+            logger.debug(f"Found {len(matches)} wildcard matches")
+            return matches
+        except Exception as e:
+            raise RuntimeError(f"Failed to find matches: {str(e)}")
+
+
+    @staticmethod
+    def filter_by_distance(matches, distance: int, buzzwords):
+        """
+        filter matches by distance limit given by user input.
+        """
+        try:
+            logger.debug("Filtering matches by distance")
+            if not matches:
+                return []
+
+            # Use sets for faster membership checks and avoid redundant lookups
+            pattern_positions: dict[str, set[tuple[int, int]]] = {word: set() for word in buzzwords}
+            for m in matches:
+                if m.pattern in pattern_positions:
+                    pattern_positions[m.pattern].add((m.start_char, m.end_char))
+
+            if any(not pos_set for pos_set in pattern_positions.values()):
+                return []
+
+            combined_spans = []
+            first_word = list(buzzwords)[0]
+
+            for start1, end1 in pattern_positions[first_word]:
+                span_candidates = [(start1, end1)]
+                for other_word in buzzwords:
+                    if other_word == first_word:
+                        continue
+                    best_match = None
+                    min_distance = float('inf')
+
+                    # Direct set iteration - much faster than list lookup
+                    for start2, end2 in pattern_positions[other_word]:
+                        dist = abs(start1 - start2)
+                        if dist <= distance and dist < min_distance:
+                            min_distance = dist
+                            best_match = (start2, end2)
+
+                    if best_match:
+                        span_candidates.append(best_match)
+
+                if len(span_candidates) == len(buzzwords):
+                    min_pos = min(s for s, _ in span_candidates)
+                    max_pos = max(e for _, e in span_candidates)
+                    combined_spans.append((min_pos, max_pos))
+
+            logger.debug(f"After distance filtering: {len(combined_spans)} matches")
+            return combined_spans
+        except Exception as e:
+            raise RuntimeError(f"Failed to filter by distance: {str(e)}")
+
+    @staticmethod
+    def extract_snippets(matches, snippet_size, pre_ratio, post_ratio, decoded_text):
+        try:
+            logger.debug("Extracting snippets for wildcard matches")
+            snippets = []
+            for start, end in matches:
+                pre_chars = int(snippet_size * pre_ratio)
+                post_chars = int(snippet_size * post_ratio)
+                snippet_start = max(0, start - pre_chars)
+                snippet_end = min(len(decoded_text), end + post_chars)
+                snippets.append((snippet_start, snippet_end))
+            return snippets
+        except Exception as e:
+            raise RuntimeError(f"Failed to extract snippets: {str(e)}")
+
+    @staticmethod
+    def merge_snippets(snippets):
+        try:
+            logger.debug("Merging wildcard snippets")
+            if not snippets:
+                return [], 0
+
+            total_snippets = len(snippets)
+
+            # Sort once instead of repeatedly during merging
+            sorted_snippets = sorted(snippets, key=lambda x: x[0])
+            merged = [sorted_snippets[0]]
+
+            for current in sorted_snippets[1:]:
+                last_end = merged[-1][1]
+                if current[0] <= last_end:
+                    # Fast merge - no need to check all previous ones
+                    merged[-1] = (merged[-1][0], max(last_end, current[1]))
+                else:
+                    merged.append(current)
+
+            logger.debug(f"Merged snippets: {len(merged)} from {total_snippets}")
+            return merged, total_snippets
+        except Exception as e:
+            raise RuntimeError(f"Failed to merge snippets: {str(e)}")
+
+
+    # ----------
+    # Fuzzy part
+    # ----------
+
+    @staticmethod
+    def find_fuzzy_matches(decoded_text: str, wildcard_matches: List[Match], threshold: float, stop_event=None):
+        """
+        Search the entire text using matches from wildcard search as fuzzily searched words.
+        Returns list of tuples (match_start, match_end, score, original_word) where score >= threshold.
+        """
+        try:
+            logger.debug("Starting fuzzy matching")
+            fuzzy_results: List[Tuple[int, int, float, str]] = []
+
+            # Get all unique texts from wildcard matches to use as buzzwords
+            buzzwords = [match.text for match in wildcard_matches if match.text.strip()]
+
+            if not buzzwords:
+                logger.debug("No buzzwords found for fuzzy matching")
+                return fuzzy_results
+
+            logger.debug(f"Using {len(buzzwords)} buzzwords for fuzzy matching")
+
+            # Use rapidfuzz.process.extract for efficient fuzzy matching
+            words = decoded_text.split()
+            processed_words = []
+
+            # Create a list of (word, start_pos, end_pos) tuples to track positions
+            current_pos = 0
+            for word in words:
+                if stop_event and stop_event.is_set():
+                    raise RuntimeError("Fuzzy search was aborted")
+
+                # Find exact position of this word in original text
+                try:
+                    pos = decoded_text.index(word, current_pos)
+                    processed_words.append((word, pos, pos + len(word)))
+                    current_pos = pos + len(word)
+                except ValueError:
+                    # Word not found - skip it
+                    continue
+
+            # For each word in the document, check fuzzy matches against our buzzwords
+            for word, start_pos, end_pos in processed_words:
+                if stop_event and stop_event.is_set():
+                    raise RuntimeError("Fuzzy search was aborted")
+
+                # Find best match among buzzwords using rapidfuzz
+                try:
+                    # Get top match with score >= threshold
+                    matches = process.extract(
+                        word,
+                        buzzwords,
+                        limit=1,
+                        scorer=fuzz.ratio,
+                        score_cutoff=threshold
+                    )
+
+                    if matches and len(matches) > 0:
+                        best_match_text, score, _ = matches[0]
+                        # Add the position of this match in original text + the actual word that was matched
+                        fuzzy_results.append((start_pos, end_pos, score, word))
+
+                except Exception as e:
+                    # Continue with other words if one fails
+                    logger.warning(f"Fuzzy matching failed for word '{word}': {str(e)}")
+                    continue
+
+            logger.debug(f"Found {len(fuzzy_results)} fuzzy matches")
+            return fuzzy_results
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to find fuzzy matches: {str(e)}")
+
+
+    @staticmethod
+    def filter_by_distance_fuzzy(fuzzy_matches, distance_threshold):
+        """
+        Filter fuzzy matches requiring all buzzwords within distance threshold.
+        Groups matching words together and only keeps groups where all required
+        buzzwords appear within the specified distance.
+
+        Args:
+            fuzzy_matches: List of tuples (start_pos, end_pos, score, original_word)
+            distance_threshold: Maximum character distance between matches
+
+        Returns:
+            List of filtered fuzzy match tuples
+        """
+        try:
+            logger.debug("Filtering fuzzy matches by distance")
+            if not fuzzy_matches:
+                return []
+
+            # Group matches by their original word (buzzword)
+            word_groups = {}
+            for start, end, score, word in fuzzy_matches:
+                if word not in word_groups:
+                    word_groups[word] = []
+                word_groups[word].append((start, end, score))
+
+            logger.debug(f"Processing {len(word_groups)} unique words from fuzzy matches")
+            for word, positions in word_groups.items():
+                logger.debug(f"  Word '{word}': {len(positions)} matches at positions {[pos[0] for pos in positions]}")
+
+            # Get all buzzwords that were actually found
+            found_buzzwords = list(word_groups.keys())
+
+            if len(found_buzzwords) < 2:
+                logger.debug("Only one unique word found - returning all matches")
+                return fuzzy_matches
+
+            # For multiple words, create sliding windows to find valid groups
+            results = []
+
+            # Sort all positions by start position to make grouping easier
+            all_positions = []
+            for word, pos_list in word_groups.items():
+                for start, end, score in pos_list:
+                    all_positions.append((start, end, score, word))
+
+            all_positions.sort(key=lambda x: x[0])  # Sort by start position
+
+            logger.debug(f"Total positions to process: {len(all_positions)}")
+
+            # Try to find groups where multiple buzzwords appear within distance
+            i = 0
+            while i < len(all_positions):
+                current_start = all_positions[i][0]
+                current_end = all_positions[i][1]
+
+                # Create a window around this position
+                window_end = current_start + distance_threshold
+
+                # Collect all words in this window
+                window_words = {}
+                j = i
+                while j < len(all_positions) and all_positions[j][0] <= window_end:
+                    pos_start, pos_end, score, word = all_positions[j]
+                    if word not in window_words:
+                        window_words[word] = []
+                    window_words[word].append((pos_start, pos_end, score))
+                    j += 1
+
+                # Check if we have matches for ALL required buzzwords
+                if len(window_words) >= 2:  # At least two different words found together
+                    logger.debug(f"Found group with {len(window_words)} words in range [{current_start}, {window_end}]")
+                    for word, positions in window_words.items():
+                        logger.debug(f"  Word '{word}': {[pos[0] for pos in positions]}")
+
+                    # Add all matches from this valid window
+                    for word, positions in window_words.items():
+                        for start, end, score in positions:
+                            results.append((start, end, score, word))
+                else:
+                    logger.debug(f"Window [{current_start}, {window_end}] only had {len(window_words)} unique words")
+
+                i = j
+
+            # Remove duplicates while preserving order
+            seen = set()
+            final_results = []
+            for item in results:
+                if item not in seen:
+                    seen.add(item)
+                    final_results.append(item)
+
+            logger.debug(f"Final filtered results count: {len(final_results)}")
+            return final_results
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to filter fuzzy matches by distance: {str(e)}")
+
+
+
+    @staticmethod
+    def extract_snippets_fuzzy(matches, snippet_size, pre_ratio, post_ratio, decoded_text):
+        """
+        Extract snippets from fuzzy matches.
+        """
+        try:
+            logger.debug("Extracting snippets for fuzzy matches")
+            snippets = []
+            for start, end, score, original_word in matches:
+                # Apply ratio-based padding to include more context
+                pre_chars = int(snippet_size * pre_ratio)
+                post_chars = int(snippet_size * post_ratio)
+                snippet_start = max(0, start - pre_chars)
+                snippet_end = min(len(decoded_text), end + post_chars)
+
+                snippets.append((snippet_start, snippet_end, score, original_word))
+            return snippets
+        except Exception as e:
+            raise RuntimeError(f"Failed to extract fuzzy snippets: {str(e)}")
+
+
+
+    @staticmethod
+    def merge_snippets_fuzzy(snippets):
+        """
+        Merge overlapping or adjacent fuzzy snippets.
+        """
+        try:
+            logger.debug("Merging fuzzy snippets")
+            if not snippets:
+                return [], 0
+
+            total_snippets = len(snippets)
+
+            # Sort by start position
+            sorted_snippets = sorted(snippets, key=lambda x: x[0])
+            merged = [sorted_snippets[0]]
+
+            for current in sorted_snippets[1:]:
+                last_end = merged[-1][1]
+
+                if current[0] <= last_end:
+                    # Merge overlapping or adjacent snippets
+                    new_start = merged[-1][0]
+                    new_end = max(last_end, current[1])
+
+                    # Update the score to be average of both scores (or keep highest)
+                    avg_score = (merged[-1][2] + current[2]) / 2.0
+
+                    merged[-1] = (new_start, new_end, avg_score, merged[-1][3])  # Keep original word from first
+                else:
+                    merged.append(current)
+
+            logger.debug(f"Merged fuzzy snippets: {len(merged)} from {total_snippets}")
+            return merged, total_snippets
+        except Exception as e:
+            raise RuntimeError(f"Failed to merge fuzzy snippets: {str(e)}")
+
+
+# --- Main search function ---
+
+def run_search_for_file(file_path: str, config: dict, stop_event: threading.Event):
+    """
+    Run search for a single file. Writes output_snippets.txt and output_fuzzy_snippets.txt.
+    Returns (wildcard_text, fuzzy_text) strings for UI display.
+    Optimized version with faster operations.
+    """
+    try:
+        processor = TextProcessor(file_path)
+
+        if processor.text_bytes is None or processor.char_to_byte is None:
+            raise RuntimeError("Failed to load file properly")
+
+        buzzwords = [bw for bw in config.get("buzzwords", []) if bw.strip()]
+
+        # Use set for filter_by_distance membership but keep list for order preservation
+        buzzwords_set = list(dict.fromkeys(buzzwords))  # unique preserving order
+
+        # wildcard-part - optimized
+        all_matches = SnippetExtractor.find_matches(
+            buzzwords_set,
+            processor.decoded_text,
+            processor.char_to_byte
+        )
+
+        if config.get("search_type", "AND") == "AND":
+            final_matches = SnippetExtractor.filter_by_distance(
+                all_matches,
+                config.get("distance_match", 100),
+                buzzwords_set
+            )
+        else:
+            final_matches = [(m.start_char, m.end_char) for m in all_matches]
+
+        snippets = SnippetExtractor.extract_snippets(
+            final_matches,
+            config.get("snippet_size", 2000),
+            config.get("pre_ratio", 0.3),
+            config.get("post_ratio", 0.7),
+            processor.decoded_text
+        )
+
+        merged_snippets, total_snippets = SnippetExtractor.merge_snippets(snippets)
+
+        # Count wildcard metrics
+        all_wildcard_matches = len(all_matches)
+        merged_wildcard_snippets = len(merged_snippets)
+
+        # Calculate characters in merged snippets for wildcard
+        total_wildcard_chars = 0
+        for start, end in merged_snippets:
+            if stop_event.is_set():
+                raise RuntimeError("Search was aborted")
+
+            s_b = processor.char_to_byte[start]
+            e_b = processor.char_to_byte[end]
+            snippet_bytes = processor.text_bytes[s_b:e_b]
+            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
+            cleaned = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
+            total_wildcard_chars += len(cleaned)
+
+        # Calculate token for wildcard (characters / 4.2)
+        wildcard_token = round(total_wildcard_chars / 4.2, 1) if total_wildcard_chars > 0 else 0
+
+        # Build wildcard textual output - optimized with pre-calculated values
+        wildcard_blocks = []
+        for idx, (start, end) in enumerate(merged_snippets):
+            if stop_event.is_set():
+                raise RuntimeError("Search was aborted")
+
+            s_b = processor.char_to_byte[start]
+            e_b = processor.char_to_byte[end]
+            snippet_bytes = processor.text_bytes[s_b:e_b]
+            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
+            cleaned = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
+
+            # Find first match
+            match_text = None
+            byte_start = None
+            for m in all_matches:
+                if start <= m.start_char and end >= m.end_char:
+                    match_text = m.text
+                    byte_start = m.byte_start
+                    break
+
+            block = [
+                {"Excerpt": idx + 1},
+                {"Match Buzzword": match_text},
+                {"Position, match_text": byte_start},
+                {"Content": cleaned},
+            ]
+            wildcard_blocks.append(json.dumps(block, ensure_ascii=False, indent=1))
+
+        wildcard_text = "\n\n".join(wildcard_blocks)
+
+
+
+
+        # fuzzy part, similar approach to the wildcard part
+        ft = config.get("fuzzy_threshold", 94)
+        if not isinstance(ft, (int, float)) or not (0 <= ft <= 100):
+            ft = 94.0  # default threshold
+
+        # Use all wildcard matches as input for fuzzy search
+        fuzzy_matches = SnippetExtractor.find_fuzzy_matches(
+            processor.decoded_text,
+            all_matches,
+            ft
+        )
+
+        if config.get("search_type", "AND") == "AND":
+            if len(buzzwords) > 1:
+                filtered_fuzzy_matches = SnippetExtractor.filter_by_distance_fuzzy(
+                    fuzzy_matches,
+                    config.get("distance_match", 100)
+                )
+            else:
+                # fallback to OR behavior when only one buzzword
+                filtered_fuzzy_matches = fuzzy_matches
+        else:
+            filtered_fuzzy_matches = fuzzy_matches
+
+        # Extract snippets for fuzzy matches
+        fuzzy_snippets = SnippetExtractor.extract_snippets_fuzzy(
+            filtered_fuzzy_matches,
+            config.get("snippet_size", 2000),
+            config.get("pre_ratio", 0.3),
+            config.get("post_ratio", 0.7),
+            processor.decoded_text
+        )
+
+        # Merge fuzzy snippets
+        merged_fuzzy_snippets, total_fuzzy_snippets = SnippetExtractor.merge_snippets_fuzzy(fuzzy_snippets)
+
+        # Count fuzzy metrics
+        all_fuzzy_matches = len(fuzzy_matches)
+        merged_fuzzy_snippets_count = len(merged_fuzzy_snippets)
+
+        # Calculate characters in merged snippets for fuzzy
+        total_fuzzy_chars = 0
+        for start, end, score, original_word in merged_fuzzy_snippets:
+            if stop_event.is_set():
+                raise RuntimeError("Search was aborted")
+
+            s_b = processor.char_to_byte[start]
+            e_b = processor.char_to_byte[end]
+            snippet_bytes = processor.text_bytes[s_b:e_b]
+            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
+            cleaned_snippet = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
+            total_fuzzy_chars += len(cleaned_snippet)
+
+        # Calculate token for fuzzy (characters / 4.2)
+        fuzzy_token = round(total_fuzzy_chars / 4.2, 1) if total_fuzzy_chars > 0 else 0
+
+        # Build fuzzy textual output - now with actual matched text and byte positions
+        fuzzy_blocks = []
+        for idx, (start, end, score, original_word) in enumerate(merged_fuzzy_snippets):
+            if stop_event.is_set():
+                raise RuntimeError("Search was aborted")
+
+            s_b = processor.char_to_byte[start]
+            e_b = processor.char_to_byte[end]
+            snippet_bytes = processor.text_bytes[s_b:e_b]
+            snippet_text = snippet_bytes.decode("utf-8", errors="surrogateescape")
+            cleaned_snippet = re.sub(r'\s+', ' ', snippet_text)  # without \n and \r
+
+            # Get the actual byte start position of the matched word in the original file
+            match_byte_start = None
+            for fm in fuzzy_matches:  # Use original fuzzy_matches, not filtered_fuzzy_matches
+                if fm[3] == original_word and fm[0] >= start and fm[1] <= end:
+                    # Found the exact fuzzy match that corresponds to this merged snippet
+                    match_byte_start = processor.char_to_byte[fm[0]]
+                    break
+
+            block = [
+                {"Excerpt": idx + 1},
+                {"Match Buzzword": original_word},  # Show the actual word that was matched
+                {"Score": score},
+                {"Position": match_byte_start},  # Add byte position to JSON output
+                {"Content": cleaned_snippet},
+            ]
+            fuzzy_blocks.append(json.dumps(block, ensure_ascii=False, indent=1))
+
+        fuzzy_text = "\n\n".join(fuzzy_blocks)
+
+        logger.debug("Search completed successfully")
+        return wildcard_text, fuzzy_text, all_wildcard_matches, merged_wildcard_snippets, total_wildcard_chars, wildcard_token, all_fuzzy_matches, merged_fuzzy_snippets_count, total_fuzzy_chars, fuzzy_token
+    except Exception as e:
+        logger.error(f"Search failed for file {file_path}: {str(e)}")
+        raise RuntimeError(f"Search failed for file {file_path}: {str(e)}")
+
+# ---
+# GUI
+# ---
+
+class SearchThread(threading.Thread):
+    def __init__(self, paths, config, stop_event, on_complete):
+        super().__init__()
+        self.paths = paths
+        self.config = config
+        self.stop_event = stop_event
+        self.on_complete = on_complete  # callback(wildcard_text, fuzzy_text, finished_ok)
+
+    def run(self):
+        try:
+            agg_wild = []
+            agg_fuzzy = []
+            total_all_wildcard_matches = 0
+            total_merged_wildcard_snippets = 0
+            total_wildcard_chars = 0
+            total_wildcard_tokens = 0
+
+            total_all_fuzzy_matches = 0
+            total_merged_fuzzy_snippets = 0
+            total_fuzzy_chars = 0
+            total_fuzzy_tokens = 0
+
+            for p in self.paths:
+                if self.stop_event.is_set():
+                    self.on_complete("", "", False)
+                    return
+                try:
+                    w, f, all_wildcard_matches, merged_wildcard_snippets, wildcard_chars, wildcard_token, all_fuzzy_matches, merged_fuzzy_snippets, fuzzy_chars, fuzzy_token = run_search_for_file(p, self.config, self.stop_event)
+
+                    # Accumulate totals
+                    total_all_wildcard_matches += all_wildcard_matches
+                    total_merged_wildcard_snippets += merged_wildcard_snippets
+                    total_wildcard_chars += wildcard_chars
+                    total_wildcard_tokens += wildcard_token
+
+                    total_all_fuzzy_matches += all_fuzzy_matches
+                    total_merged_fuzzy_snippets += merged_fuzzy_snippets
+                    total_fuzzy_chars += fuzzy_chars
+                    total_fuzzy_tokens += fuzzy_token
+
+                    # Include file name in the aggregated results
+                    agg_wild.append(f"--- {os.path.basename(p)} ---\n{w}")
+                    agg_fuzzy.append(f"--- {os.path.basename(p)} ---\n{f}")
+                except Exception as e:
+                    # If one file fails, continue with others but report the error
+                    if not self.stop_event.is_set():  # Only show error if not aborted
+                        self.on_complete(f"ERROR processing {p}: {str(e)}", f"ERROR processing {p}: {str(e)}", False)
+                    return
+
+            wildcard_text = "\n\n".join(agg_wild)
+            fuzzy_text = "\n\n".join(agg_fuzzy)
+
+            # Create summary strings
+            wildcard_summary = f"Wildcard Results (output_snippets.txt): All matches: {total_all_wildcard_matches}, Merged snippets: {total_merged_wildcard_snippets}, Characters in merged: {total_wildcard_chars}, Token: {total_wildcard_tokens}"
+            fuzzy_summary = f"Fuzzy Results (output_fuzzy_snippets.txt): All matches: {total_all_fuzzy_matches}, Merged snippets: {total_merged_fuzzy_snippets}, Characters in merged: {total_fuzzy_chars}, Token: {total_fuzzy_tokens}"
+
+            # Prepend summaries to results
+            wildcard_text = wildcard_summary + "\n\n" + wildcard_text if wildcard_text else wildcard_summary
+            fuzzy_text = fuzzy_summary + "\n\n" + fuzzy_text if fuzzy_text else fuzzy_summary
+
+            self.on_complete(wildcard_text, fuzzy_text, True)
+        except Exception as e:
+            # Handle exceptions in the thread itself
+            self.on_complete(f"THREAD ERROR: {str(e)}", f"THREAD ERROR: {str(e)}", False)
+
+class MainFrame(wx.Frame):
+    def __init__(self):
+        super().__init__(None, title="Text Search by Sevenof9 (v8_alpha)", size=(1260, 1000))
+        panel = wx.Panel(self)
+
+        # Top: file / dir pickers and right-side label for chosen path
+        top_sizer = wx.BoxSizer(wx.HORIZONTAL)
+        self.file_picker = wx.FilePickerCtrl(panel, style=wx.FLP_OPEN | wx.FLP_FILE_MUST_EXIST)
+        self.dir_picker = wx.DirPickerCtrl(panel)
+
+        # Set the button labels using SetLabel method on the button part of the picker controls
+        self.file_picker.GetPickerCtrl().SetLabel("Browse File")
+        self.dir_picker.GetPickerCtrl().SetLabel("Browse Folder")
+
+        self.path_label = wx.StaticText(panel, label="No file/folder selected")
+
+        top_sizer.Add(self.file_picker, 0, wx.ALL | wx.ALIGN_LEFT, 4)
+        top_sizer.Add(self.dir_picker, 0, wx.ALL | wx.ALIGN_LEFT, 4)
+        top_sizer.Add(self.path_label, 0, wx.ALL | wx.ALIGN_LEFT, 6)
+
+        # Middle: left = buzzwords (4 fields with AND/OR buttons between), right = controls/config
+        middle_sizer = wx.BoxSizer(wx.HORIZONTAL)
+
+        # Left: buzzwords area
+        buzz_sizer = wx.BoxSizer(wx.VERTICAL)
+        self.buzz_inputs = []
+        self.toggle_buttons = []
+        for i in range(4):
+            txt = wx.TextCtrl(panel, size=(250, -1))
+            self.buzz_inputs.append(txt)
+            buzz_sizer.Add(txt, 0, wx.ALL | wx.ALIGN_LEFT, 2)
+            if i < 3:
+                btn = wx.Button(panel, label="AND", size=(80, 24))
+                btn.Bind(wx.EVT_BUTTON, self.on_toggle)
+                self.toggle_buttons.append(btn)
+                buzz_sizer.Add(btn, 0, wx.ALL | wx.ALIGN_LEFT, 2)
+
+        middle_sizer.Add(buzz_sizer, 0, wx.ALL | wx.ALIGN_LEFT, 6)
+
+        # Middle: controls and config
+        ctrl_sizer = wx.BoxSizer(wx.VERTICAL)
+
+        # Start / Abort
+        self.start_button = wx.Button(panel, label="Start Search")
+        self.abort_button = wx.Button(panel, label="Abort")
+        self.abort_button.Disable()
+        self.start_button.Bind(wx.EVT_BUTTON, self.on_start)
+        self.abort_button.Bind(wx.EVT_BUTTON, self.on_abort)
+        ctrl_sizer.Add(self.start_button, 0, wx.ALL | wx.ALIGN_LEFT, 4)
+        ctrl_sizer.Add(self.abort_button, 0, wx.ALL | wx.ALIGN_LEFT, 4)
+
+
+        # Config fields
+        self.cfg_fields = {}
+        defaults = [("snippet_size", "2000"),
+                    ("pre_ratio", "0.3"),
+                    ("post_ratio", "0.7"),
+                    ("distance_match", "300"),
+                    ("fuzzy_threshold", "94")]
+        for label, val in defaults:
+            row = wx.BoxSizer(wx.HORIZONTAL)
+            lbl = wx.StaticText(panel, label=label + ":")  # Update this line to include the new labels
+            if label == "snippet_size":
+                lbl.SetLabel("Snippet Size (chars):")
+            elif label == "pre_ratio":
+                lbl.SetLabel("Pre Ratio (%):")
+            elif label == "post_ratio":
+                lbl.SetLabel("Post Ratio (%):")
+            elif label == "distance_match":
+                lbl.SetLabel("Distance Match (chars):")
+            elif label == "fuzzy_threshold":
+                lbl.SetLabel("Fuzzy Threshold (%):")
+
+            fld = wx.TextCtrl(panel, value=val, size=(50, -1))
+            # Bind focus event for validation
+            fld.Bind(wx.EVT_KILL_FOCUS, self.on_field_focus_lost)
+            row.Add(lbl, 0, wx.ALL | wx.ALIGN_LEFT, 2)
+            row.Add(fld, 0, wx.ALL | wx.ALIGN_LEFT, 2)
+            ctrl_sizer.Add(row, 0, wx.ALL | wx.ALIGN_LEFT, 2)
+            self.cfg_fields[label] = fld
+
+
+        middle_sizer.Add(ctrl_sizer, 0, wx.ALL | wx.ALIGN_LEFT, 6)
+
+        # --- Right image panel 700x261 pixel
+        bmp = wx.Bitmap(resource_path("example.bmp"), wx.BITMAP_TYPE_BMP)
+        #bmp = wx.Bitmap("example.bmp", wx.BITMAP_TYPE_BMP)
+
+        # compute exact size
+        img_w = bmp.GetWidth()
+        img_h = bmp.GetHeight()
+        border = 1
+
+        border_panel = wx.Panel(panel, size=(img_w + 2*border, img_h + 2*border))
+        border_panel.SetBackgroundColour(wx.WHITE)
+
+        self.img_ctrl = wx.StaticBitmap(border_panel, bitmap=bmp)
+
+        s = wx.BoxSizer(wx.VERTICAL)
+        s.Add(self.img_ctrl, 0, wx.ALL | wx.ALIGN_CENTER, border)
+        border_panel.SetSizer(s)
+
+        middle_sizer.Add(border_panel, 0, wx.ALL | wx.ALIGN_LEFT, 6)
+
+        # Bottom: results (wildcard and fuzzy) across full width
+        result_sizer = wx.BoxSizer(wx.VERTICAL)
+
+        # Add the summary labels BEFORE the text controls
+        self.wildcard_summary_label = wx.StaticText(panel, label="Wildcard Results:")
+        result_sizer.Add(self.wildcard_summary_label, 0, wx.ALL | wx.ALIGN_LEFT, 2)
+
+        self.wildcard_box = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY, size=(-1, 220))
+        result_sizer.Add(self.wildcard_box, 1, wx.EXPAND | wx.ALL, 4)
+
+        self.fuzzy_summary_label = wx.StaticText(panel, label="Fuzzy Results:")
+        result_sizer.Add(self.fuzzy_summary_label, 0, wx.ALL | wx.ALIGN_LEFT, 2)
+
+        self.fuzzy_box = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY, size=(-1, 220))
+        result_sizer.Add(self.fuzzy_box, 1, wx.EXPAND | wx.ALL, 4)
+
+        # Main vertical layout using only horizontal alignment flags where appropriate
+        main_sizer = wx.BoxSizer(wx.VERTICAL)
+        main_sizer.Add(top_sizer, 0, wx.ALL | wx.ALIGN_LEFT, 6)
+        main_sizer.Add(middle_sizer, 0, wx.ALL | wx.ALIGN_LEFT, 6)
+        main_sizer.Add(result_sizer, 1, wx.EXPAND | wx.ALL, 6)
+
+        panel.SetSizer(main_sizer)
+
+        # Events
+        self.file_picker.Bind(wx.EVT_FILEPICKER_CHANGED, self.on_path_change)
+        self.dir_picker.Bind(wx.EVT_DIRPICKER_CHANGED, self.on_path_change)
+
+        # Thread controls
+        self.worker = None
+        self.stop_event = threading.Event()
+
+    def on_field_focus_lost(self, evt):
+        """Validate all fields when any field loses focus"""
+        self.validate_all_fields()
+        evt.Skip()  # Allow normal processing to continue
+
+    def validate_all_fields(self):
+        """Validate all configuration fields and enforce dependencies"""
+        try:
+            # Get current values
+            snippet_size_val = self.cfg_fields["snippet_size"].GetValue().strip()
+            pre_ratio_val = self.cfg_fields["pre_ratio"].GetValue().strip()
+            post_ratio_val = self.cfg_fields["post_ratio"].GetValue().strip()
+            distance_match_val = self.cfg_fields["distance_match"].GetValue().strip()
+            fuzzy_threshold_val = self.cfg_fields["fuzzy_threshold"].GetValue().strip()
+
+            # Default values if empty
+            snippet_size_val = snippet_size_val if snippet_size_val else "2000"
+            pre_ratio_val = pre_ratio_val if pre_ratio_val else "0.3"
+            post_ratio_val = post_ratio_val if post_ratio_val else "0.7"
+            distance_match_val = distance_match_val if distance_match_val else "300"
+            fuzzy_threshold_val = fuzzy_threshold_val if fuzzy_threshold_val else "94"
+
+            # Validate and process each field
+            # snippet_size: min=0, max=999999, round to integer
+            snippet_size = int(float(snippet_size_val)) if snippet_size_val else 2000
+            snippet_size = max(0, min(999999, snippet_size))
+
+            # pre_ratio: min=0.1, max=0.9, 1 decimal place
+            pre_ratio = round(float(pre_ratio_val), 1) if pre_ratio_val else 0.3
+            pre_ratio = max(0.1, min(0.9, pre_ratio))
+
+            # post_ratio: min=0.1, max=0.9, 1 decimal place
+            post_ratio = round(float(post_ratio_val), 1) if post_ratio_val else 0.7
+            post_ratio = max(0.1, min(0.9, post_ratio))
+
+            # Ensure pre + post = 1 (adjust one to maintain sum)
+            total = pre_ratio + post_ratio
+            if abs(total - 1.0) > 0.001:  # Allow small floating point differences
+                # Adjust post_ratio to make the sum equal to 1.0
+                post_ratio = round(1.0 - pre_ratio, 1)
+                self.cfg_fields["post_ratio"].SetValue(str(post_ratio))
+
+            # distance_match: min=0, max=snippet_size, round to integer
+            distance_match = int(float(distance_match_val)) if distance_match_val else 300
+            distance_match = max(0, min(snippet_size, distance_match))
+
+            # If snippet_size < distance_match, adjust snippet_size to match
+            if snippet_size < distance_match:
+                snippet_size = distance_match
+                self.cfg_fields["snippet_size"].SetValue(str(snippet_size))
+
+            # fuzzy_threshold: min=1, max=100, round to integer
+            fuzzy_threshold = int(float(fuzzy_threshold_val)) if fuzzy_threshold_val else 94
+            fuzzy_threshold = max(1, min(100, fuzzy_threshold))
+
+            # Apply validated values back to fields
+            self.cfg_fields["snippet_size"].SetValue(str(snippet_size))
+            self.cfg_fields["pre_ratio"].SetValue(str(pre_ratio))
+            self.cfg_fields["post_ratio"].SetValue(str(post_ratio))
+            self.cfg_fields["distance_match"].SetValue(str(distance_match))
+            self.cfg_fields["fuzzy_threshold"].SetValue(str(fuzzy_threshold))
+
+        except Exception as e:
+            # If validation fails, show error but don't block the user
+            wx.MessageBox(f"Validation Error: {str(e)}", "Error")
+
+    def on_path_change(self, evt):
+        path = evt.GetPath()
+        self.path_label.SetLabel(path)
+
+    def on_toggle(self, evt):
+        btn = evt.GetEventObject()
+        label = btn.GetLabel()
+        if label == "AND":
+            btn.SetLabel("OR")
+        else:
+            btn.SetLabel("AND")
+
+    def on_abort(self, evt):
+        """Abort button now properly stops all processes"""
+        if self.worker and self.worker.is_alive():
+            # Set the stop event to signal all running operations to abort
+            self.stop_event.set()
+
+            # Disable buttons immediately
+            self.abort_button.Disable()
+            self.start_button.Enable()
+
+            # Clear any text that might have been set during processing
+            wx.CallAfter(self.wildcard_box.SetValue, "Aborting...")
+            wx.CallAfter(self.fuzzy_box.SetValue, "Aborting...")
+
+    def on_start(self, evt):
+        # get path
+        path = self.path_label.GetLabel()
+        if not path or path == "No file/folder selected":
+            wx.MessageBox("Please select a file or folder first.", "Error")
+            return
+
+        try:
+            if os.path.isdir(path):
+                txts = [str(Path(path) / f) for f in sorted(os.listdir(path))
+                        if f.lower().endswith(".txt") and os.path.isfile(os.path.join(path, f))]
+                if not txts:
+                    wx.MessageBox("Selected folder contains no .txt files.", "Error")
+                    return
+                paths = txts
+            else:
+                if not os.path.isfile(path):
+                    wx.MessageBox("Selected path is not a file.", "Error")
+                    return
+                # Only allow .txt files - this validation was missing before
+                if not path.lower().endswith(".txt"):
+                    wx.MessageBox("Please select a .txt file.", "Error")
+                    return
+                paths = [path]
+        except Exception as e:
+            wx.MessageBox(f"Failed to access path: {str(e)}", "Error")
+            return
+
+        # prepare config
+        try:
+            cfg = {
+                "snippet_size": int(self.cfg_fields["snippet_size"].GetValue().strip()),
+                "pre_ratio": float(self.cfg_fields["pre_ratio"].GetValue().strip()),
+                "post_ratio": float(self.cfg_fields["post_ratio"].GetValue().strip()),
+                "distance_match": int(self.cfg_fields["distance_match"].GetValue().strip()),
+                "fuzzy_threshold": float(self.cfg_fields["fuzzy_threshold"].GetValue().strip()),
+            }
+        except Exception:
+            wx.MessageBox("Please check numeric configuration values.", "Error")
+            return
+
+        buzzwords_list = [t.GetValue().strip() for t in self.buzz_inputs]
+        search_type_value = "AND" if self.toggle_buttons[0].GetLabel() == "AND" else "OR"
+
+        cfg["buzzwords"] = buzzwords_list
+        cfg["search_type"] = search_type_value
+
+
+        # UI state
+        self.start_button.Disable()
+        self.abort_button.Enable()
+        self.wildcard_box.SetValue("Running...")
+        self.fuzzy_box.SetValue("Running...")
+
+        # reset stop_event and start thread
+        self.stop_event.clear()
+
+        # Overwrite output files at the beginning of each new search
+        Path("output_snippets.txt").write_text("", encoding="utf-8", errors="surrogateescape")
+        Path("output_fuzzy_snippets.txt").write_text("", encoding="utf-8", errors="surrogateescape")
+
+        self.worker = SearchThread(paths, cfg, self.stop_event, self.on_search_complete)
+        self.worker.start()
+
+    def on_search_complete(self, wildcard_text, fuzzy_text, finished_ok):
+        # This callback runs in worker thread; must marshal to main GUI thread
+        def _update():
+            if finished_ok:
+                self.wildcard_box.SetValue(wildcard_text)
+                self.fuzzy_box.SetValue(fuzzy_text)
+
+                # Write the aggregated results to the output files
+                Path("output_snippets.txt").write_text(wildcard_text, encoding="utf-8", errors="surrogateescape")
+                Path("output_fuzzy_snippets.txt").write_text(fuzzy_text, encoding="utf-8", errors="surrogateescape")
+
+            else:
+                # signals either error or aborted
+                self.wildcard_box.SetValue(wildcard_text or "Aborted / Error")
+                self.fuzzy_box.SetValue(fuzzy_text or "Aborted / Error")
+            self.stop_event.clear()
+            self.start_button.Enable()
+            self.abort_button.Disable()
+
+        wx.CallAfter(_update)
+
+
+
+    def on_browse_file(self, evt):
+        with wx.FileDialog(self, "Open Text File", wildcard="Text files (*.txt)|*.txt",
+                           style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST) as fileDialog:
+            if fileDialog.ShowModal() == wx.ID_CANCEL:
+                return
+            path = fileDialog.GetPath()
+            self.path_label.SetLabel(path)
+
+    def on_browse_dir(self, evt):
+        with wx.DirDialog(self, "Choose a directory", style=wx.DD_DEFAULT_STYLE | wx.DD_DIR_MUST_EXIST) as dirDialog:
+            if dirDialog.ShowModal() == wx.ID_CANCEL:
+                return
+            path = dirDialog.GetPath()
+            self.path_label.SetLabel(path)
+
+
+if __name__ == "__main__":
+    app = wx.App(False)
+    frame = MainFrame()
+    frame.Show()
+    app.MainLoop()
+
+
+
+
+
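
For completeness, a rough sketch of driving the search engine without the wx GUI. This is a hypothetical usage example, not part of the commit; the file name some_document.txt and the buzzword values are placeholders, and importing the module still requires wxPython and rapidfuzz to be installed:

import threading
from snippet_extractor_v08alpha import run_search_for_file

# Config keys mirror the GUI fields; "AND" requires all buzzwords to fall
# within distance_match characters of each other.
config = {
    "buzzwords": ["engine", "turbin*"],   # '*' = any non-whitespace run, '?' = exactly one char
    "search_type": "AND",
    "snippet_size": 2000,
    "pre_ratio": 0.3,
    "post_ratio": 0.7,
    "distance_match": 300,
    "fuzzy_threshold": 94,
}

results = run_search_for_file("some_document.txt", config, threading.Event())
wildcard_text, fuzzy_text = results[0], results[1]  # remaining items are match/char/token counts
print(wildcard_text)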