Coverage for openhcs/formats/pattern/pattern_discovery.py: 75.4%
166 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-04 02:09 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-04 02:09 +0000
1"""
2Pattern discovery engine for OpenHCS.
4This module provides a dedicated engine for discovering and grouping patterns
5in microscopy image files, separating this responsibility from FilenameParser.
6"""
8# Standard Library
9import logging
10import os
11from collections import defaultdict
12from pathlib import Path
13from typing import Any, Dict, List, Optional, Union
15from openhcs.constants.constants import DEFAULT_IMAGE_EXTENSION
16from openhcs.io.filemanager import FileManager
17# Core OpenHCS Interfaces
18from openhcs.microscopes.microscope_interfaces import FilenameParser
20# Note: Previously used GenericPatternEngine, but now we always use microscope-specific parsers
22logger = logging.getLogger(__name__)
24# Pattern utility functions
def has_placeholders(pattern: str) -> bool:
    """Check if pattern contains placeholder variables.

    Only the presence of both brace characters is tested; their order and
    pairing are not validated.

    Args:
        pattern: Pattern string to inspect.

    Returns:
        True if the pattern contains both '{' and '}', False otherwise.
    """
    return '{' in pattern and '}' in pattern
class PatternDiscoveryEngine:
    """
    Engine for discovering and grouping patterns in microscopy image files.

    This class is responsible for:
    - Finding image files in directories
    - Filtering files by the configured multiprocessing axis (e.g. well IDs)
    - Generating patterns from files
    - Grouping patterns by components

    It works with a FilenameParser to parse individual filenames and a
    FileManager to access the file system.
    """

    # Placeholder token that stands in for a variable component in a pattern.
    PLACEHOLDER_PATTERN = '{iii}'

    def __init__(self, parser: "FilenameParser", filemanager: "FileManager"):
        """
        Initialize the pattern discovery engine.

        Args:
            parser: Parser for microscopy filenames
            filemanager: FileManager for file system operations
        """
        self.parser = parser
        self.filemanager = filemanager

    @staticmethod
    def _filename_of(file_path: Any) -> Optional[str]:
        """Return the final path component of a str or Path, or None for any other type."""
        if isinstance(file_path, str):
            return os.path.basename(file_path)
        if isinstance(file_path, Path):
            return file_path.name
        return None

    def path_list_from_pattern(
        self,
        directory: Union[str, Path],
        pattern: str,
        backend: str,
        variable_components: Optional[List[str]] = None
    ) -> List[str]:
        """
        Get a list of filenames matching a pattern in a directory.

        Args:
            directory: Directory to search (string or Path object)
            pattern: Pattern to match (string with optional {iii} placeholders)
            backend: Backend to use for file operations (required)
            variable_components: List of components that can vary (will be ignored during matching)

        Returns:
            List of matching filenames (names only, not full paths). Empty if
            the pattern template cannot be parsed or nothing matches.

        Raises:
            FileNotFoundError: If directory is not a directory on the backend
        """
        directory_path = str(directory)  # Keep as string for FileManager consistency
        if not self.filemanager.is_dir(directory_path, backend):
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        pattern_str = str(pattern)

        # Literal filenames (no placeholders) only need an existence check.
        if not has_placeholders(pattern_str):
            file_path = os.path.join(directory_path, pattern_str)
            if self.filemanager.exists(file_path, backend):
                return [pattern_str]
            return []

        logger.debug("Using pattern template: %s", pattern_str)

        # Substitute a concrete value for the placeholder so the parser can
        # extract the fixed components the pattern expects.
        pattern_template = pattern_str.replace(self.PLACEHOLDER_PATTERN, '001')
        pattern_metadata = self.parser.parse_filename(pattern_template)
        if not pattern_metadata:
            logger.error("Failed to parse pattern template: %s", pattern_template)
            return []

        all_files = self.filemanager.list_image_files(directory_path, backend)
        variable = variable_components or []

        matching_files = []
        for file_path in all_files:
            filename = self._filename_of(file_path)
            if filename is None:
                # Neither str nor Path: silently skipped here.
                continue

            file_metadata = self.parser.parse_filename(filename)
            if not file_metadata:
                continue

            if self._matches_pattern_structure(file_metadata, pattern_metadata, variable):
                matching_files.append(filename)

        return matching_files

    def _matches_pattern_structure(
        self,
        file_metadata: Dict[str, Any],
        pattern_metadata: Dict[str, Any],
        variable_components: List[str]
    ) -> bool:
        """
        Check if a file's metadata matches a pattern's structure.

        Args:
            file_metadata: Metadata extracted from actual filename
            pattern_metadata: Metadata extracted from pattern template
            variable_components: List of components that can vary

        Returns:
            True if file matches pattern structure, False otherwise
        """
        for component in self.parser.FILENAME_COMPONENTS:
            if component not in pattern_metadata:
                continue

            file_value = file_metadata.get(component)

            if component in variable_components:
                # Variable components may hold any value, but must be present.
                if file_value is None:
                    return False
                continue

            # Fixed components must match exactly.
            if pattern_metadata[component] != file_value:
                return False

        return True

    def group_patterns_by_component(
        self,
        patterns: List[str],
        component: str
    ) -> Dict[str, List[str]]:
        """
        Group patterns by a required component.

        Args:
            patterns: List of pattern strings to group
            component: Component to group by

        Returns:
            Dictionary (a defaultdict(list)) mapping the stringified component
            value to the list of patterns carrying that value

        Raises:
            TypeError: If patterns are not strings
            ValueError: If component is empty or not a string, or if it is
                missing (or None) in a parsed pattern
        """
        grouped_patterns = defaultdict(list)

        if not component or not isinstance(component, str):
            raise ValueError(f"Component must be a non-empty string, got {component}")

        if not all(isinstance(p, str) for p in patterns):
            raise TypeError("All patterns must be strings")

        for pattern in patterns:
            # Patterns with template fields (like {iii}) are EXPECTED here;
            # substitute a concrete value so the parser can read the components.
            pattern_template = pattern.replace(self.PLACEHOLDER_PATTERN, '001')
            metadata = self.parser.parse_filename(pattern_template)

            if not metadata or metadata.get(component) is None:
                raise ValueError(
                    f"Missing required component '{component}' in pattern: {pattern}"
                )

            grouped_patterns[str(metadata[component])].append(pattern)

        return grouped_patterns

    def auto_detect_patterns(
        self,
        folder_path: Union[str, Path],
        variable_components: List[str],
        backend: str,
        extensions: Optional[List[str]] = None,
        group_by=None,  # Accept GroupBy enum or None
        recursive: bool = False,
        **kwargs  # Dynamic filter parameters (e.g., well_filter, site_filter)
    ) -> Dict[str, Any]:
        """
        Automatically detect image patterns in a folder.

        Args:
            folder_path: Folder to scan (string or Path object)
            variable_components: Components replaced by the placeholder in generated patterns
            backend: Backend to use for file operations
            extensions: File extensions to include (None uses the default TIFF set)
            group_by: Object whose ``.value`` names the component to group by, or None
            recursive: Accepted for interface compatibility.
                NOTE(review): this value is not forwarded; the search below
                is always performed recursively. Confirm whether intended.
            **kwargs: Dynamic filters; only ``"<axis>_filter"`` for the
                configured multiprocessing axis is read here

        Returns:
            Dictionary mapping each axis value to either a list of patterns or,
            when ``group_by`` has a truthy ``.value``, a dict of patterns
            grouped by that component. Empty dict if no files matched.

        Raises:
            TypeError: If pattern generation yields a non-string pattern
        """
        # Local import, as in the original code.
        from openhcs.constants import MULTIPROCESSING_AXIS
        axis_name = MULTIPROCESSING_AXIS.value
        axis_filter = kwargs.get(f"{axis_name}_filter")

        files_by_axis = self._find_and_filter_images(
            folder_path, axis_filter, extensions, True, backend
        )

        if not files_by_axis:
            return {}

        result = {}
        for axis_value, files in files_by_axis.items():
            patterns = self._generate_patterns_for_files(files, variable_components, axis_value)

            for pattern in patterns:
                if not isinstance(pattern, str):
                    raise TypeError(f"Pattern generator returned invalid type: {type(pattern).__name__}")

            # Extract string value from GroupBy enum for pattern grouping
            component_string = group_by.value if group_by else None
            if component_string:
                result[axis_value] = self.group_patterns_by_component(patterns, component=component_string)
            else:
                result[axis_value] = patterns

        return result

    def _find_and_filter_images(
        self,
        folder_path: Union[str, Path],
        axis_filter: Optional[List[str]],
        extensions: Optional[List[str]],
        recursive: bool,
        backend: str
    ) -> Dict[str, List[Any]]:
        """
        Find all image files in a directory and filter by multiprocessing axis.

        Args:
            folder_path: Path to the folder to search (string or Path object)
            axis_filter: List of axis values to include
            extensions: List of file extensions to include (falsy uses the default TIFF set)
            recursive: Whether to search recursively
            backend: Backend to use for file operations (required)

        Returns:
            Dictionary (a defaultdict(list)) mapping axis values to lists of
            image paths, in the form returned by the FileManager

        Raises:
            FileNotFoundError: If folder_path does not exist on the backend
            ValueError: If axis_filter is empty or None
        """
        folder_path = Path(folder_path)
        if not self.filemanager.exists(str(folder_path), backend):
            raise FileNotFoundError(f"Folder not found: {folder_path}")

        if not axis_filter:
            raise ValueError("axis_filter cannot be empty")

        extensions = extensions or ['.tif', '.TIF', '.tiff', '.TIFF']

        image_paths = self.filemanager.list_image_files(folder_path, backend, extensions=extensions, recursive=recursive)

        # Resolve the axis key once instead of once per file.
        from openhcs.constants import MULTIPROCESSING_AXIS
        axis_key = MULTIPROCESSING_AXIS.value

        files_by_axis = defaultdict(list)
        for img_path in image_paths:
            filename = self._filename_of(img_path)
            if filename is None:
                logger.warning("Unexpected file path type: %s", type(img_path).__name__)
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            axis_value = metadata.get(axis_key)
            if not axis_value or axis_value not in axis_filter:
                continue

            files_by_axis[axis_value].append(img_path)

        return files_by_axis

    def _generate_patterns_for_files(
        self,
        files: List[Any],
        variable_components: List[str],
        axis_value: str
    ) -> List[str]:
        """
        Generate patterns for a list of files.

        Files are bucketed by the values of their non-variable components; one
        pattern is built per bucket from the first file's metadata, with every
        variable component replaced by the placeholder.

        Args:
            files: List of file paths (str or Path) representing files
            variable_components: List of components that can vary in the pattern
            axis_value: Axis value the files belong to (currently unused here)

        Returns:
            List of pattern strings

        Raises:
            TypeError: If files or variable_components is not a list
            ValueError: If no components are found, a pattern template cannot
                be instantiated, or no patterns are generated at all
        """
        if not isinstance(files, list):
            raise TypeError(f"Expected list of file path objects, got {type(files).__name__}")

        if not isinstance(variable_components, list):
            raise TypeError(f"Expected list of variable components, got {type(variable_components).__name__}")

        # Bucket files by the values of their fixed (non-variable) components.
        component_combinations = defaultdict(list)
        for file_path in files:
            filename = self._filename_of(file_path)
            if filename is None:
                logger.warning("Unexpected file path type: %s", type(file_path).__name__)
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            key = ",".join(
                f"{comp}={metadata[comp]}"
                for comp in self.parser.FILENAME_COMPONENTS
                if comp in metadata and comp not in variable_components and metadata[comp] is not None
            )
            component_combinations[key].append((file_path, metadata))

        patterns = []
        for files_metadata in component_combinations.values():
            # Buckets are only created by append, so each has at least one entry.
            _, template_metadata = files_metadata[0]

            pattern_args = {
                comp: (self.PLACEHOLDER_PATTERN if comp in variable_components else template_metadata[comp])
                for comp in self.parser.FILENAME_COMPONENTS
                if comp in template_metadata
            }

            # 🔒 Clause 93 — Declarative Execution Enforcement
            # Ensure pattern generation succeeded
            if not pattern_args:
                raise ValueError("Clause 93 Violation: No components found in template metadata for pattern generation")

            # Pass every component the parser declares and the template has.
            extension = pattern_args.get('extension') or DEFAULT_IMAGE_EXTENSION
            component_kwargs = {
                comp: pattern_args[comp]
                for comp in self.parser.get_component_names()
                if comp in pattern_args
            }

            pattern_str = self.parser.construct_filename(
                extension=extension,
                **component_kwargs
            )

            # Validate that the pattern can be instantiated
            test_instance = pattern_str.replace(self.PLACEHOLDER_PATTERN, '001')
            if not self.parser.parse_filename(test_instance):
                raise ValueError(f"Clause 93 Violation: Pattern template '{pattern_str}' cannot be instantiated")

            patterns.append(pattern_str)

        # 🔒 Clause 92 — Structural Validation First
        # Validate the final pattern list
        if not patterns:
            raise ValueError(
                "No patterns generated from files. This indicates either: "
                "(1) no image files found in the directory, "
                "(2) files don't match the expected naming convention, or "
                "(3) pattern generation logic failed. "
                "Check that image files exist and follow the expected naming pattern."
            )

        return patterns