Coverage for openhcs/formats/pattern/pattern_discovery.py: 69.0%
155 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 05:57 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 05:57 +0000
1"""
2Pattern discovery engine for OpenHCS.
4This module provides a dedicated engine for discovering and grouping patterns
5in microscopy image files, separating this responsibility from FilenameParser.
6"""
8# Standard Library
9import logging
10import os
11import re
12from collections import defaultdict
13from pathlib import Path
14from typing import Any, Dict, List, Optional, Union
16from openhcs.constants.constants import DEFAULT_IMAGE_EXTENSION
17from openhcs.io.filemanager import FileManager
18# Core OpenHCS Interfaces
19from openhcs.microscopes.microscope_interfaces import FilenameParser
21logger = logging.getLogger(__name__)
23# Pattern utility functions
def has_placeholders(pattern: str) -> bool:
    """Check if pattern contains placeholder variables.

    A placeholder is considered present when an opening brace is followed,
    somewhere later in the string, by a closing brace (for example ``{iii}``
    or ``{}``). Braces that appear only in the reverse order (``}...{``) do
    not form a placeholder and are treated as literal characters.

    Args:
        pattern: Pattern or literal filename to inspect.

    Returns:
        True if the string contains a ``{`` with a ``}`` after it,
        False otherwise.
    """
    opening = pattern.find('{')
    # Only a closing brace *after* the first opening brace can delimit a placeholder.
    return opening != -1 and pattern.find('}', opening + 1) != -1
class PatternDiscoveryEngine:
    """
    Engine for discovering and grouping patterns in microscopy image files.

    This class is responsible for:
    - Finding image files in directories
    - Filtering files based on well IDs
    - Generating patterns from files
    - Grouping patterns by components

    It works with a FilenameParser to parse individual filenames and a
    FileManager to access the file system.
    """

    # Token that marks a variable component inside a pattern string.
    PLACEHOLDER_PATTERN = '{iii}'

    # Value substituted for the placeholder so that a pattern can be parsed
    # by the FilenameParser as if it were a concrete filename.
    _PLACEHOLDER_SAMPLE = '001'

    def __init__(self, parser: FilenameParser, filemanager: FileManager):
        """
        Initialize the pattern discovery engine.

        Args:
            parser: Parser for microscopy filenames
            filemanager: FileManager for file system operations
        """
        self.parser = parser
        self.filemanager = filemanager

    @staticmethod
    def _filename_from_path(file_path: Any) -> Optional[str]:
        """
        Extract the final path component from a path returned by the FileManager.

        Args:
            file_path: A ``str`` or ``Path``; anything else is rejected.

        Returns:
            The filename, or None (after logging a warning) when *file_path*
            is neither a ``str`` nor a ``Path``.
        """
        if isinstance(file_path, str):
            return os.path.basename(file_path)
        if isinstance(file_path, Path):
            return file_path.name
        logger.warning("Unexpected file path type: %s", type(file_path).__name__)
        return None

    def path_list_from_pattern(self, directory: Union[str, Path], pattern: str, backend: str, variable_components: Optional[List[str]] = None) -> List[str]:
        """
        Get a list of filenames matching a pattern in a directory.

        Args:
            directory: Directory to search (string or Path object)
            pattern: Pattern to match (string with optional {iii} placeholders)
            backend: Backend to use for file operations (required)
            variable_components: List of components that can vary (will be ignored during matching)

        Returns:
            List of matching filenames (names only, without the directory).
            For a pattern without placeholders this is ``[pattern]`` if that
            file exists and ``[]`` otherwise. An empty list is also returned
            when the pattern template cannot be parsed.

        Raises:
            FileNotFoundError: If the FileManager reports that *directory* is not a directory
        """
        directory_path = str(directory)  # Keep as string for FileManager consistency
        if not self.filemanager.is_dir(directory_path, backend):
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        pattern_str = str(pattern)

        # Literal filenames (no placeholders) only need an existence check.
        if not has_placeholders(pattern_str):
            file_path = os.path.join(directory_path, pattern_str)
            return [pattern_str] if self.filemanager.exists(file_path, backend) else []

        logger.debug("Using pattern template: %s", pattern_str)

        # Instantiate the template so the parser can tell us its expected structure.
        pattern_template = pattern_str.replace(self.PLACEHOLDER_PATTERN, self._PLACEHOLDER_SAMPLE)
        pattern_metadata = self.parser.parse_filename(pattern_template)
        if not pattern_metadata:
            logger.error("Failed to parse pattern template: %s", pattern_template)
            return []

        variable = variable_components or []
        matching_files = []
        for file_path in self.filemanager.list_image_files(directory_path, backend):
            filename = self._filename_from_path(file_path)
            if filename is None:
                continue

            file_metadata = self.parser.parse_filename(filename)
            if file_metadata and self._matches_pattern_structure(file_metadata, pattern_metadata, variable):
                matching_files.append(filename)

        return matching_files

    def _matches_pattern_structure(self, file_metadata: Dict[str, Any], pattern_metadata: Dict[str, Any], variable_components: List[str]) -> bool:
        """
        Check if a file's metadata matches a pattern's structure.

        Only components listed in the parser's FILENAME_COMPONENTS and present
        in *pattern_metadata* are compared.

        Args:
            file_metadata: Metadata extracted from actual filename
            pattern_metadata: Metadata extracted from pattern template
            variable_components: List of components that can vary

        Returns:
            True if file matches pattern structure, False otherwise
        """
        for component in self.parser.FILENAME_COMPONENTS:
            if component not in pattern_metadata:
                continue

            file_value = file_metadata.get(component)

            if component in variable_components:
                # Variable components may hold any value, but must be present.
                if file_value is None:
                    return False
                continue

            # Fixed components must match exactly.
            if pattern_metadata[component] != file_value:
                return False

        return True

    def group_patterns_by_component(
        self,
        patterns: List[str],
        component: str
    ) -> Dict[str, List[str]]:
        """
        Group patterns by a required component.

        Each pattern is instantiated (placeholder replaced by a sample value)
        and parsed; the stringified value of *component* is the group key.

        Args:
            patterns: List of pattern strings to group
            component: Component to group by

        Returns:
            Dictionary mapping component values to lists of patterns

        Raises:
            TypeError: If patterns are not strings
            ValueError: If component is empty or not a string, or if it is
                missing (or None) in the metadata parsed from a pattern
        """
        if not component or not isinstance(component, str):
            raise ValueError(f"Component must be a non-empty string, got {component}")

        if not all(isinstance(p, str) for p in patterns):
            raise TypeError("All patterns must be strings")

        grouped_patterns = defaultdict(list)
        for pattern in patterns:
            # Patterns with template fields (like {iii}) are expected here;
            # substitute a sample value so the parser can read them.
            pattern_template = pattern.replace(self.PLACEHOLDER_PATTERN, self._PLACEHOLDER_SAMPLE)
            metadata = self.parser.parse_filename(pattern_template)

            if not metadata or metadata.get(component) is None:
                raise ValueError(
                    f"Missing required component '{component}' in pattern: {pattern}"
                )

            grouped_patterns[str(metadata[component])].append(pattern)

        return grouped_patterns

    def auto_detect_patterns(
        self,
        folder_path: Union[str, Path],
        well_filter: List[str],
        extensions: List[str],
        group_by: Optional[str],
        variable_components: List[str],
        backend: str
    ) -> Dict[str, Any]:
        """
        Automatically detect image patterns in a folder.

        The folder is always searched recursively.

        Args:
            folder_path: Path to the folder to search (string or Path object)
            well_filter: List of wells to include (must not be empty)
            extensions: List of file extensions to include
            group_by: Component to group each well's patterns by; when falsy
                the patterns of a well are returned as a flat list
            variable_components: List of components that can vary in the pattern
            backend: Backend to use for file operations (required)

        Returns:
            Dictionary mapping each well to either a list of pattern strings
            (no group_by) or a dictionary of component value -> patterns.
            Empty dictionary when no file matched the well filter.

        Raises:
            FileNotFoundError: If folder_path does not exist
            ValueError: If well_filter is empty, or if pattern generation or grouping fails
            TypeError: If the pattern generator returns a non-string pattern
        """
        files_by_well = self._find_and_filter_images(
            folder_path, well_filter, extensions, True, backend
        )

        if not files_by_well:
            return {}

        result = {}
        for well, files in files_by_well.items():
            patterns = self._generate_patterns_for_files(files, variable_components)

            # Validate patterns before they are grouped or handed to callers.
            for pattern in patterns:
                if not isinstance(pattern, str):
                    raise TypeError(f"Pattern generator returned invalid type: {type(pattern).__name__}")

            result[well] = (
                self.group_patterns_by_component(patterns, component=group_by)
                if group_by else patterns
            )

        return result

    def _find_and_filter_images(
        self,
        folder_path: Union[str, Path],
        well_filter: List[str],
        extensions: List[str],
        recursive: bool,
        backend: str
    ) -> Dict[str, List[Any]]:
        """
        Find all image files in a directory and filter by well.

        Files whose names cannot be parsed, or whose parsed metadata has no
        well in *well_filter*, are skipped.

        Args:
            folder_path: Path to the folder to search (string or Path object)
            well_filter: List of wells to include
            extensions: List of file extensions to include; a falsy value
                falls back to the TIFF extensions
            recursive: Whether to search recursively
            backend: Backend to use for file operations (required)

        Returns:
            Dictionary mapping wells to lists of image paths (as returned by
            the FileManager)

        Raises:
            FileNotFoundError: If folder_path does not exist
            ValueError: If well_filter is empty
        """
        # Convert to Path and validate using FileManager abstraction
        folder_path = Path(folder_path)
        if not self.filemanager.exists(str(folder_path), backend):
            raise FileNotFoundError(f"Folder not found: {folder_path}")

        if not well_filter:
            raise ValueError("well_filter cannot be empty")

        extensions = extensions or ['.tif', '.TIF', '.tiff', '.TIFF']

        image_paths = self.filemanager.list_image_files(folder_path, backend, extensions=extensions, recursive=recursive)

        # Build the lookup once instead of scanning the list for every file.
        wanted_wells = set(well_filter)

        files_by_well = defaultdict(list)
        for img_path in image_paths:
            filename = self._filename_from_path(img_path)
            if filename is None:
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            # .get() so metadata without a 'well' entry is skipped like an
            # unparseable file instead of aborting the scan with KeyError.
            well = metadata.get('well')
            if well not in wanted_wells:
                continue

            files_by_well[well].append(img_path)

        return files_by_well

    def _generate_patterns_for_files(
        self,
        files: List[Any],
        variable_components: List[str]
    ) -> List[str]:
        """
        Generate patterns for a list of files.

        Files are grouped by the values of their non-variable components; one
        pattern is produced per group, with every variable component replaced
        by the placeholder. Patterns are returned in the order in which each
        group was first encountered.

        Args:
            files: List of file path objects representing files
            variable_components: List of components that can vary in the pattern

        Returns:
            List of pattern strings

        Raises:
            TypeError: If files or variable_components is not a list
            ValueError: If a template lacks a well, a pattern template cannot
                be instantiated, or no pattern could be generated at all
        """
        if not isinstance(files, list):
            raise TypeError(f"Expected list of file path objects, got {type(files).__name__}")

        if not isinstance(variable_components, list):
            raise TypeError(f"Expected list of variable components, got {type(variable_components).__name__}")

        components = self.parser.FILENAME_COMPONENTS

        # Only the first file's metadata of each group is needed as the template.
        template_by_key: Dict[str, Dict[str, Any]] = {}
        for file_path in files:
            filename = self._filename_from_path(file_path)
            if filename is None:
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            key = ",".join(
                f"{comp}={metadata[comp]}"
                for comp in components
                if comp in metadata and comp not in variable_components and metadata[comp] is not None
            )
            template_by_key.setdefault(key, metadata)

        patterns = []
        for template_metadata in template_by_key.values():
            pattern_args = {
                comp: (self.PLACEHOLDER_PATTERN if comp in variable_components else template_metadata[comp])
                for comp in components
                if comp in template_metadata
            }

            # Clause 93 - Declarative Execution Enforcement
            # Ensure all required components are present
            if pattern_args.get('well') is None:
                raise ValueError("Clause 93 Violation: 'well' is a required component for pattern templates")

            pattern_str = self.parser.construct_filename(
                well=pattern_args['well'],
                site=pattern_args.get('site'),
                channel=pattern_args.get('channel'),
                z_index=pattern_args.get('z_index'),
                extension=pattern_args.get('extension') or DEFAULT_IMAGE_EXTENSION
            )

            # Validate that the pattern can be instantiated
            test_instance = pattern_str.replace(self.PLACEHOLDER_PATTERN, self._PLACEHOLDER_SAMPLE)
            if not self.parser.parse_filename(test_instance):
                raise ValueError(f"Clause 93 Violation: Pattern template '{pattern_str}' cannot be instantiated")

            patterns.append(pattern_str)

        # Clause 92 - Structural Validation First
        # Validate the final pattern list
        if not patterns:
            raise ValueError(
                "No patterns generated from files. This indicates either: "
                "(1) no image files found in the directory, "
                "(2) files don't match the expected naming convention, or "
                "(3) pattern generation logic failed. "
                "Check that image files exist and follow the expected naming pattern."
            )

        return patterns