Coverage for openhcs/formats/pattern/pattern_discovery.py: 75.4%

166 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-04 02:09 +0000

1""" 

2Pattern discovery engine for OpenHCS. 

3 

4This module provides a dedicated engine for discovering and grouping patterns 

5in microscopy image files, separating this responsibility from FilenameParser. 

6""" 

7 

8# Standard Library 

9import logging 

10import os 

11from collections import defaultdict 

12from pathlib import Path 

13from typing import Any, Dict, List, Optional, Union 

14 

15from openhcs.constants.constants import DEFAULT_IMAGE_EXTENSION 

16from openhcs.io.filemanager import FileManager 

17# Core OpenHCS Interfaces 

18from openhcs.microscopes.microscope_interfaces import FilenameParser 

19 

20# Note: Previously used GenericPatternEngine, but now we always use microscope-specific parsers 

21 

22logger = logging.getLogger(__name__) 

23 

24# Pattern utility functions 

def has_placeholders(pattern: str) -> bool:
    """
    Check if pattern contains placeholder variables.

    A placeholder is an opening brace that is followed, somewhere later in the
    string, by a closing brace (e.g. ``{iii}``). Stray braces in the wrong
    order (``"}{"``) or a single unmatched brace do not count.

    Args:
        pattern: Pattern string to inspect

    Returns:
        True if the pattern contains at least one ``{...}`` span, False otherwise
    """
    opening = pattern.find('{')
    # Only a '}' located after the first '{' can close a placeholder.
    return opening != -1 and pattern.find('}', opening + 1) != -1

28 

29 

class PatternDiscoveryEngine:
    """
    Engine for discovering and grouping patterns in microscopy image files.

    This class is responsible for:
    - Finding image files in directories
    - Filtering files based on the multiprocessing axis (e.g. well IDs)
    - Generating patterns from files
    - Grouping patterns by components

    It works with a FilenameParser to parse individual filenames and a
    FileManager to access the file system.
    """

    # Token that marks a variable component inside a pattern string.
    PLACEHOLDER_PATTERN = '{iii}'

    def __init__(self, parser: "FilenameParser", filemanager: "FileManager"):
        """
        Initialize the pattern discovery engine.

        Args:
            parser: Parser for microscopy filenames
            filemanager: FileManager for file system operations
        """
        self.parser = parser
        self.filemanager = filemanager

    @staticmethod
    def _filename_from_path(file_path: Any) -> Optional[str]:
        """
        Return the final path component of a listing entry.

        Args:
            file_path: Entry returned by the FileManager (str or Path expected)

        Returns:
            The filename, or None (after logging a warning) when the entry is
            neither a str nor a Path.
        """
        if isinstance(file_path, str):
            return os.path.basename(file_path)
        if isinstance(file_path, Path):
            return file_path.name
        logger.warning("Unexpected file path type: %s", type(file_path).__name__)
        return None

    def path_list_from_pattern(self, directory: Union[str, Path], pattern: str, backend: str, variable_components: Optional[List[str]] = None) -> List[str]:
        """
        Get a list of filenames matching a pattern in a directory.

        Args:
            directory: Directory to search (string or Path object)
            pattern: Pattern to match (string with optional {iii} placeholders)
            backend: Backend to use for file operations (required)
            variable_components: List of components that can vary (will be ignored during matching)

        Returns:
            List of matching filenames (names only, without the directory).
            Empty if a literal filename does not exist or if the pattern
            template cannot be parsed.

        Raises:
            FileNotFoundError: If directory does not exist
        """
        directory_path = str(directory)  # Keep as string for FileManager consistency
        if not self.filemanager.is_dir(directory_path, backend):
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        pattern_str = str(pattern)

        # Literal filenames (no placeholders) only need an existence check.
        if not has_placeholders(pattern_str):
            literal_path = os.path.join(directory_path, pattern_str)
            if self.filemanager.exists(literal_path, backend):
                return [pattern_str]
            return []

        logger.debug("Using pattern template: %s", pattern_str)

        # Substitute a concrete value for the placeholder so the parser can
        # read the pattern as if it were a real filename.
        pattern_template = pattern_str.replace(self.PLACEHOLDER_PATTERN, '001')
        pattern_metadata = self.parser.parse_filename(pattern_template)
        if not pattern_metadata:
            logger.error("Failed to parse pattern template: %s", pattern_template)
            return []

        all_files = self.filemanager.list_image_files(directory_path, backend)
        allowed_to_vary = variable_components or []

        matching_files = []
        for file_path in all_files:
            filename = self._filename_from_path(file_path)
            if filename is None:
                continue

            file_metadata = self.parser.parse_filename(filename)
            if not file_metadata:
                continue

            if self._matches_pattern_structure(file_metadata, pattern_metadata, allowed_to_vary):
                matching_files.append(filename)

        return matching_files

    def _matches_pattern_structure(self, file_metadata: Dict[str, Any], pattern_metadata: Dict[str, Any], variable_components: List[str]) -> bool:
        """
        Check if a file's metadata matches a pattern's structure.

        Only components listed in the parser's FILENAME_COMPONENTS and present
        in the pattern metadata are compared.

        Args:
            file_metadata: Metadata extracted from actual filename
            pattern_metadata: Metadata extracted from pattern template
            variable_components: List of components that can vary

        Returns:
            True if file matches pattern structure, False otherwise
        """
        for component in self.parser.FILENAME_COMPONENTS:
            if component not in pattern_metadata:
                continue

            file_value = file_metadata.get(component)

            if component in variable_components:
                # A variable component may hold any value, but it must be present.
                if file_value is None:
                    return False
                continue

            # Fixed components must match exactly.
            if pattern_metadata[component] != file_value:
                return False

        return True

    def group_patterns_by_component(
        self,
        patterns: List[str],
        component: str
    ) -> Dict[str, List[str]]:
        """
        Group patterns by a required component.

        Patterns are expected to contain placeholders; each one is turned into
        a parseable filename by substituting a concrete value before parsing.

        Args:
            patterns: List of pattern strings to group
            component: Component to group by

        Returns:
            Dictionary mapping component values (as strings) to lists of patterns

        Raises:
            TypeError: If patterns are not strings
            ValueError: If component is empty or not a string, or if it is
                missing (or None) in a pattern's parsed metadata
        """
        if not component or not isinstance(component, str):
            raise ValueError(f"Component must be a non-empty string, got {component}")

        if not all(isinstance(p, str) for p in patterns):
            raise TypeError("All patterns must be strings")

        grouped_patterns = defaultdict(list)
        for pattern in patterns:
            pattern_template = pattern.replace(self.PLACEHOLDER_PATTERN, '001')
            metadata = self.parser.parse_filename(pattern_template)

            if not metadata or metadata.get(component) is None:
                raise ValueError(
                    f"Missing required component '{component}' in pattern: {pattern}"
                )

            grouped_patterns[str(metadata[component])].append(pattern)

        return grouped_patterns

    def auto_detect_patterns(
        self,
        folder_path: Union[str, Path],
        variable_components: List[str],
        backend: str,
        extensions: Optional[List[str]] = None,
        group_by=None,  # Accept GroupBy enum, plain component string, or None
        recursive: bool = False,
        **kwargs  # Dynamic filter parameters (e.g., well_filter, site_filter)
    ) -> Dict[str, Any]:
        """
        Automatically detect image patterns in a folder.

        Args:
            folder_path: Path to the folder to search (string or Path object)
            variable_components: List of components that can vary in the patterns
            backend: Backend to use for file operations (required)
            extensions: File extensions to include (None uses the default set)
            group_by: Object with a ``value`` attribute naming a component
                (e.g. a GroupBy enum member), a component name string, or None
            recursive: Currently not consulted; the search is always recursive
            **kwargs: Dynamic filters. Only ``<axis>_filter`` is read, where
                ``<axis>`` is ``MULTIPROCESSING_AXIS.value``

        Returns:
            Dictionary keyed by axis value. Each entry is a list of patterns,
            or, when a grouping component is given, a dictionary mapping that
            component's values to lists of patterns. Empty if no files match.

        Raises:
            FileNotFoundError: If folder_path does not exist
            ValueError: If the axis filter is missing/empty or pattern generation fails
            TypeError: If a generated pattern is not a string
        """
        # Local import, as in the rest of this module's use of this constant.
        from openhcs.constants import MULTIPROCESSING_AXIS
        axis_filter = kwargs.get(f"{MULTIPROCESSING_AXIS.value}_filter")

        # NOTE(review): `recursive` is ignored and True is always passed here.
        # Kept as-is because callers may depend on the recursive search; confirm
        # whether the parameter should be honored.
        files_by_axis = self._find_and_filter_images(
            folder_path, axis_filter, extensions, True, backend
        )

        if not files_by_axis:
            return {}

        # Resolve the grouping component once: enum members expose `.value`,
        # plain strings are used directly.
        component_string = getattr(group_by, "value", group_by) if group_by else None

        result = {}
        for axis_value, files in files_by_axis.items():
            patterns = self._generate_patterns_for_files(files, variable_components, axis_value)

            for pattern in patterns:
                if not isinstance(pattern, str):
                    raise TypeError(f"Pattern generator returned invalid type: {type(pattern).__name__}")

            if component_string:
                result[axis_value] = self.group_patterns_by_component(patterns, component=component_string)
            else:
                result[axis_value] = patterns

        return result

    def _find_and_filter_images(
        self,
        folder_path: Union[str, Path],
        axis_filter: Optional[List[str]],
        extensions: Optional[List[str]],
        recursive: bool,
        backend: str
    ) -> Dict[str, List[Any]]:
        """
        Find all image files in a directory and filter by multiprocessing axis.

        Args:
            folder_path: Path to the folder to search (string or Path object)
            axis_filter: List of axis values to include
            extensions: List of file extensions to include (None or empty uses
                '.tif', '.TIF', '.tiff', '.TIFF')
            recursive: Whether to search recursively
            backend: Backend to use for file operations (required)

        Returns:
            Dictionary mapping axis values to lists of image paths. Files that
            cannot be parsed, or whose axis value is missing or not in
            axis_filter, are left out.

        Raises:
            FileNotFoundError: If folder_path does not exist
            ValueError: If axis_filter is empty
        """
        folder_path = Path(folder_path)
        if not self.filemanager.exists(str(folder_path), backend):
            raise FileNotFoundError(f"Folder not found: {folder_path}")

        if not axis_filter:
            raise ValueError("axis_filter cannot be empty")

        extensions = extensions or ['.tif', '.TIF', '.tiff', '.TIFF']

        image_paths = self.filemanager.list_image_files(folder_path, backend, extensions=extensions, recursive=recursive)

        # Get multiprocessing axis dynamically from configuration (resolved
        # once, not per file).
        from openhcs.constants import MULTIPROCESSING_AXIS
        axis_key = MULTIPROCESSING_AXIS.value

        files_by_axis = defaultdict(list)
        for img_path in image_paths:
            filename = self._filename_from_path(img_path)
            if filename is None:
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            axis_value = metadata.get(axis_key)
            if not axis_value or axis_value not in axis_filter:
                continue

            files_by_axis[axis_value].append(img_path)

        return files_by_axis

    def _generate_patterns_for_files(
        self,
        files: List[Any],
        variable_components: List[str],
        axis_value: str
    ) -> List[str]:
        """
        Generate patterns for a list of files.

        Files are bucketed by the values of their non-variable components; one
        pattern is produced per bucket, with every variable component replaced
        by the placeholder.

        Args:
            files: List of file paths (str or Path) representing files
            variable_components: List of components that can vary in the pattern
            axis_value: Axis value the files belong to (not used by this method)

        Returns:
            List of pattern strings

        Raises:
            TypeError: If files or variable_components is not a list
            ValueError: If no pattern could be generated or a generated
                pattern cannot be parsed back by the parser
        """
        if not isinstance(files, list):
            raise TypeError(f"Expected list of file path objects, got {type(files).__name__}")

        if not isinstance(variable_components, list):
            raise TypeError(f"Expected list of variable components, got {type(variable_components).__name__}")

        components = self.parser.FILENAME_COMPONENTS

        # Bucket files by their fixed (non-variable, non-None) component values.
        component_combinations = defaultdict(list)
        for file_path in files:
            filename = self._filename_from_path(file_path)
            if filename is None:
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            key = ",".join(
                f"{comp}={metadata[comp]}"
                for comp in components
                if comp in metadata and comp not in variable_components and metadata[comp] is not None
            )
            component_combinations[key].append((file_path, metadata))

        patterns = []
        for files_metadata in component_combinations.values():
            if not files_metadata:
                continue

            # The first file of the bucket supplies the fixed component values.
            _, template_metadata = files_metadata[0]
            pattern_args = {
                comp: (self.PLACEHOLDER_PATTERN if comp in variable_components else template_metadata[comp])
                for comp in components
                if comp in template_metadata
            }

            # Clause 93 - Declarative Execution Enforcement
            # Ensure pattern generation succeeded
            if not pattern_args:
                raise ValueError("Clause 93 Violation: No components found in template metadata for pattern generation")

            # Pass every component the parser declares, dynamically.
            extension = pattern_args.get('extension') or DEFAULT_IMAGE_EXTENSION
            component_kwargs = {
                comp: pattern_args[comp]
                for comp in self.parser.get_component_names()
                if comp in pattern_args
            }

            pattern_str = self.parser.construct_filename(
                extension=extension,
                **component_kwargs
            )

            # Validate that the pattern can be instantiated
            test_instance = pattern_str.replace(self.PLACEHOLDER_PATTERN, '001')
            if not self.parser.parse_filename(test_instance):
                raise ValueError(f"Clause 93 Violation: Pattern template '{pattern_str}' cannot be instantiated")

            patterns.append(pattern_str)

        # Clause 92 - Structural Validation First
        # Validate the final pattern list
        if not patterns:
            raise ValueError(
                "No patterns generated from files. This indicates either: "
                "(1) no image files found in the directory, "
                "(2) files don't match the expected naming convention, or "
                "(3) pattern generation logic failed. "
                "Check that image files exist and follow the expected naming pattern."
            )

        return patterns