Coverage for openhcs/formats/pattern/pattern_discovery.py: 69.0%

155 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-14 05:57 +0000

1""" 

2Pattern discovery engine for OpenHCS. 

3 

4This module provides a dedicated engine for discovering and grouping patterns 

5in microscopy image files, separating this responsibility from FilenameParser. 

6""" 

7 

8# Standard Library 

9import logging 

10import os 

11import re 

12from collections import defaultdict 

13from pathlib import Path 

14from typing import Any, Dict, List, Optional, Union 

15 

16from openhcs.constants.constants import DEFAULT_IMAGE_EXTENSION 

17from openhcs.io.filemanager import FileManager 

18# Core OpenHCS Interfaces 

19from openhcs.microscopes.microscope_interfaces import FilenameParser 

20 

21logger = logging.getLogger(__name__) 

22 

23# Pattern utility functions 

def has_placeholders(pattern: str) -> bool:
    """Check if pattern contains a placeholder variable such as ``{iii}``.

    A placeholder is an opening brace followed, somewhere later in the
    string, by a closing brace. Requiring that order keeps a literal name
    with stray braces in the wrong order (e.g. ``a}b{c``) from being
    treated as a template.

    Args:
        pattern: Pattern string or literal filename to inspect.

    Returns:
        True if an opening brace is followed by a closing brace.
    """
    open_at = pattern.find('{')
    # str.find returns -1 when the character is absent.
    return open_at != -1 and pattern.find('}', open_at + 1) != -1

27 

28 

class PatternDiscoveryEngine:
    """
    Engine for discovering and grouping patterns in microscopy image files.

    This class is responsible for:
    - Finding image files in directories
    - Filtering files based on well IDs
    - Generating patterns from files
    - Grouping patterns by components

    It works with a FilenameParser to parse individual filenames and a
    FileManager to access the file system.
    """

    # Token that stands in for a variable component inside a pattern string.
    PLACEHOLDER_PATTERN = '{iii}'

    def __init__(self, parser: "FilenameParser", filemanager: "FileManager"):
        """
        Initialize the pattern discovery engine.

        Args:
            parser: Parser for microscopy filenames
            filemanager: FileManager for file system operations
        """
        self.parser = parser
        self.filemanager = filemanager

    @staticmethod
    def _filename_from_path(file_path: Any) -> Optional[str]:
        """Return the final path component of a str or Path entry, else None."""
        if isinstance(file_path, str):
            return os.path.basename(file_path)
        if isinstance(file_path, Path):
            return file_path.name
        return None

    def path_list_from_pattern(
        self,
        directory: Union[str, Path],
        pattern: str,
        backend: str,
        variable_components: Optional[List[str]] = None
    ) -> List[str]:
        """
        Get a list of filenames matching a pattern in a directory.

        Args:
            directory: Directory to search (string or Path object)
            pattern: Pattern to match (string with optional {iii} placeholders)
            backend: Backend to use for file operations (required)
            variable_components: List of components that can vary (will be ignored during matching)

        Returns:
            List of matching filenames (names only, without the directory).
            Empty if a literal filename does not exist or the pattern
            template cannot be parsed.

        Raises:
            FileNotFoundError: If directory is not a directory on the backend
        """
        directory_path = str(directory)  # Keep as string for FileManager consistency
        if not self.filemanager.is_dir(directory_path, backend):
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        pattern_str = str(pattern)

        # Literal filenames (no placeholders) only need an existence check.
        if not has_placeholders(pattern_str):
            literal_path = os.path.join(directory_path, pattern_str)
            if self.filemanager.exists(literal_path, backend):
                return [pattern_str]
            return []

        logger.debug("Using pattern template: %s", pattern_str)

        # Substitute a concrete value so the parser can read the template.
        pattern_template = pattern_str.replace(self.PLACEHOLDER_PATTERN, '001')
        pattern_metadata = self.parser.parse_filename(pattern_template)
        if not pattern_metadata:
            logger.error("Failed to parse pattern template: %s", pattern_template)
            return []

        variable = variable_components or []
        matching_files = []
        for file_path in self.filemanager.list_image_files(directory_path, backend):
            filename = self._filename_from_path(file_path)
            if filename is None:
                # Entries that are neither str nor Path are ignored.
                continue

            file_metadata = self.parser.parse_filename(filename)
            if not file_metadata:
                continue

            if self._matches_pattern_structure(file_metadata, pattern_metadata, variable):
                matching_files.append(filename)

        return matching_files

    def _matches_pattern_structure(
        self,
        file_metadata: Dict[str, Any],
        pattern_metadata: Dict[str, Any],
        variable_components: List[str]
    ) -> bool:
        """
        Check if a file's metadata matches a pattern's structure.

        Args:
            file_metadata: Metadata extracted from actual filename
            pattern_metadata: Metadata extracted from pattern template
            variable_components: List of components that can vary

        Returns:
            True if file matches pattern structure, False otherwise
        """
        for component in self.parser.FILENAME_COMPONENTS:
            # Components the pattern does not define place no constraint.
            if component not in pattern_metadata:
                continue

            file_value = file_metadata.get(component)

            if component in variable_components:
                # Any value is accepted, but the file must have one.
                if file_value is None:
                    return False
                continue

            # Fixed components must match exactly.
            if pattern_metadata[component] != file_value:
                return False

        return True

    def group_patterns_by_component(
        self,
        patterns: List[str],
        component: str
    ) -> Dict[str, List[str]]:
        """
        Group patterns by a required component.

        Args:
            patterns: List of pattern strings to group
            component: Component to group by

        Returns:
            Dictionary mapping component values (as strings) to lists of patterns

        Raises:
            TypeError: If patterns are not strings
            ValueError: If component is empty or not a string, or is not
                present in a pattern
        """
        grouped_patterns = defaultdict(list)

        if not component or not isinstance(component, str):
            raise ValueError(f"Component must be a non-empty string, got {component}")

        if not all(isinstance(p, str) for p in patterns):
            raise TypeError("All patterns must be strings")

        for pattern in patterns:
            # Placeholders are expected here: substitute a concrete value so
            # the parser can extract the grouping component.
            pattern_template = pattern.replace(self.PLACEHOLDER_PATTERN, '001')
            metadata = self.parser.parse_filename(pattern_template)

            if not metadata or metadata.get(component) is None:
                raise ValueError(
                    f"Missing required component '{component}' in pattern: {pattern}"
                )

            grouped_patterns[str(metadata[component])].append(pattern)

        return grouped_patterns

    def auto_detect_patterns(
        self,
        folder_path: Union[str, Path],
        well_filter: List[str],
        extensions: List[str],
        group_by: Optional[str],
        variable_components: List[str],
        backend: str
    ) -> Dict[str, Any]:
        """
        Automatically detect image patterns in a folder.

        The folder is searched recursively.

        Args:
            folder_path: Folder to search (string or Path object)
            well_filter: Wells to include
            extensions: File extensions to include
            group_by: Component to group each well's patterns by; when
                falsy the patterns are returned as a flat list per well
            variable_components: Components replaced by the placeholder
            backend: Backend to use for file operations

        Returns:
            Dictionary mapping each well either to its list of patterns or,
            when group_by is given, to a dictionary of patterns keyed by the
            group_by component value. Empty if no files matched.

        Raises:
            TypeError: If pattern generation yields a non-string pattern
        """
        files_by_well = self._find_and_filter_images(
            folder_path, well_filter, extensions, True, backend
        )

        if not files_by_well:
            return {}

        result = {}
        for well, files in files_by_well.items():
            patterns = self._generate_patterns_for_files(files, variable_components)

            for pattern in patterns:
                if not isinstance(pattern, str):
                    raise TypeError(f"Pattern generator returned invalid type: {type(pattern).__name__}")

            result[well] = (
                self.group_patterns_by_component(patterns, component=group_by)
                if group_by else patterns
            )

        return result

    def _find_and_filter_images(
        self,
        folder_path: Union[str, Path],
        well_filter: List[str],
        extensions: List[str],
        recursive: bool,
        backend: str
    ) -> Dict[str, List[Any]]:
        """
        Find all image files in a directory and filter by well.

        Args:
            folder_path: Path to the folder to search (string or Path object)
            well_filter: List of wells to include
            extensions: List of file extensions to include; a falsy value
                selects the .tif/.tiff defaults
            recursive: Whether to search recursively
            backend: Backend to use for file operations (required)

        Returns:
            Dictionary mapping wells to lists of image paths. Files that
            cannot be parsed, have no well, or whose well is not in
            well_filter are left out.

        Raises:
            TypeError: If folder_path is not a string or Path object
            FileNotFoundError: If folder_path does not exist on the backend
            ValueError: If well_filter is empty
        """
        # Normalise via Path, then hand FileManager a string throughout.
        folder = str(Path(folder_path))
        if not self.filemanager.exists(folder, backend):
            raise FileNotFoundError(f"Folder not found: {folder}")

        if not well_filter:
            raise ValueError("well_filter cannot be empty")

        extensions = extensions or ['.tif', '.TIF', '.tiff', '.TIFF']

        image_paths = self.filemanager.list_image_files(
            folder, backend, extensions=extensions, recursive=recursive
        )

        files_by_well = defaultdict(list)
        for img_path in image_paths:
            filename = self._filename_from_path(img_path)
            if filename is None:
                logger.warning("Unexpected file path type: %s", type(img_path).__name__)
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            # A parser may omit 'well'; such files cannot be assigned to a
            # well, so skip them instead of raising KeyError.
            well = metadata.get('well')
            if well is None or well not in well_filter:
                continue

            files_by_well[well].append(img_path)

        return files_by_well

    def _generate_patterns_for_files(
        self,
        files: List[Any],
        variable_components: List[str]
    ) -> List[str]:
        """
        Generate patterns for a list of files.

        Files are grouped by the values of their non-variable components;
        one pattern is built per group from the first file seen in it, with
        every variable component replaced by the placeholder.

        Args:
            files: List of file paths (str or Path)
            variable_components: List of components that can vary in the pattern

        Returns:
            List of pattern strings

        Raises:
            TypeError: If files or variable_components is not a list
            ValueError: If a template lacks 'well', a pattern cannot be
                parsed back, or no pattern could be generated
        """
        if not isinstance(files, list):
            raise TypeError(f"Expected list of file path objects, got {type(files).__name__}")

        if not isinstance(variable_components, list):
            raise TypeError(f"Expected list of variable components, got {type(variable_components).__name__}")

        components = self.parser.FILENAME_COMPONENTS

        # First parsed metadata seen for each fixed-component combination;
        # dict insertion order keeps patterns in first-seen order.
        template_by_key: Dict[str, Dict[str, Any]] = {}
        for file_path in files:
            filename = self._filename_from_path(file_path)
            if filename is None:
                logger.warning("Unexpected file path type: %s", type(file_path).__name__)
                continue

            metadata = self.parser.parse_filename(filename)
            if not metadata:
                continue

            key = ",".join(
                f"{comp}={metadata[comp]}"
                for comp in components
                if comp in metadata
                and comp not in variable_components
                and metadata[comp] is not None
            )
            template_by_key.setdefault(key, metadata)

        patterns = []
        for template_metadata in template_by_key.values():
            pattern_args = {
                comp: (
                    self.PLACEHOLDER_PATTERN
                    if comp in variable_components
                    else template_metadata[comp]
                )
                for comp in components
                if comp in template_metadata
            }

            # Clause 93 — Declarative Execution Enforcement:
            # 'well' must be present before a filename is constructed.
            if pattern_args.get('well') is None:
                raise ValueError("Clause 93 Violation: 'well' is a required component for pattern templates")

            pattern_str = self.parser.construct_filename(
                well=pattern_args['well'],
                site=pattern_args.get('site'),
                channel=pattern_args.get('channel'),
                z_index=pattern_args.get('z_index'),
                extension=pattern_args.get('extension') or DEFAULT_IMAGE_EXTENSION
            )

            # The pattern must parse again once the placeholder is filled in.
            test_instance = pattern_str.replace(self.PLACEHOLDER_PATTERN, '001')
            if not self.parser.parse_filename(test_instance):
                raise ValueError(f"Clause 93 Violation: Pattern template '{pattern_str}' cannot be instantiated")

            patterns.append(pattern_str)

        # Clause 92 — Structural Validation First:
        # an empty result is reported as an error, not returned.
        if not patterns:
            raise ValueError(
                "No patterns generated from files. This indicates either: "
                "(1) no image files found in the directory, "
                "(2) files don't match the expected naming convention, or "
                "(3) pattern generation logic failed. "
                "Check that image files exist and follow the expected naming pattern."
            )

        return patterns