Coverage for openhcs/processing/backends/experimental_analysis/format_registry.py: 38.8%

53 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-01 18:33 +0000

"""
Base classes for microscope format registry system.

This module provides the abstract base class and common functionality for
microscope format registries, following OpenHCS registry architecture patterns.
"""

7 

from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Union

import pandas as pd

13 

14 

@dataclass(frozen=True)
class MicroscopeFormatConfig:
    """Configuration for microscope format processing.

    The dataclass is frozen: fields cannot be reassigned once an instance
    has been created.
    """

    # Name of the microscope format this configuration describes.
    format_name: str
    # Worksheet name for Excel inputs; None when no specific sheet is set.
    sheet_name: Optional[str]
    # File extensions accepted for this format.
    supported_extensions: Tuple[str, ...]
    # Identifier of the feature-extraction routine (the registry base class
    # builds it as "extract_features_<lower-cased format name>").
    feature_extraction_method: str
    # Identifier of the plate-detection routine (the registry base class
    # builds it as "extract_plates_<lower-cased format name>").
    plate_detection_method: str

23 

24 

class MicroscopeFormatRegistryBase(ABC):
    """
    Abstract base class for microscope format registries.

    Following OpenHCS registry patterns, this provides a unified interface
    for processing different microscope data formats while eliminating
    code duplication and hardcoded format-specific logic.

    Subclasses define the three class attributes below and implement the
    four abstract extraction/structuring methods; reading, orchestration
    and validation are provided here.
    """

    # Abstract class attributes - each implementation must define these
    FORMAT_NAME: str
    SHEET_NAME: Optional[str]  # None means use first sheet
    SUPPORTED_EXTENSIONS: Tuple[str, ...]

    def __init__(self):
        """Initialize registry with format configuration."""
        format_key = self.FORMAT_NAME.lower()
        self.config = MicroscopeFormatConfig(
            format_name=self.FORMAT_NAME,
            sheet_name=self.SHEET_NAME,
            supported_extensions=self.SUPPORTED_EXTENSIONS,
            feature_extraction_method=f"extract_features_{format_key}",
            plate_detection_method=f"extract_plates_{format_key}",
        )

    @property
    def format_name(self) -> str:
        """Get format name for this registry."""
        return self.FORMAT_NAME

    @abstractmethod
    def extract_features(self, raw_df: pd.DataFrame) -> List[str]:
        """
        Extract feature column names from raw microscope data.

        Args:
            raw_df: Raw data DataFrame from microscope

        Returns:
            List of feature column names

        Raises:
            ValueError: If feature extraction fails
        """

    @abstractmethod
    def extract_plate_names(self, raw_df: pd.DataFrame) -> List[str]:
        """
        Extract plate identifiers from raw microscope data.

        Args:
            raw_df: Raw data DataFrame from microscope

        Returns:
            List of plate identifiers

        Raises:
            ValueError: If plate extraction fails
        """

    @abstractmethod
    def create_plates_dict(self, raw_df: pd.DataFrame) -> Dict[str, Dict[str, Dict[str, Any]]]:
        """
        Create nested dictionary structure for plate data.

        Args:
            raw_df: Raw data DataFrame from microscope

        Returns:
            Dictionary structure: {plate_id: {well_id: {feature: value}}}

        Raises:
            ValueError: If data structure creation fails
        """

    @abstractmethod
    def fill_plates_dict(
        self,
        raw_df: pd.DataFrame,
        plates_dict: Dict[str, Dict[str, Dict[str, Any]]],
    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
        """
        Fill plates dictionary with actual measurement values.

        Args:
            raw_df: Raw data DataFrame from microscope
            plates_dict: Empty plates dictionary structure

        Returns:
            Filled plates dictionary with measurement values

        Raises:
            ValueError: If data filling fails
        """

    def read_results(self, results_path: Union[str, Path]) -> pd.DataFrame:
        """
        Read results file using format-specific logic.

        ``.csv`` files are read with ``pd.read_csv``; any other supported
        extension is treated as an Excel workbook, from which ``SHEET_NAME``
        (or the first sheet when ``SHEET_NAME`` is falsy) is read.

        Args:
            results_path: Path to results file (string or ``Path``)

        Returns:
            Raw data DataFrame

        Raises:
            FileNotFoundError: If results file doesn't exist
            ValueError: If file format is not supported
        """
        results_file = Path(results_path)

        if not results_file.exists():
            raise FileNotFoundError(f"Results file not found: {results_path}")

        # Compare extensions case-insensitively so the support check and the
        # CSV/Excel dispatch below can never disagree (e.g. "results.CSV").
        suffix = results_file.suffix.lower()
        supported = {ext.lower() for ext in self.SUPPORTED_EXTENSIONS}
        if suffix not in supported:
            raise ValueError(f"Unsupported file extension {results_file.suffix} for format {self.FORMAT_NAME}")

        # Dispatch on the parsed suffix rather than str.endswith so that
        # Path arguments work as well as plain strings.
        if suffix == '.csv':
            return pd.read_csv(results_file)

        # Excel file: the context manager closes the workbook handle even if
        # reading the sheet raises.
        with pd.ExcelFile(results_file) as xls:
            sheet_name = self.SHEET_NAME if self.SHEET_NAME else xls.sheet_names[0]
            return pd.read_excel(xls, sheet_name=sheet_name)

    def process_data(self, results_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Complete data processing pipeline for this format.

        Reads the file, then calls ``extract_features``,
        ``extract_plate_names``, ``create_plates_dict`` and
        ``fill_plates_dict`` in that order.

        Args:
            results_path: Path to results file

        Returns:
            Dictionary with keys ``raw_df``, ``features``, ``plate_names``,
            ``plates_dict`` (the filled structure) and ``format_name``

        Raises:
            FileNotFoundError: If results file doesn't exist
            ValueError: If data processing fails
        """
        # Read raw data
        raw_df = self.read_results(results_path)

        # Extract features and plates
        features = self.extract_features(raw_df)
        plate_names = self.extract_plate_names(raw_df)

        # Create and fill data structures
        plates_dict = self.create_plates_dict(raw_df)
        filled_plates_dict = self.fill_plates_dict(raw_df, plates_dict)

        return {
            'raw_df': raw_df,
            'features': features,
            'plate_names': plate_names,
            'plates_dict': filled_plates_dict,
            'format_name': self.FORMAT_NAME,
        }

    def validate_data_structure(self, data: Dict[str, Any]) -> bool:
        """
        Validate processed data structure.

        Checks that every key produced by ``process_data`` is present and
        that the ``features`` and ``plate_names`` entries are non-empty.

        Args:
            data: Processed data dictionary

        Returns:
            True if data structure is valid

        Raises:
            ValueError: If validation fails
        """
        required_keys = ['raw_df', 'features', 'plate_names', 'plates_dict', 'format_name']

        # Report the first missing key, in the order listed above.
        for key in required_keys:
            if key not in data:
                raise ValueError(f"Missing required key in data structure: {key}")

        if not data['features']:
            raise ValueError("No features extracted from data")

        if not data['plate_names']:
            raise ValueError("No plates detected in data")

        return True

207 

208 

class FormatDetectionError(Exception):
    """Raised when microscope format cannot be detected."""

212 

213 

class DataProcessingError(Exception):
    """Raised when data processing fails."""