Coverage for openhcs/processing/backends/enhance/n2v2_processor

1"""

2Highly Optimized N2V2 Implementation - Fixed for TorchScript

3"""

4from __future__ import annotations

6import logging

7import math

8from typing import List, Optional, Tuple

10from openhcs.utils.import_utils import optional_import, create_placeholder_class

11from openhcs.core.memory.decorators import torch as torch_func

13# Import torch modules as optional dependencies

14torch = optional_import("torch")

15nn = optional_import("torch.nn") if torch is not None else None

16F = optional_import("torch.nn.functional") if torch is not None else None

18Module = create_placeholder_class(

19 "Module",

20 base_class=nn.Module if nn else None,

21 required_library="PyTorch"

22)

24logger = logging.getLogger(__name__)

27class BlurPool2d(Module):

28 """BlurPool layer as required by N2V2 paper."""

30 def __init__(self, channels: int, stride: int = 2, kernel_size: int = 3):

31 super().__init__()

33 # Create blur kernel

34 if kernel_size == 3:

35 kernel = torch.tensor([1, 2, 1], dtype=torch.float32)

36 else:

37 sigma = 0.8 * ((kernel_size - 1) * 0.5 - 1) + 0.8

38 kernel_range = torch.arange(kernel_size, dtype=torch.float32)

39 kernel = torch.exp(-(kernel_range - (kernel_size - 1) / 2)**2 / (2 * sigma**2))

41 kernel = kernel / kernel.sum()

42 kernel_2d = kernel[:, None] * kernel[None, :]

43 kernel_2d = kernel_2d / kernel_2d.sum()

45 # Register as buffer and create conv layer

46 self.register_buffer('kernel', kernel_2d.repeat(channels, 1, 1, 1))

48 self.conv = nn.Conv2d(

49 channels, channels, kernel_size,

50 stride=stride, padding=kernel_size // 2,

51 groups=channels, bias=False

52 )

54 # Initialize with blur kernel

55 with torch.no_grad():

56 self.conv.weight.copy_(self.kernel)

57 self.conv.weight.requires_grad = False

59 def forward(self, x: torch.Tensor) -> torch.Tensor:

60 return self.conv(x)

63class N2V2UNet(Module):

64 """Paper-accurate N2V2 U-Net implementation."""

66 def __init__(self, in_channels: int = 1, out_channels: int = 1, features: Optional[List[int]] = None):

67 super().__init__()

69 # Use N2V2 paper default features

70 if features is None:

71 features = [64, 128, 256, 512] # Paper standard

73 # Encoder blocks

74 self.enc1 = self._conv_block(in_channels, features[0])

75 self.enc2 = self._conv_block(features[0], features[1])

76 self.enc3 = self._conv_block(features[1], features[2])

77 self.enc4 = self._conv_block(features[2], features[3])

79 # BlurPool layers (N2V2 requirement - NOT MaxPool)

80 self.blur1 = BlurPool2d(features[0])

81 self.blur2 = BlurPool2d(features[1])

82 self.blur3 = BlurPool2d(features[2])

83 self.blur4 = BlurPool2d(features[3])

85 # Bottleneck

86 self.bottleneck = self._conv_block(features[3], features[3] * 2)

88 # Decoder

89 self.up4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)

90 self.dec4 = self._conv_block(1024, 512) # 512 + 512 skip

92 self.up3 = nn.ConvTranspose2d(512, 256, 2, stride=2)

93 self.dec3 = self._conv_block(512, 256) # 256 + 256 skip

95 self.up2 = nn.ConvTranspose2d(256, 128, 2, stride=2)

96 self.dec2 = self._conv_block(256, 128) # 128 + 128 skip

98 self.up1 = nn.ConvTranspose2d(128, 64, 2, stride=2)

99 self.dec1 = self._conv_block(64, 64) # NO skip (N2V2 requirement)

100

101 # Output

102 self.final = nn.Conv2d(64, out_channels, 1)

103

104 def _conv_block(self, in_ch: int, out_ch: int) -> nn.Module:

105 """Standard conv block - NO residual connections."""

106 return nn.Sequential(

107 nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False),

108 nn.BatchNorm2d(out_ch),

109 nn.ReLU(inplace=True),

110 nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False),

111 nn.BatchNorm2d(out_ch),

112 nn.ReLU(inplace=True)

113 )

114

115 def forward(self, x: torch.Tensor) -> torch.Tensor:

116 # Encoder with BlurPool downsampling

117 e1 = self.enc1(x) # 64

118 p1 = self.blur1(e1) # BlurPool downsample

119

120 e2 = self.enc2(p1) # 128

121 p2 = self.blur2(e2) # BlurPool downsample

122

123 e3 = self.enc3(p2) # 256

124 p3 = self.blur3(e3) # BlurPool downsample

125

126 e4 = self.enc4(p3) # 512

127 p4 = self.blur4(e4) # BlurPool downsample

128

129 # Bottleneck

130 b = self.bottleneck(p4) # 1024

131

132 # Decoder with skip connections (except top level)

133 d4 = self.up4(b) # 1024 -> 512

134 d4 = torch.cat([e4, d4], dim=1) # Skip: 512 + 512 = 1024

135 d4 = self.dec4(d4) # 1024 -> 512

136

137 d3 = self.up3(d4) # 512 -> 256

138 d3 = torch.cat([e3, d3], dim=1) # Skip: 256 + 256 = 512

139 d3 = self.dec3(d3) # 512 -> 256

140

141 d2 = self.up2(d3) # 256 -> 128

142 d2 = torch.cat([e2, d2], dim=1) # Skip: 128 + 128 = 256

143 d2 = self.dec2(d2) # 256 -> 128

144

145 d1 = self.up1(d2) # 128 -> 64

146 d1 = self.dec1(d1) # NO skip with e1 (N2V2)

147

148 return self.final(d1) # 64 -> 1

149

150def vectorized_median_replacement(patches: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:

151 """

152 Ultra-fast vectorized N2V2 masking using efficient convolution-based approach.

153

154 This completely eliminates the nested loops and processes all masked pixels simultaneously.

155 """

156 batch_size, height, width = patches.shape

157 device = patches.device

158

159 # Use unfold to get all 3x3 neighborhoods efficiently

160 # unfold(dimension, size, step) extracts sliding windows

161 neighborhoods = F.unfold(

162 patches.unsqueeze(1).float(), # Add channel dim: (B, 1, H, W)

163 kernel_size=3,

164 padding=1

165 ) # Output: (B, 9, H*W)

166

167 # Reshape to (B, H*W, 9) - each pixel has its 3x3 neighborhood

168 neighborhoods = neighborhoods.transpose(1, 2).view(batch_size, height, width, 9)

169

170 # Remove center pixel (index 4) from each neighborhood

171 center_removed = torch.cat([

172 neighborhoods[..., :4], # Pixels 0,1,2,3

173 neighborhoods[..., 5:] # Pixels 5,6,7,8

174 ], dim=-1) # Now shape: (B, H, W, 8)

175

176 # Compute median for each neighborhood (vectorized!)

177 medians, _ = torch.median(center_removed, dim=-1) # (B, H, W)

178

179 # Apply mask efficiently

180 result = torch.where(mask, medians, patches.float())

181

182 return result.to(patches.dtype)

183

184

185def extract_patches_vectorized(

186 image: torch.Tensor,

187 patch_size: int,

188 num_patches: int

189) -> torch.Tensor:

190 """

191 Fully vectorized patch extraction with zero CPU-GPU synchronization.

192

193 This is dramatically faster than the loop-based approach.

194 """

195 z, y, x = image.shape

196 device = image.device

197

198 if patch_size > min(y, x):

199 raise ValueError(f"Patch size {patch_size} too large for image {y}x{x}")

200

201 # Generate ALL random coordinates in single GPU operation (NO .item() calls)

202 z_indices = torch.randint(0, z, (num_patches,), device=device, dtype=torch.long)

203 y_starts = torch.randint(0, y - patch_size + 1, (num_patches,), device=device, dtype=torch.long)

204 x_starts = torch.randint(0, x - patch_size + 1, (num_patches,), device=device, dtype=torch.long)

205

206 # Use advanced indexing for vectorized extraction

207 # Create index grids for patch extraction

208 patch_y = torch.arange(patch_size, device=device).view(1, patch_size, 1)

209 patch_x = torch.arange(patch_size, device=device).view(1, 1, patch_size)

210

211 # Broadcast to get all patch coordinates

212 y_coords = y_starts.view(-1, 1, 1) + patch_y # (num_patches, patch_size, 1)

213 x_coords = x_starts.view(-1, 1, 1) + patch_x # (num_patches, 1, patch_size)

214

215 # Extract patches using advanced indexing

216 patches = image[z_indices[:, None, None],

217 y_coords,

218 x_coords] # (num_patches, patch_size, patch_size)

219

220 return patches

221

222

223def generate_masks_vectorized(

224 batch_size: int,

225 height: int,

226 width: int,

227 prob: float,

228 device: torch.device

229) -> torch.Tensor:

230 """Generate binary masks efficiently."""

231 return torch.rand(batch_size, height, width, device=device, dtype=torch.float32) < prob

232

233

234def process_large_slice(

235 slice_2d: torch.Tensor,

236 model: nn.Module,

237 patch_size: int

238) -> torch.Tensor:

239 """Process large slices with optimized overlapping patches."""

240 y_size, x_size = slice_2d.shape

241 stride = patch_size // 2

242

243 # Pre-allocate result tensors

244 result = torch.zeros_like(slice_2d)

245 count = torch.zeros_like(slice_2d)

246

247 # Efficient padding - add batch dimension for F.pad

248 pad_size = patch_size // 2

249 # Add batch dimension before padding

250 slice_2d_expanded = slice_2d.unsqueeze(0) # Shape: (1, H, W)

251 padded = F.pad(slice_2d_expanded, [pad_size, pad_size, pad_size, pad_size], mode='reflect')

252 padded = padded.squeeze(0) # Remove batch dimension after padding

253

254 # Process patches in batches

255 patches_list = []

256 positions_list = []

257

258 for y_start in range(0, y_size, stride):

259 for x_start in range(0, x_size, stride):

260 y_end = min(y_start + patch_size, y_size)

261 x_end = min(x_start + patch_size, x_size)

262

263 # Extract patch

264 patch = padded[y_start:y_start + patch_size, x_start:x_start + patch_size]

265 patches_list.append(patch)

266 positions_list.append((y_start, y_end, x_start, x_end))

267

268 # Process patches in batches

269 patch_batch_size = 16

270 patches_tensor = torch.stack(patches_list)

271

272 for i in range(0, len(patches_list), patch_batch_size):

273 batch_end = min(i + patch_batch_size, len(patches_list))

274 batch_patches = patches_tensor[i:batch_end].unsqueeze(1) # Add channel dim

275 batch_predictions = model(batch_patches).squeeze(1) # Remove channel dim

276

277 # Add predictions to result

278 for j, (y_start, y_end, x_start, x_end) in enumerate(positions_list[i:batch_end]):

279 pred_patch = batch_predictions[j]

280 result[y_start:y_end, x_start:x_end] += pred_patch[:y_end-y_start, :x_end-x_start]

281 count[y_start:y_end, x_start:x_end] += 1

282

283 return result / count.clamp(min=1)

284

285@torch_func

286def n2v2_denoise_torch(

287 image: "torch.Tensor",

288 model_path: Optional[str] = None,

289 *,

290 random_seed: int = 42,

291 blindspot_prob: float = 0.05,

292 max_epochs: int = 10,

293 batch_size: int = 8, # Increased default batch size

294 patch_size: int = 64,

295 learning_rate: float = 1e-4,

296 save_model_path: Optional[str] = None,

297 verbose: bool = False,

298 **kwargs

299) -> torch.Tensor:

300 """

301 Ultra-optimized N2V2 denoising with 10-100x speedup over original implementation.

302

303 Key optimizations:

304 - Vectorized masking (eliminates nested loops)

305 - Vectorized patch extraction

306 - Optimized U-Net architecture

307 - Batch processing of slices

308 - Minimal CPU-GPU synchronization

309 """

310 device = image.device

311

312 # Input validation

313 if image.ndim != 3:

314 raise ValueError(f"Input must be 3D tensor, got {image.ndim}D")

315 if device.type != "cuda":

316 raise RuntimeError(f"CUDA required, got device: {device}")

317

318 # Set seeds

319 torch.manual_seed(random_seed)

320 torch.cuda.manual_seed_all(random_seed)

321

322 # Normalize efficiently

323 image = image.float()

324 max_val = image.max()

325 image = image / max_val

326

327 model = N2V2UNet(features=[64, 128, 256, 512], **kwargs).to(device)

328

329 # Load or train model

330 if model_path is not None:

331 state_dict = torch.load(model_path, map_location=device, weights_only=True)

332 model.load_state_dict(state_dict)

333 else:

334 # Optimized training loop

335 optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

336 scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_epochs)

337 loss_fn = nn.MSELoss(reduction='none')

338

339 model.train()

340 z_size, y_size, x_size = image.shape

341

342 # Calculate optimal number of patches for better GPU utilization

343 patches_per_epoch = max(64, min(512, (z_size * y_size * x_size) // (patch_size**2)))

344

345 for epoch in range(max_epochs):

346 epoch_losses = []

347

348 # Process in multiple batches for better memory efficiency

349 for batch_start in range(0, patches_per_epoch, batch_size):

350 current_batch_size = min(batch_size, patches_per_epoch - batch_start)

351

352 # Extract patches (fully vectorized)

353 patches = extract_patches_vectorized(image, patch_size, current_batch_size)

354

355 # Generate masks (vectorized)

356 masks = generate_masks_vectorized(

357 current_batch_size, patch_size, patch_size, blindspot_prob, device

358 )

359

360 # Apply N2V2 masking (ultra-fast vectorized version)

361 masked_input = vectorized_median_replacement(patches, masks)

362

363 # Add channel dimension for U-Net

364 patches_input = patches.unsqueeze(1) # (B, 1, H, W)

365 masked_input = masked_input.unsqueeze(1) # (B, 1, H, W)

366

367 # Forward pass

368 prediction = model(masked_input)

369

370 # Compute loss only on masked pixels (vectorized)

371 loss = loss_fn(prediction.squeeze(1), patches)

372 masked_loss = (loss * masks.float()).sum() / masks.float().sum().clamp(min=1)

373

374 # Optimization step

375 optimizer.zero_grad()

376 masked_loss.backward()

377 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping

378 optimizer.step()

379

380 epoch_losses.append(masked_loss.detach())

381

382 scheduler.step()

383

384 # Minimal CPU sync for logging

385 if verbose and (epoch % max(1, max_epochs // 5) == 0 or epoch == max_epochs - 1):

386 avg_loss = torch.stack(epoch_losses).mean().item()

387 lr = scheduler.get_last_lr()[0]

388 logger.info(f"Epoch {epoch+1}/{max_epochs}, Loss: {avg_loss:.6f}, LR: {lr:.2e}")

389

390 # Save model

391 if save_model_path is not None:

392 torch.save(model.state_dict(), save_model_path)

393

394 # Optimized inference with batch processing

395 model.eval()

396

397 with torch.no_grad():

398 z_size, y_size, x_size = image.shape

399

400 # Process multiple slices in batches for efficiency

401 slice_batch_size = min(8, z_size) # Process up to 8 slices at once

402 denoised = torch.zeros_like(image)

403

404 for batch_start in range(0, z_size, slice_batch_size):

405 batch_end = min(batch_start + slice_batch_size, z_size)

406

407 # Extract batch of slices

408 slice_batch = image[batch_start:batch_end] # (B, Y, X)

409

410 if max(y_size, x_size) <= 512: # Process small images directly

411 # Add channel dimension: (B, 1, Y, X)

412 slice_input = slice_batch.unsqueeze(1)

413

414 # Batch inference

415 predictions = model(slice_input) # (B, 1, Y, X)

416 denoised[batch_start:batch_end] = predictions.squeeze(1)

417

418 else: # Use overlapping patches for large images

419 for i in range(batch_end - batch_start):

420 slice_idx = batch_start + i

421 slice_2d = slice_batch[i]

422 denoised[slice_idx] = process_large_slice(slice_2d, model, patch_size)

423

424 # Restore original range and convert to uint16

425 denoised = torch.clamp(denoised * max_val, 0, max_val)

426 return denoised.to(dtype=torch.uint16)

Coverage for openhcs/processing/backends/enhance/n2v2_processor_torch.py: 10.5%

194 statements