Coverage for openhcs/core/memory/gpu_utils.py: 35.8%

83 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-01 18:33 +0000

1""" 

2GPU utility functions for OpenHCS. 

3 

4This module provides utility functions for checking GPU availability 

5across different frameworks (cupy, torch, tensorflow, jax). 

6 

7Doctrinal Clauses: 

8- Clause 88 — No Inferred Capabilities 

9- Clause 293 — GPU Pre-Declaration Enforcement 

10""" 

11 

12import logging 

13import os 

14from typing import Optional 

15 

16from openhcs.core.utils import optional_import 

17 

18logger = logging.getLogger(__name__) 

19 

20 

def check_cupy_gpu_available() -> Optional[int]:
    """
    Probe cupy for a usable CUDA GPU.

    Returns:
        The current CUDA device ID when cupy is installed and CUDA is
        reachable; None in every other case (subprocess runner mode,
        cupy missing, CUDA unavailable, or any probe error).
    """
    # Subprocess runner mode disables all GPU probing up front.
    if os.getenv('OPENHCS_SUBPROCESS_NO_GPU') == '1':
        logger.debug("Subprocess runner mode - skipping cupy GPU check")
        return None

    cp = optional_import("cupy")
    if cp is None:
        logger.debug("Cupy not installed")
        return None

    # Probing can raise (e.g. driver/runtime mismatch); treat any failure
    # as "no GPU" rather than propagating.
    try:
        if not cp.cuda.is_available():
            logger.debug("Cupy CUDA not available")
            return None
        current_device = cp.cuda.get_device_id()
        logger.debug("Cupy GPU available: device_id=%s", current_device)
        return current_device
    except Exception as e:
        logger.debug("Error checking cupy GPU availability: %s", e)
        return None

51 

52 

def check_torch_gpu_available() -> Optional[int]:
    """
    Probe torch for a usable CUDA GPU.

    Returns:
        torch's current CUDA device ID when torch is installed and CUDA is
        reachable; None in every other case (subprocess runner mode, torch
        missing, CUDA unavailable, or any probe error).
    """
    # Subprocess runner mode disables all GPU probing up front.
    if os.getenv('OPENHCS_SUBPROCESS_NO_GPU') == '1':
        logger.debug("Subprocess runner mode - skipping torch GPU check")
        return None

    torch = optional_import("torch")
    if torch is None:
        logger.debug("Torch not installed")
        return None

    # Probing can raise (e.g. broken CUDA install); treat any failure
    # as "no GPU" rather than propagating.
    try:
        if not torch.cuda.is_available():
            logger.debug("Torch CUDA not available")
            return None
        current_device = torch.cuda.current_device()
        logger.debug("Torch GPU available: device_id=%s", current_device)
        return current_device
    except Exception as e:
        logger.debug("Error checking torch GPU availability: %s", e)
        return None

83 

84 

def check_tf_gpu_available() -> Optional[int]:
    """
    Probe TensorFlow for a usable GPU.

    Returns:
        0 (the first listed GPU) when TensorFlow is installed and reports at
        least one physical GPU; None in every other case (subprocess runner
        mode, TensorFlow missing, no GPUs, or any probe error).
    """
    # Subprocess runner mode disables all GPU probing up front.
    if os.getenv('OPENHCS_SUBPROCESS_NO_GPU') == '1':
        logger.debug("Subprocess runner mode - skipping tensorflow GPU check")
        return None

    tf = optional_import("tensorflow")
    if tf is None:
        logger.debug("TensorFlow not installed")
        return None

    # Probing can raise; treat any failure as "no GPU" rather than propagating.
    try:
        if not tf.config.list_physical_devices('GPU'):
            logger.debug("TensorFlow GPU not available")
            return None
        # TensorFlow exposes no direct CUDA device ordinal, so report the
        # index of the first GPU in its device list (always 0).
        first_gpu_index = 0
        logger.debug("TensorFlow GPU available: device_id=%s", first_gpu_index)
        return first_gpu_index
    except Exception as e:
        logger.debug("Error checking TensorFlow GPU availability: %s", e)
        return None

118 

119 

def check_jax_gpu_available() -> Optional[int]:
    """
    Probe JAX for a usable GPU.

    Returns:
        The integer device ID of the first GPU device when JAX is installed
        and reports one; None in every other case (subprocess runner mode,
        JAX missing, no GPU devices, or any probe error).
    """
    # Subprocess runner mode disables all GPU probing up front.
    if os.getenv('OPENHCS_SUBPROCESS_NO_GPU') == '1':
        logger.debug("Subprocess runner mode - skipping JAX GPU check")
        return None

    jax = optional_import("jax")
    if jax is None:
        logger.debug("JAX not installed")
        return None

    # Probing can raise; treat any failure as "no GPU" rather than propagating.
    try:
        gpu_devices = [d for d in jax.devices() if d.platform == 'gpu']
        if not gpu_devices:
            logger.debug("JAX GPU not available")
            return None

        first_gpu = gpu_devices[0]
        # jax.Device exposes the integer ordinal directly as `.id`; prefer it
        # over parsing the device's string form, which varies across JAX
        # versions (e.g. 'gpu:0', 'cuda:0', 'CudaDevice(id=0)').
        device_id = getattr(first_gpu, 'id', None)
        if not isinstance(device_id, int):
            # Fallback for Device objects without `.id`: parse a trailing
            # ':<n>' from the string form, defaulting to 0 if absent.
            device_str = str(first_gpu)
            if ':' in device_str:
                device_id = int(device_str.split(':')[-1])
            else:
                device_id = 0
        logger.debug("JAX GPU available: device_id=%s", device_id)
        return device_id
    except Exception as e:
        logger.debug("Error checking JAX GPU availability: %s", e)
        return None