/home/runner/work/HiCR/HiCR/include/hicr/backends/ascend/topologyManager.hpp Source File

HiCR: /home/runner/work/HiCR/HiCR/include/hicr/backends/ascend/topologyManager.hpp Source File
HiCR
topologyManager.hpp
Go to the documentation of this file.
1/*
2 * Copyright 2025 Huawei Technologies Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * Copyright Huawei Technologies Switzerland AG
19 * All rights reserved.
20 */
21
29#pragma once
30
31#include <memory>
35#include <hicr/core/topology.hpp>
37
38namespace HiCR::backend::ascend
39{
40
45{
46 public:
47
52 : HiCR::TopologyManager()
53 {}
54
/**
 * Default destructor
 */
58 ~TopologyManager() = default;
59
60 __INLINE__ HiCR::Topology queryTopology() override
61 {
62 // Storage for the topology to return
64
65 // Storage for getting the Ascend device count
66 uint32_t deviceCount = 0;
67
68 // ask ACL for available devices
69 aclError err;
70 err = aclrtGetDeviceCount((uint32_t *)&deviceCount);
71 if (err != ACL_SUCCESS) HICR_THROW_RUNTIME("Can not retrieve Ascend device count. Error %d", err);
72
73 // add as many memory spaces as devices
74 for (int32_t deviceId = 0; deviceId < (int32_t)deviceCount; deviceId++)
75 {
76 // Creating new devices
77 size_t ascendFreeMemory, ascendMemorySize;
78
79 // set the device
80 err = aclrtSetDevice(deviceId);
81 if (err != ACL_SUCCESS) HICR_THROW_RUNTIME("Can not select the Ascend device %d. Error %d", deviceId, err);
82
83 // Creating new Ascend device
84 auto ascendDevice = std::make_shared<ascend::Device>(deviceId, HiCR::Device::computeResourceList_t({}), HiCR::Device::memorySpaceList_t({}));
85
86 // retrieve the default device context
87 err = aclrtGetCurrentContext(ascendDevice->getContext());
88 if (err != ACL_SUCCESS) HICR_THROW_RUNTIME("Can not get default context in Ascend device %d. Error %d", deviceId, err);
89
90 // get the memory info
91 err = aclrtGetMemInfo(ACL_HBM_MEM, &ascendFreeMemory, &ascendMemorySize);
92 if (err != ACL_SUCCESS) HICR_THROW_RUNTIME("Can not retrieve Ascend device %d memory space. Error %d", deviceId, err);
93
94 // Creating Device's memory space
95 auto ascendDeviceMemorySpace = std::make_shared<ascend::MemorySpace>(ascendDevice, ascendMemorySize);
96
97 // Creating Device's compute resource
98 auto ascendDeviceComputeResource = std::make_shared<ascend::ComputeResource>(ascendDevice);
99
100 // Now adding resources to the device
101 ascendDevice->addComputeResource(ascendDeviceComputeResource);
102 ascendDevice->addMemorySpace(ascendDeviceMemorySpace);
103
104 // Adding new device
105 t.addDevice(ascendDevice);
106 }
107
108 // Setting up communication between the local Ascend devices
109 setupInterDeviceCommunication(t);
110
111 // Returning topology
112 return t;
113 }
114
120 __INLINE__ static HiCR::Topology deserializeTopology(const nlohmann::json &topology)
121 {
122 // Verifying input's syntax
123 HiCR::Topology::verify(topology);
124
125 // New topology to create
127
128 // Iterating over the device list entries in the serialized input
129 for (const auto &device : topology["Devices"])
130 {
131 // Getting device type
132 const auto type = device["Type"].get<std::string>();
133
134 // If the device type is recognized, add it to the new topology
135 if (type == "Ascend Device") t.addDevice(std::make_shared<ascend::Device>(device));
136 }
137
138 // Returning new topology
139 return t;
140 }
141
142 __INLINE__ HiCR::Topology _deserializeTopology(const nlohmann::json &topology) const override { return deserializeTopology(topology); }
143
149 __INLINE__ static std::unique_ptr<HiCR::TopologyManager> createDefault()
150 {
151 // Initialize (Ascend's) ACL runtime
152 aclError err = aclInit(NULL);
153 if (err != ACL_SUCCESS) HICR_THROW_RUNTIME("Failed to initialize Ascend Computing Language. Error %d", err);
154
155 // Initializing ascend topology manager
156 return std::make_unique<HiCR::backend::ascend::TopologyManager>();
157 }
158
159 private:
160
164 __INLINE__ void setupInterDeviceCommunication(HiCR::Topology &topology)
165 {
166 int32_t canAccessPeer = 0;
167 aclError err;
168
169 // enable communication among each pair of Ascend devices
170 for (const auto &srcDevice : topology.getDevices())
171 for (const auto &dstDevice : topology.getDevices())
172 {
173 auto src = (ascend::Device *)srcDevice.get();
174 auto dst = (ascend::Device *)dstDevice.get();
175 if (src->getId() != dst->getId())
176 {
177 // verify that the two Ascend devices can see each other
178 err = aclrtDeviceCanAccessPeer(&canAccessPeer, src->getId(), dst->getId());
179 if (err != ACL_SUCCESS) HICR_THROW_RUNTIME("Can not determine peer accessibility to device %ld from device %ld.. Error %d", dst, src, err);
180
181 if (canAccessPeer == 0) HICR_THROW_RUNTIME("Can not access device %ld from device %ld. Error %d", dst, src, err);
182
183 // Selecting device
184 dst->select();
185
186 // enable the communication
187 err = aclrtDeviceEnablePeerAccess(src->getId(), 0);
188 if (err != ACL_SUCCESS) HICR_THROW_RUNTIME("Can not enable peer access from device %ld to device %ld. Error %d", dst->getId(), src->getId(), err);
189 }
190 }
191 }
192
196 __INLINE__ HiCR::Device::computeResourceList_t queryComputeResources()
197 {
198 // New compute resource list to return
199 HiCR::Device::computeResourceList_t computeResourceList;
200
201 // Returning new compute resource list
202 return computeResourceList;
203 }
204
208 __INLINE__ HiCR::Device::memorySpaceList_t queryMemorySpaces()
209 {
210 // New memory space list to return
211 HiCR::Device::memorySpaceList_t memorySpaceList;
212
213 // Returning new memory space list
214 return memorySpaceList;
215 }
216};
217
218} // namespace HiCR::backend::ascend
This file implements the compute resource class for the Ascend backend.
This file implements the Device class for the Ascend backend.
This file implements the memory space class for the Ascend backend.
std::vector< std::shared_ptr< ComputeResource > > computeResourceList_t
Definition device.hpp:59
std::vector< std::shared_ptr< MemorySpace > > memorySpaceList_t
Definition device.hpp:64
Definition topologyManager.hpp:50
Definition topology.hpp:40
__INLINE__ void addDevice(const std::shared_ptr< HiCR::Device > &device)
Definition topology.hpp:62
static __INLINE__ void verify(const nlohmann::json &input)
Definition topology.hpp:102
Definition device.hpp:39
Definition topologyManager.hpp:45
static __INLINE__ std::unique_ptr< HiCR::TopologyManager > createDefault()
Definition topologyManager.hpp:149
static __INLINE__ HiCR::Topology deserializeTopology(const nlohmann::json &topology)
Definition topologyManager.hpp:120
TopologyManager()
Definition topologyManager.hpp:51
__INLINE__ HiCR::Topology queryTopology() override
Definition topologyManager.hpp:60
__INLINE__ HiCR::Topology _deserializeTopology(const nlohmann::json &topology) const override
Definition topologyManager.hpp:142
Provides a definition for the abstract device manager class.
#define HICR_THROW_RUNTIME(...)
Definition exceptions.hpp:74