uxlfoundation
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
Lines changed: 37 additions & 4 deletions
diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
Lines changed: 5 additions & 2 deletions
diff --git a/man/doxconfig b/man/doxconfig
Lines changed: 1 addition & 1 deletion
diff --git a/src/atl/ofi/atl_ofi.cpp b/src/atl/ofi/atl_ofi.cpp
Lines changed: 40 additions & 0 deletions
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.cpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.cpp
Lines changed: 25 additions & 8 deletions
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.hpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.hpp
Lines changed: 20 additions & 3 deletions
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_pcie.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_pcie.cpp
Lines changed: 27 additions & 7 deletions
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_pcie.hpp b/src/coll/algorithms/allreduce/sycl/allreduce_pcie.hpp
Lines changed: 22 additions & 3 deletions
@@ -335,7 +335,7 @@ endif()
 
 set(CCL_MAJOR_VERSION     "2021")
 set(CCL_MINOR_VERSION     "15")
-set(CCL_UPDATE_VERSION    "6")
+set(CCL_UPDATE_VERSION    "7")
 set(CCL_PRODUCT_STATUS    "Gold")
 string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
 get_vcs_properties("git")
 
@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options,
                   double max_time,
                   double avg_time,
                   double stddev,
-                  double wait_avg_time) {
+                  double wait_avg_time,
+                  double algbw,
+                  double busbw) {
     std::ofstream csvf;
     csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);
 
@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options,
                  << "," << ccl::get_datatype_size(dtype) << "," << elem_count << ","
                  << ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << ","
                  << iter_count << "," << min_time << "," << max_time << "," << avg_time << ","
-                 << stddev << "," << wait_avg_time << std::endl;
+                 << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl;
         }
         csvf.close();
     }
@@ -472,13 +474,42 @@ void print_timings(const ccl::communicator& comm,
         max_time /= iter_count;
 
         size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
+
+        double algbw = bytes / total_avg_time / 1000; // bytes/usec / 1000 = GB/s (assumes total_avg_time is in usec)
+
+        if (ncolls == 1) {
+            if (options.coll_names.front() == "allgather" ||
+                options.coll_names.front() == "allgatherv" ||
+                options.coll_names.front() == "reduce_scatter" ||
+                options.coll_names.front() == "alltoall" ||
+                options.coll_names.front() == "alltoallv") {
+                algbw = algbw * nranks; // these collectives are scaled by the rank count
+            }
+        }
+
+        double busbw = algbw; // stays equal to algbw unless one of the factors below applies
+        if (ncolls == 1) {
+            if (options.coll_names.front() == "allreduce") {
+                busbw = algbw * 2 * (nranks - 1) / nranks; // evaluated left to right in double, no integer division
+            }
+            else if (options.coll_names.front() == "allgather" ||
+                     options.coll_names.front() == "allgatherv" ||
+                     options.coll_names.front() == "reduce_scatter" ||
+                     options.coll_names.front() == "alltoall" ||
+                     options.coll_names.front() == "alltoallv") {
+                busbw = algbw * (nranks - 1) / nranks; // likewise computed in double
+            }
+        }
+
         std::stringstream ss;
         ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
            << elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH)
            << std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH)
            << std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH)
            << std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3)
-           << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3);
+           << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH)
+           << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH)
+           << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3);
 
         if (show_extened_info(options.show_additional_info)) {
             ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
@@ -497,7 +528,9 @@ void print_timings(const ccl::communicator& comm,
                          max_time,
                          total_avg_time,
                          stddev,
-                         wait_avg_time);
+                         wait_avg_time,
+                         algbw,
+                         busbw);
         }
     }
 
 
@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm,
                    << "#elem_count" << std::setw(COL_WIDTH) << "#repetitions"
                    << std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]"
                    << std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3)
-                   << "stddev[%]";
+                   << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH)
+                   << "busbw[GB/s]";
 
                 if (show_extened_info(options.show_additional_info)) {
                     ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) {
              << "t_max[usec],"
              << "t_avg[usec],"
              << "stddev[%],"
-             << "wait_t_avg[usec]" << std::endl;
+             << "wait_t_avg[usec],"
+             << "algbw[GB/s],"
+             << "busbw[GB/s]" << std::endl;
         csvf.close();
     }
 
 
@@ -1,5 +1,5 @@
 PROJECT_NAME           = "Intel® oneAPI Collective Communications Library"
-PROJECT_NUMBER         = "2021.15.6"
+PROJECT_NUMBER         = "2021.15.7"
 
 INPUT = ../src/common/env/vars.hpp ../src/common/env/vars_experimental.hpp
 
 
@@ -101,6 +101,46 @@ atl_status_t atl_ofi::init(int* argc,
     base_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL;
     base_hints->caps = FI_TAGGED;
 
+    /*
+     * OFI domain selection by local rank index.
+     * If CCL_OFI_DOMAIN_NAMES is set, split the comma-separated list and store the entry at
+     * position coord.local_idx in the provider hints, for better fabric resource utilization
+     */
+    if (ccl::global_data::env().ofi_domain_names != CCL_ENV_STR_NOT_SPECIFIED) {
+        std::string domain_names_str = ccl::global_data::env().ofi_domain_names;
+        std::vector<std::string> domain_list;
+        std::stringstream ss(domain_names_str);
+        std::string domain_name;
+
+        // Split on ','; each token is one candidate domain name
+        while (std::getline(ss, domain_name, ',')) {
+            // Strip surrounding spaces/tabs; tokens that are empty or all-blank are dropped
+            size_t start = domain_name.find_first_not_of(" \t");
+            size_t end = domain_name.find_last_not_of(" \t");
+            if (start != std::string::npos && end != std::string::npos) {
+                domain_list.push_back(domain_name.substr(start, end - start + 1));
+            }
+        }
+
+        // Pick the entry at index local_idx; an empty or too-short list only logs a warning
+        int local_idx = coord.local_idx;
+        if (!domain_list.empty() && local_idx < static_cast<int>(domain_list.size())) {
+            base_hints->domain_attr->name = strdup(domain_list[local_idx].c_str()); // NOTE(review): strdup result unchecked
+            LOG_INFO("Selected OFI domain: ",
+                     base_hints->domain_attr->name,
+                     " for local rank: ",
+                     local_idx,
+                     ", global rank: ",
+                     pmi->get_rank());
+        }
+        else {
+            LOG_WARN("Cannot select domain for local rank: ",
+                     local_idx,
+                     ", available domains: ",
+                     domain_list.size());
+        }
+    }
+
     prov_env = getenv("FI_PROVIDER");
 
     ctx.enable_hmem = 0;
 
@@ -43,19 +43,32 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
     size_t send_size = send_count * ccl_dtype.size();
 
     bool p2p = node_comm->get_topo_manager().has_p2p_access();
-    uint32_t pattern = comm->get_rt_pattern(pattern_type::collective, -1);
+    uint32_t pattern = node_comm->get_rt_pattern(pattern_type::collective, -1);
 
     auto lambda = [&]<typename T, template <typename, int> class Proto>(int NRanks) {
         const size_t *offs = offsets.empty() ? NULL : offsets.data();
 
         T *peerbuf0[NRanks];
         T *peerbuf1[NRanks];
-        for (int i = 0; i < NRanks; i++) {
-            peerbuf0[i] = (T *)get_remote_node_tmp_buf(0, comm)[i];
-            peerbuf1[i] = (T *)get_remote_node_tmp_buf(1, comm)[i];
+        T *ipcbuf0;
+        T *ipcbuf1;
+        if (ccl::global_data::env().sycl_ll_buffer_global) {
+            for (int i = 0; i < NRanks; i++) {
+                peerbuf0[i] = (T *)get_remote_node_tmp_buf(0, comm)[i];
+                peerbuf1[i] = (T *)get_remote_node_tmp_buf(1, comm)[i];
+            }
+            ipcbuf0 = (T *)get_tmp_buf(0, comm);
+            ipcbuf1 = (T *)get_tmp_buf(1, comm);
+        }
+        else {
+            auto [local_tmp_buf, remote_ptrs] = node_comm->get_all_tmp_bufs(true);
+            for (int i = 0; i < NRanks; i++) {
+                peerbuf0[i] = (T *)remote_ptrs[i];
+                peerbuf1[i] = (T *)((char *)remote_ptrs[i] + ccl_tmp_bufs::buf_size / 2); // same buffer, byte offset buf_size / 2
+            }
+            ipcbuf0 = (T *)local_tmp_buf;
+            ipcbuf1 = (T *)((char *)local_tmp_buf + ccl_tmp_bufs::buf_size / 2); // same buffer, byte offset buf_size / 2
+        }
-        T *ipcbuf0 = (T *)get_tmp_buf(0, comm);
-        T *ipcbuf1 = (T *)get_tmp_buf(1, comm);
         sycl::event e = AllGather<T, Proto, RingTransmit>::launch(NRanks,
                                                                   (T *)send_buf,
                                                                   (T *)recv_buf,
@@ -68,13 +81,17 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
                                                                   comm_rank,
                                                                   pattern,
                                                                   q,
+                                                                  node_comm,
                                                                   p2p,
                                                                   done);
-        // update pattern
-        comm->update_rt_pattern(pattern_type::collective, -1, pattern);
         return e;
     };
 
+    if (ccl::global_data::env().sycl_ll_buffer_global) {
+        const bool is_cpu_barrier = ccl::global_data::env().sycl_ccl_barrier;
+        sycl::event barrier_event = invoke_barrier(node_comm, q, {}, is_cpu_barrier);
+    }
+
     if (send_size <= ccl::global_data::env().sycl_allgatherv_ll_threshold) {
         // small ring with LL
         sycl_e = invoke_pcie_type<Rt64_PCIE>(lambda, comm_size, dtype);
 
@@ -58,7 +58,11 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
                                                p2p),
               workSize(calcWorkSize(input, output, nelems * sizeof(T))) {}
 
-    sycl::nd_range<1> getLaunchParam(uint32_t& updateSeqNo) const {
+    sycl::nd_range<1> getLaunchParam(sycl::queue q,
+                                     const std::shared_ptr<ccl_comm> comm,
+                                     T* ipcbuf0,
+                                     T* ipcbuf1,
+                                     uint32_t& updateSeqNo) const {
         constexpr uint32_t nThreads = 64; /* TODO: get EU/thread config */
 #if defined(CCL_SYCL_ENABLE_PVC)
         constexpr size_t maxSS = 64;
@@ -73,7 +77,18 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
         size_t nSS = divUp(nWire, wirePerSS);
         auto actualSS = std::min(nSS, maxSS);
         auto nSteps = divUp(nWire, actualSS * wirePerSS);
-        updateSeqNo += nSteps;
+        auto nSlot = Transmit<T, Proto, SubGroupSize>::nSlot;
+        nSteps = (nSteps + nSlot - 1) / nSlot; // ceil(nSteps / nSlot); NOTE(review): the XXX note below now looks stale
+        auto newSeqNo = comm->increase_rt_pattern(pattern_type::collective, -1, updateSeqNo, nSteps);
+        // check for pattern wraparound, comparing the sequence number before and after the increase
+        rt_check_pattern<T>(q,
+                            comm,
+                            updateSeqNo,
+                            newSeqNo,
+                            ipcbuf0,
+                            ipcbuf1,
+                            RingTransmit<int, Rt64_128_PCIE>::ringSize / sizeof(T)); // NOTE(review): fixed <int, Rt64_128_PCIE>, not <T, Proto> - confirm
+        updateSeqNo = newSeqNo;
         //
         // XXX: we over updated sequence number. Should be nSteps / nSlot
         // No harm, but not nice.
@@ -94,6 +109,7 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
                               int rank,
                               uint32_t& step,
                               sycl::queue queue,
+                              const std::shared_ptr<ccl_comm> comm,
                               bool p2p,
                               bool& done) {
         sycl::event e;
@@ -105,8 +121,9 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
         }
         done = true;
 
+        const sycl::nd_range<1> ndrange = offload.getLaunchParam(queue, comm, ipcbuf0, ipcbuf1, step);
         e = queue.submit([&](sycl::handler& cgh) {
-            cgh.parallel_for(offload.getLaunchParam(step), offload);
+            cgh.parallel_for(ndrange, offload);
         });
         return e;
     }
 
@@ -42,17 +42,32 @@ ccl::event allreduce_ll_ring(const void *src,
         q.memcpy(dst, src, dt_sz * count);
 
     bool p2p = node_comm->get_topo_manager().has_p2p_access();
-    uint32_t pattern = comm->get_rt_pattern(pattern_type::collective, -1);
+    uint32_t pattern = node_comm->get_rt_pattern(pattern_type::collective, -1);
 
     auto lambda = [&]<typename T, template <typename, int> class Proto>(int NRanks) {
         T *peerbuf0[NRanks];
         T *peerbuf1[NRanks];
-        for (int i = 0; i < NRanks; i++) {
-            peerbuf0[i] = (T *)get_remote_node_tmp_buf(0, comm)[i];
-            peerbuf1[i] = (T *)get_remote_node_tmp_buf(1, comm)[i];
+        T *ipcbuf0;
+        T *ipcbuf1;
+        if (ccl::global_data::env().sycl_ll_buffer_global) {
+            // large buffer: tmp buffers 0 and 1 are fetched separately through comm
+            for (int i = 0; i < NRanks; i++) {
+                peerbuf0[i] = (T *)get_remote_node_tmp_buf(0, comm)[i];
+                peerbuf1[i] = (T *)get_remote_node_tmp_buf(1, comm)[i];
+            }
+            ipcbuf0 = (T *)get_tmp_buf(0, comm);
+            ipcbuf1 = (T *)get_tmp_buf(1, comm);
+        }
+        else {
+            // small buffer: one node_comm tmp buffer per rank, split at byte offset buf_size / 2
+            auto [local_tmp_buf, remote_ptrs] = node_comm->get_all_tmp_bufs(true);
+            for (int i = 0; i < NRanks; i++) {
+                peerbuf0[i] = (T *)remote_ptrs[i];
+                peerbuf1[i] = (T *)((char *)remote_ptrs[i] + ccl_tmp_bufs::buf_size / 2);
+            }
+            ipcbuf0 = (T *)local_tmp_buf;
+            ipcbuf1 = (T *)((char *)local_tmp_buf + ccl_tmp_bufs::buf_size / 2);
+        }
-        T *ipcbuf0 = (T *)get_tmp_buf(0, comm);
-        T *ipcbuf1 = (T *)get_tmp_buf(1, comm);
         sycl::event e = AllReduce<T, Proto, RingTransmit>::launch(NRanks,
                                                                   (T *)dst,
                                                                   ipcbuf0,
@@ -63,12 +78,17 @@ ccl::event allreduce_ll_ring(const void *src,
                                                                   comm_rank,
                                                                   pattern,
                                                                   q,
+                                                                  node_comm,
                                                                   p2p,
                                                                   done);
-        comm->update_rt_pattern(pattern_type::collective, -1, pattern);
         return e;
     };
 
+    if (ccl::global_data::env().sycl_ll_buffer_global) {
+        const bool is_cpu_barrier = ccl::global_data::env().sycl_ccl_barrier;
+        sycl::event barrier_event = invoke_barrier(node_comm, q, {}, is_cpu_barrier);
+    }
+
     if (count * dt_sz <= ccl::global_data::env().sycl_allreduce_ll_threshold) {
         // small ring with LL
         sycl_e = invoke_pcie_type<Rt64_PCIE>(lambda, comm_size, dtype);
 
@@ -56,7 +56,11 @@ struct AllReduce : public Transmit<T, Proto, SubGroupSize> {
     static int scatterVerify(uint32_t* host, int rank, uint32_t flag, size_t nWorkElemsInInt);
     static int stage2Verify(T* host, int rank, uint32_t flag, size_t nWorkElemsInInt);
 
-    sycl::nd_range<1> getLaunchParam(uint32_t& updateSeqNo) const {
+    sycl::nd_range<1> getLaunchParam(sycl::queue q,
+                                     const std::shared_ptr<ccl_comm> comm,
+                                     T* ipcbuf0,
+                                     T* ipcbuf1,
+                                     uint32_t& updateSeqNo) const {
         constexpr uint32_t nThreads = 64; /* TODO: get EU/thread config */
 // TODO: can be queried
 #if defined(CCL_SYCL_ENABLE_PVC)
@@ -72,7 +76,19 @@ struct AllReduce : public Transmit<T, Proto, SubGroupSize> {
         size_t nSS = divUp(nWire, wirePerSS);
         auto actualSS = std::min(nSS, maxSS);
         auto nSteps = divUp(nWire, actualSS * wirePerSS);
-        updateSeqNo += nSteps;
+        auto nSlot = Transmit<T, Proto, SubGroupSize>::nSlot;
+        nSteps = (nSteps + nSlot - 1) / nSlot; // ceil(nSteps / nSlot); NOTE(review): the XXX note below now looks stale
+        auto newSeqNo =
+            comm->increase_rt_pattern(pattern_type::collective, -1, updateSeqNo, nSteps);
+        // check for pattern wraparound, comparing the sequence number before and after the increase
+        rt_check_pattern<T>(q,
+                            comm,
+                            updateSeqNo,
+                            newSeqNo,
+                            ipcbuf0,
+                            ipcbuf1,
+                            RingTransmit<int, Rt64_128_PCIE>::ringSize / sizeof(T)); // NOTE(review): fixed <int, Rt64_128_PCIE>, not <T, Proto> - confirm
+        updateSeqNo = newSeqNo;
         //
         // XXX: we over updated sequence number. Should be nSteps / nSlot
         // No harm, but not nice.
@@ -91,6 +107,7 @@ struct AllReduce : public Transmit<T, Proto, SubGroupSize> {
                               int rank,
                               uint32_t& step,
                               sycl::queue queue,
+                              const std::shared_ptr<ccl_comm> comm,
                               bool p2p,
                               bool& done) {
         sycl::event e;
@@ -102,8 +119,10 @@ struct AllReduce : public Transmit<T, Proto, SubGroupSize> {
         }
         done = true;
 
+        const sycl::nd_range<1> ndrange =
+            offload.getLaunchParam(queue, comm, ipcbuf0, ipcbuf1, step);
         e = queue.submit([&](sycl::handler& cgh) {
-            cgh.parallel_for(offload.getLaunchParam(step), offload);
+            cgh.parallel_for(ndrange, offload);
         });
         return e;
     }